### Import libraries:

In [None]:
#1# Mandatory libraries: 
import pandas as pd
import numpy as np
from preprocessing import *

#2# Random Forest model:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score,mean_absolute_error, mean_squared_error

# Shared seed: passed as random_state to every train_test_split call and
# every RandomForestRegressor in this notebook.
RSEED = 5

In [None]:
#3# Data visualization: 
import matplotlib.pyplot as plt
import seaborn as sns

#3A# Plot formatting:
from matplotlib.ticker import PercentFormatter
# Global plot theme and a small base font size for all figures.
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 6
# Set the default figure size through rcParams. Calling plt.figure() in a
# configuration cell only opens (and renders) one empty 8x8 figure and leaves
# every later figure at the style's default size.
plt.rcParams['figure.figsize'] = (8, 8)
# Register pandas' converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Display floats with three decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Collecting and preprocessing the data: 

In [None]:
# Load the raw AQUASTAT export; the first CSV column is used as the index.
raw_csv_path = "data/AQUASTAT_complete.csv"
df = pd.read_csv(raw_csv_path, index_col=0)

# Build the pre-split processor (presumably provided by the star import from
# `preprocessing` — confirm) and apply it to the raw frame.
pre_split_processor = get_pre_split_processor()
df_pre = pre_split_processor.transform(df)

### Splitting the data:

In [None]:
# Keep only rows with year > 2010: restricting the period is meant to limit
# the number of NaN values that reach the model.
is_after_2010 = df_pre.eval("year > 2010")
df_pre = df_pre.loc[is_after_2010]

In [None]:
# Columns the models have to predict; every other column is used as a feature.
targets = ['gdp_per_capita', 'water_stress', 'total_population_with_access_to_safe_drinking_water']

X = df_pre.drop(columns=targets)
Y = df_pre[targets]

# First cut: 70 % training rows vs. a 30 % hold-out, stratified by country.
X_train, X_pretest, Y_train, Y_pretest = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=RSEED,
    stratify=X["country"],
)

# Second cut: split the hold-out in half into test and validation parts,
# again stratified by country.
X_test, X_valid, Y_test, Y_valid = train_test_split(
    X_pretest,
    Y_pretest,
    test_size=0.50,
    random_state=RSEED,
    stratify=X_pretest["country"],
)

#### Checking the train and validation data: 

In [None]:
# Report, per target column, how many values are missing in each split.
train_nan_report = f"y_train NaN:\n{Y_train.isna().sum()}"
valid_nan_report = f"y_valid NaN:\n{Y_valid.isna().sum()}"
print(train_nan_report, "\n", "\n", valid_nan_report)

In [None]:
# Replace missing training targets with 0. Rebind the name instead of using
# inplace=True: Y_train is a selection produced by train_test_split, so an
# in-place fill can raise SettingWithCopyWarning, and in-place mutation makes
# the cell harder to reason about on re-runs.
# NOTE(review): Y_valid is not filled here — confirm it contains no NaN before
# it is passed to the metric functions further down.
Y_train = Y_train.fillna(0)

In [None]:
# Full preprocessing step, intended to remove NaN values and string columns.
# The preprocessor is configured from the training feature names and fitted on
# the training split only; the validation split is transformed with that fit.
train_feature_names = X_train.columns.to_list()
full_preprocessor = get_full_preprocessor(train_feature_names, hot=True)

X_train_pre = full_preprocessor.fit_transform(X_train)
X_valid_pre = full_preprocessor.transform(X_valid)

In [None]:
# Preview the first rows of the preprocessed training features.
X_train_pre.head()

### Applying the model to all targets at once:

In [None]:
# One forest trained on all target columns of Y_train at once.
forest_settings = {"n_estimators": 100, "random_state": RSEED}
rf_model = RandomForestRegressor(**forest_settings)
rf_model.fit(X_train_pre, Y_train)

In [None]:
# Predictions of the multi-target forest for the preprocessed validation features.
y_pred_3t = rf_model.predict(X_valid_pre)

In [None]:
# Display the raw validation predictions of the multi-target model.
y_pred_3t

In [None]:
# Score the multi-target model on the validation split.
# multioutput="raw_values" yields one error value per target column rather
# than a single averaged number.
y_pred_3t = rf_model.predict(X_valid_pre)
mae, mse = (
    metric(Y_valid, y_pred_3t, multioutput="raw_values")
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f"Mean Absolute Error:{mae}","\n","\n", f"Mean Squared Error:{mse}")

### Separating the targets:

- Water Stress target:

In [None]:
# Pull the water-stress column out of the training and validation targets.
Y_train_water, Y_valid_water = (
    split["water_stress"] for split in (Y_train, Y_valid)
)

print(f"Y_train:{Y_train_water} \n Y_valid:{Y_valid_water}")

In [None]:
# Single-target forest for water stress: fit on the training split, then
# predict and score on the validation split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=RSEED).fit(
    X_train_pre, Y_train_water
)
y_pred_water = rf_model.predict(X_valid_pre)

mae_water, mse_water = (
    metric(Y_valid_water, y_pred_water)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f"Mean Absolute Error:{mae_water}","\n","\n", f"Mean Squared Error:{mse_water}")

- GDP per capita target:

In [None]:
# Pull the GDP-per-capita column out of the training and validation targets.
Y_train_gdp, Y_valid_gdp = (
    split["gdp_per_capita"] for split in (Y_train, Y_valid)
)

print(f"Y_train:{Y_train_gdp} \n Y_valid:{Y_valid_gdp}")

In [None]:
# Single-target forest for GDP per capita: fit on the training split, then
# predict and score on the validation split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=RSEED).fit(
    X_train_pre, Y_train_gdp
)
y_pred_gdp = rf_model.predict(X_valid_pre)

mae_gdp, mse_gdp = (
    metric(Y_valid_gdp, y_pred_gdp)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f"Mean Absolute Error:{mae_gdp}","\n","\n", f"Mean Squared Error:{mse_gdp}")

- Total population with access to safe drinking water:

In [None]:
# Pull the safe-drinking-water-access column out of the training and
# validation targets.
Y_train_swp, Y_valid_swp = (
    split["total_population_with_access_to_safe_drinking_water"]
    for split in (Y_train, Y_valid)
)

print(f"Y_train:{Y_train_swp} \n Y_valid:{Y_valid_swp}")

In [None]:
# Single-target forest for access to safe drinking water: fit on the training
# split, then predict and score on the validation split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=RSEED).fit(
    X_train_pre, Y_train_swp
)
y_pred_swp = rf_model.predict(X_valid_pre)

mae_swp, mse_swp = (
    metric(Y_valid_swp, y_pred_swp)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f"Mean Absolute Error:{mae_swp}","\n","\n", f"Mean Squared Error:{mse_swp}")