### Import libraries:

In [116]:
#1# Mandatory libraries: 
import pandas as pd
import numpy as np
from preprocessing import *

#2# Random Forest model:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score,mean_absolute_error, mean_squared_error

RSEED = 5

In [117]:
#3# Data visualization: 
import matplotlib.pyplot as plt
import seaborn as sns

#3A# Plot formatting:
from matplotlib.ticker import PercentFormatter
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 6
plt.figure(figsize = (8, 8))
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

<Figure size 800x800 with 0 Axes>

### Collecting and preprocessing the data: 

In [118]:
df = pd.read_csv("data/AQUASTAT_complete.csv", index_col=0)


pre_split_processor = get_pre_split_processor()
df_pre = pre_split_processor.transform(df)

### Splitting the data:

In [119]:
#To avoid having a certain amount of NaN values in the model, we decided to restrict the number of years:
df_pre = df_pre.query("year > 2010")

In [120]:
targets = ['gdp_per_capita', 'water_stress', 'total_population_with_access_to_safe_drinking_water']

Y = df_pre[targets]
X = df_pre.drop(columns=targets)

X_train, X_pretest, Y_train, Y_pretest = train_test_split(X, Y, test_size=0.30, random_state=RSEED, stratify=X.country)
X_test, X_valid, Y_test, Y_valid = train_test_split(X_pretest, Y_pretest, test_size=0.50, random_state=RSEED, stratify=X_pretest.country)

#### Checking the train and validation data: 

In [121]:
print(f"y_train NaN:\n{Y_train.isna().sum()}","\n","\n",f"y_valid NaN:\n{Y_valid.isna().sum()}")

y_train NaN:
gdp_per_capita                                         0
water_stress                                           1
total_population_with_access_to_safe_drinking_water    0
dtype: int64 
 
 y_valid NaN:
gdp_per_capita                                         0
water_stress                                           0
total_population_with_access_to_safe_drinking_water    0
dtype: int64


In [122]:
Y_train.query("water_stress.isna()")

Unnamed: 0,gdp_per_capita,water_stress,total_population_with_access_to_safe_drinking_water
1901,9229.834,,97.3


In [123]:
Y_train.fillna(0, inplace=True)

In [124]:
#Applying full preprocessor to remove NaN and Strings: 
full_preprocessor = get_full_preprocessor(X_train.columns.to_list(), hot=True)

X_train_pre = full_preprocessor.fit_transform(X_train)
X_valid_pre = full_preprocessor.transform(X_valid)

### Applying the model in general target:

In [125]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=RSEED)
rf_model.fit(X_train_pre, Y_train)

In [126]:
y_pred_3t = rf_model.predict(X_valid_pre)

In [127]:
y_pred_3t

array([[1.76501935e+03, 6.41792141e+01, 9.30590000e+01],
       [1.08151171e+03, 6.62338387e+00, 8.58880000e+01],
       [7.37735224e+02, 8.69991297e+00, 7.97990000e+01],
       [5.13893562e+03, 6.53849031e+01, 8.67860000e+01],
       [1.07288392e+04, 5.05341612e+00, 9.75450000e+01],
       [4.99706908e+02, 2.74431347e+00, 5.15910000e+01],
       [5.87709539e+04, 3.86212024e+01, 9.99120000e+01],
       [3.88826541e+03, 1.46266369e+00, 8.76300000e+01],
       [1.31543203e+03, 2.91782930e+01, 7.24200000e+01],
       [2.16711593e+03, 6.70364453e+00, 8.17690000e+01],
       [4.00726373e+04, 2.42020982e+01, 1.00000000e+02],
       [3.17056715e+03, 1.51178715e+00, 9.91220000e+01],
       [6.19967756e+03, 6.69473914e+00, 8.70560000e+01],
       [4.46189241e+03, 9.79784624e+01, 9.64260000e+01],
       [5.95138074e+03, 2.39676528e+01, 9.75390000e+01],
       [4.40810077e+04, 8.04808476e+00, 1.00000000e+02],
       [1.57676070e+03, 1.81369841e+01, 5.98510000e+01],
       [5.96469496e+04, 2.81849

In [128]:
y_pred_3t = rf_model.predict(X_valid_pre)
mae = mean_absolute_error(Y_valid, y_pred_3t, multioutput="raw_values")
mse = mean_squared_error(Y_valid, y_pred_3t, multioutput="raw_values")

print(f"Mean Absolute Error:{mae}","\n","\n", f"Mean Squared Error:{mse}")

Mean Absolute Error:[1.04100918e+03 1.06935350e+01 9.39518367e-01] 
 
 Mean Squared Error:[5.83914308e+06 2.13964875e+03 2.91612378e+00]


### Separeting the targets:

- Water Stress target:

In [106]:
Y_train_water = Y_train["water_stress"]
Y_valid_water = Y_valid["water_stress"]

print(f"Y_train:{Y_train_water} \n Y_valid:{Y_valid_water}")

Y_train:4360   50.039
5152   20.756
2073   27.987
4269   26.539
5096   13.246
        ...  
5097   13.246
989     2.062
624    87.500
4444    1.095
1910    5.352
Name: water_stress, Length: 1139, dtype: float64 
 Y_valid:3673   66.492
2962    2.210
1167    7.820
454    53.729
1903    4.208
        ...  
6212    1.835
6437   17.572
4982    8.003
3560    8.181
2251    0.227
Name: water_stress, Length: 245, dtype: float64


In [107]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=RSEED)
rf_model.fit(X_train_pre, Y_train_water)

y_pred_water = rf_model.predict(X_valid_pre)

mae_water = mean_absolute_error(Y_valid_water, y_pred_water)
mse_water = mean_squared_error(Y_valid_water, y_pred_water)

print(f"Mean Absolute Error:{mae_water}","\n","\n", f"Mean Squared Error:{mse_water}")

Mean Absolute Error:2.056758381478721 
 
 Mean Squared Error:153.1139290794525


- GDP per capita target:

In [108]:
Y_train_gdp = Y_train["gdp_per_capita"]
Y_valid_gdp = Y_valid["gdp_per_capita"]

print(f"Y_train:{Y_train_gdp} \n Y_valid:{Y_valid_gdp}")

Y_train:4360    1312.035
5152    9206.776
2073   19738.393
4269    1325.976
5096    1815.219
          ...   
5097    1892.098
989     7617.292
624    17419.314
4444   14215.173
1910   12076.806
Name: gdp_per_capita, Length: 1139, dtype: float64 
 Y_valid:3673    1729.378
2962     732.727
1167     813.100
454     4735.143
1903   10744.099
          ...   
6212    5324.636
6437   21614.537
4982     778.670
3560   14711.084
2251     545.216
Name: gdp_per_capita, Length: 245, dtype: float64


In [109]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=RSEED)
rf_model.fit(X_train_pre, Y_train_gdp)

y_pred_gdp = rf_model.predict(X_valid_pre)

mae_gdp = mean_absolute_error(Y_valid_gdp, y_pred_gdp)
mse_gdp = mean_squared_error(Y_valid_gdp, y_pred_gdp)

print(f"Mean Absolute Error:{mae_gdp}","\n","\n", f"Mean Squared Error:{mse_gdp}")

Mean Absolute Error:1036.9271510293875 
 
 Mean Squared Error:6070267.206866616


- Total population with access to safe drinking water:

In [112]:
Y_train_swp = Y_train["total_population_with_access_to_safe_drinking_water"]
Y_valid_swp = Y_valid["total_population_with_access_to_safe_drinking_water"]

print(f"Y_train:{Y_train_psw} \n Y_valid:{Y_valid_psw}")

Y_train:4360    90.000
5152    99.800
2073   100.000
4269    62.300
5096    56.700
         ...  
5097    57.900
989     96.000
624     99.700
4444    99.300
1910    97.800
Name: total_population_with_access_to_safe_drinking_water, Length: 1139, dtype: float64 
 Y_valid:3673    94.100
2962    90.200
1167    82.300
454     87.000
1903    97.600
         ...  
6212    98.000
6437   100.000
4982    70.900
3560   100.000
2251    52.400
Name: total_population_with_access_to_safe_drinking_water, Length: 245, dtype: float64


In [113]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=RSEED)
rf_model.fit(X_train_pre, Y_train_swp)

y_pred_swp = rf_model.predict(X_valid_pre)

mae_swp = mean_absolute_error(Y_valid_swp, y_pred_swp)
mse_swp = mean_squared_error(Y_valid_swp, y_pred_swp)

print(f"Mean Absolute Error:{mae_swp}","\n","\n", f"Mean Squared Error:{mse_swp}")

Mean Absolute Error:0.59508979591837 
 
 Mean Squared Error:2.307919595918377
