In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import joblib 
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

In [2]:
df = pd.read_csv('data/merged_climate_iom_data.csv')

In [3]:
# Sorting by country and time to ensure proper filling
df.sort_values(by=["country_code", "year", "month"], inplace=True)

columns_to_fill_extended = [
    "AG.LND.FRST.K2", "AG.LND.PRCP.MM", "AG.LND.TOTL.K2", "AG.SRF.TOTL.K2",
    "EG.CFT.ACCS.RU.ZS", "EG.CFT.ACCS.UR.ZS", "EG.CFT.ACCS.ZS", "EG.EGY.PRIM.PP.KD", "EG.ELC.ACCS.ZS", "EG.FEC.RNEW.ZS",
    "EN.GHG.ALL.MT.CE.AR5", "EN.GHG.CH4.AG.MT.CE.AR5", "EN.GHG.CH4.BU.MT.CE.AR5", "EN.GHG.CH4.FE.MT.CE.AR5",
    "EN.GHG.CH4.IC.MT.CE.AR5", "EN.GHG.CH4.MT.CE.AR5", "EN.GHG.CH4.PI.MT.CE.AR5", "EN.GHG.CH4.TR.MT.CE.AR5",
    "EN.GHG.CH4.WA.MT.CE.AR5", "EN.GHG.CO2.BU.MT.CE.AR5", "EN.GHG.CO2.IC.MT.CE.AR5", "EN.GHG.CO2.IP.MT.CE.AR5",
    "EN.GHG.CO2.LU.MT.CE.AR5", "EN.GHG.CO2.MT.CE.AR5", "EN.GHG.CO2.PI.MT.CE.AR5", "EN.GHG.CO2.TR.MT.CE.AR5",
    "EN.GHG.FGAS.IP.MT.CE.AR5", "EN.GHG.N2O.AG.MT.CE.AR5", "EN.GHG.N2O.BU.MT.CE.AR5", "EN.GHG.N2O.FE.MT.CE.AR5",
    "EN.GHG.N2O.IC.MT.CE.AR5", "EN.GHG.N2O.IP.MT.CE.AR5", "EN.GHG.N2O.MT.CE.AR5", "EN.GHG.N2O.PI.MT.CE.AR5",
    "EN.GHG.N2O.TR.MT.CE.AR5", "EN.GHG.N2O.WA.MT.CE.AR5", "ER.FSH.AQUA.MT", "ER.FSH.CAPT.MT", "ER.FSH.PROD.MT",
    "ER.H2O.FWTL.K3", "ER.H2O.INTR.K3", "SP.POP.TOTL"
]

columns_to_fill_extended = [col.lower() for col in columns_to_fill_extended]

# Filtering columns that exist in the dataset
existing_columns = [col for col in columns_to_fill_extended if col in df.columns]

df[existing_columns] = df.groupby("country_code")[existing_columns].transform(lambda x: x.ffill().bfill())

In [4]:
df['total_idp_over_pop'] = (df['internally_displaced_persons'] / df['sp.pop.totl'])*100
df['total_affected_over_pop'] = (df['total_affected'] / df['sp.pop.totl'])*100

In [5]:
weird_values = df[df['total_idp_over_pop'] > 100]

In [6]:
len(weird_values)

8

In [7]:
weird_values[['year','country_name', 'sp.pop.totl','internally_displaced_persons',  'total_idp_over_pop', 'total_affected_over_pop']]

Unnamed: 0,year,country_name,sp.pop.totl,internally_displaced_persons,total_idp_over_pop,total_affected_over_pop
1024,2023,Sudan,50042791.0,63571047,127.033376,
1032,2023,Sudan,50042791.0,63441327,126.774158,
1040,2023,Sudan,50042791.0,55837788,111.580084,
1046,2023,Sudan,50042791.0,76711659,153.292128,
1057,2023,Sudan,50042791.0,95451576,190.739913,
872,2021,South Sudan,10865780.0,12058016,110.972392,7.684676
873,2021,South Sudan,10865780.0,12058016,110.972392,71.082518
887,2021,South Sudan,10865780.0,11148285,102.599951,71.082518


In [8]:
df = df[df['total_idp_over_pop'] < 100]

In [9]:
pd.options.display.float_format = '{:.2f}'.format
df[['total_idp_over_pop', 'internally_displaced_persons']].describe()

Unnamed: 0,total_idp_over_pop,internally_displaced_persons
count,909.0,909.0
mean,8.61,2918213.12
std,10.65,3994012.43
min,0.0,166.0
25%,1.69,318591.0
50%,4.7,1002159.0
75%,11.59,4454218.0
max,77.73,38898405.0


In [10]:
grouped_df = df.groupby(['country_code', 'year'])[['total_idp_over_pop', 'total_affected_over_pop']].mean().reset_index()

In [11]:
df

Unnamed: 0,year,month,country_name,country_code,internally_displaced_persons,temperature_2m,total_precipitation_sum,potential_evaporation_sum,start_year,start_month,...,en.ghg.n2o.tr.mt.ce.ar5,en.ghg.n2o.wa.mt.ce.ar5,er.fsh.aqua.mt,er.fsh.capt.mt,er.fsh.prod.mt,er.h2o.fwtl.k3,er.h2o.intr.k3,sp.pop.totl,total_idp_over_pop,total_affected_over_pop
340,2017,3,Afghanistan,AFG,739086,297.96,0.16,-0.46,2018.28,,...,137.86,199.62,317304.40,265431.78,582735.92,118.01,106.55,35688935.00,2.07,
380,2017,6,Afghanistan,AFG,2835546,306.43,0.00,-0.37,2019.72,,...,2.79,4.08,31814.00,35220.00,67034.00,43.06,35.20,35688935.00,7.95,
459,2017,12,Afghanistan,AFG,5184471,297.58,0.15,-0.42,2017.00,,...,119.58,207.38,317299.84,265436.36,582735.99,116.23,106.83,35688935.00,14.53,
490,2018,3,Afghanistan,AFG,5531499,301.56,0.00,-0.35,2019.12,,...,127.00,2.00,105.00,107000.00,107105.00,8.80,15.00,36743039.00,15.05,
521,2018,6,Afghanistan,AFG,5600775,298.52,0.12,-0.23,2016.95,4.00,...,2.00,402.00,204.00,29000.00,29204.00,725.00,141.00,36743039.00,15.24,36.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,2019,5,Zimbabwe,ZWE,75900,293.23,0.03,-0.20,2018.85,2.00,...,59.00,807.00,495.00,20372.00,20867.00,2.80,10.06,15271368.00,0.50,45.18
769,2020,7,Zimbabwe,ZWE,105498,303.72,0.00,-0.29,2017.75,,...,1.54,917.00,10.00,31803.55,31813.55,5.72,0.70,15526888.00,0.68,
795,2020,11,Zimbabwe,ZWE,58548,301.41,0.00,-0.47,2022.40,,...,25.00,412.00,204.00,27500.00,27704.00,725.00,141.00,15526888.00,0.38,
822,2021,3,Zimbabwe,ZWE,122526,297.90,0.10,-0.26,2019.87,,...,874.00,1.41,740.00,73140.00,73880.00,10.55,122.00,15797210.00,0.78,


In [12]:
env_factors = [
'ag.lnd.frst.k2', 'ag.lnd.prcp.mm',
       'ag.lnd.totl.k2', 'ag.srf.totl.k2', 'eg.cft.accs.ru.zs',
       'eg.cft.accs.ur.zs', 'eg.cft.accs.zs', 'eg.egy.prim.pp.kd',
       'eg.elc.accs.zs', 'eg.fec.rnew.zs', 'en.ghg.all.mt.ce.ar5',
       'en.ghg.ch4.ag.mt.ce.ar5', 'en.ghg.ch4.bu.mt.ce.ar5',
       'en.ghg.ch4.fe.mt.ce.ar5', 'en.ghg.ch4.ic.mt.ce.ar5',
       'en.ghg.ch4.mt.ce.ar5', 'en.ghg.ch4.pi.mt.ce.ar5',
       'en.ghg.ch4.tr.mt.ce.ar5', 'en.ghg.ch4.wa.mt.ce.ar5',
       'en.ghg.co2.bu.mt.ce.ar5', 'en.ghg.co2.ic.mt.ce.ar5',
       'en.ghg.co2.ip.mt.ce.ar5', 'en.ghg.co2.lu.mt.ce.ar5',
       'en.ghg.co2.mt.ce.ar5', 'en.ghg.co2.pi.mt.ce.ar5',
       'en.ghg.co2.tr.mt.ce.ar5', 'en.ghg.fgas.ip.mt.ce.ar5',
       'en.ghg.n2o.ag.mt.ce.ar5', 'en.ghg.n2o.bu.mt.ce.ar5',
       'en.ghg.n2o.fe.mt.ce.ar5', 'en.ghg.n2o.ic.mt.ce.ar5',
       'en.ghg.n2o.ip.mt.ce.ar5', 'en.ghg.n2o.mt.ce.ar5',
       'en.ghg.n2o.pi.mt.ce.ar5', 'en.ghg.n2o.tr.mt.ce.ar5',
       'en.ghg.n2o.wa.mt.ce.ar5', 'er.fsh.aqua.mt', 'er.fsh.capt.mt',
       'er.fsh.prod.mt', 'er.h2o.fwtl.k3', 'er.h2o.intr.k3', 'sp.pop.totl',
       'cpi_value', 'total_affected']

#impute mean to nas in env_factors

for col in env_factors:
       if col == 'total_affected':
              df[col].fillna(0, inplace=True)
       else:
              df[col].fillna(df[col].mean(), inplace=True)

# Drop rows with missing values in the selected columns
df_clean = df[["internally_displaced_persons"] + env_factors].dropna()

# Define independent (X) and dependent (y) variables
X = df_clean[env_factors]  # Environmental factors
y = df_clean["internally_displaced_persons"]  # Displacement

# Add a constant term for the regression model
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Display model summary
model.summary()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


0,1,2,3
Dep. Variable:,internally_displaced_persons,R-squared:,0.212
Model:,OLS,Adj. R-squared:,0.171
Method:,Least Squares,F-statistic:,5.27
Date:,"Sun, 04 May 2025",Prob (F-statistic):,1.5199999999999998e-23
Time:,19:09:18,Log-Likelihood:,-14998.0
No. Observations:,909,AIC:,30090.0
Df Residuals:,864,BIC:,30300.0
Df Model:,44,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.206e+06,3.02e+06,1.062,0.289,-2.72e+06,9.13e+06
ag.lnd.frst.k2,19.3430,5.668,3.413,0.001,8.218,30.468
ag.lnd.prcp.mm,-459.5764,1237.407,-0.371,0.710,-2888.253,1969.100
ag.lnd.totl.k2,40.4736,41.133,0.984,0.325,-40.259,121.206
ag.srf.totl.k2,-42.9169,40.472,-1.060,0.289,-122.353,36.519
eg.cft.accs.ru.zs,5.523e+04,7.86e+04,0.702,0.483,-9.91e+04,2.1e+05
eg.cft.accs.ur.zs,-2.988e+04,3.56e+04,-0.840,0.401,-9.97e+04,3.99e+04
eg.cft.accs.zs,9698.4184,1.02e+05,0.095,0.924,-1.9e+05,2.09e+05
eg.egy.prim.pp.kd,-3.261e+05,1.43e+05,-2.281,0.023,-6.07e+05,-4.55e+04

0,1,2,3
Omnibus:,588.109,Durbin-Watson:,0.468
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10649.335
Skew:,2.647,Prob(JB):,0.0
Kurtosis:,18.911,Cond. No.,1990000000.0


#### OLS Regression

In [13]:
# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the regression model on the training set
model_train = sm.OLS(y_train, X_train).fit()

# Predict on the test set
y_pred = model_train.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, r2

(2445310.3041577484, 0.07036441450201603)

#### Cross Validation for the OLS Model:
- First intent with K fold
- Note for fixing:  Do cross validation in the proportion of affected / total

In [14]:
# 1. Set up K-Fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# 2. Initialize lists to store metrics
mae_scores = []
r2_scores = []

# 3. Perform manual cross-validation with statsmodels
for train_index, test_index in kf.split(X):
    # Split data for this fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Train statsmodels OLS
    model_fold = sm.OLS(y_train_fold, X_train_fold).fit()
    
    # Make predictions
    y_pred_fold = model_fold.predict(X_test_fold)
    
    # Calculate and store performance metrics
    mae = mean_absolute_error(y_test_fold, y_pred_fold)
    r2 = r2_score(y_test_fold, y_pred_fold)
    
    mae_scores.append(mae)
    r2_scores.append(r2)

# 4. Calculate average performance
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)
std_mae = np.std(mae_scores)
std_r2 = np.std(r2_scores)

print(f"Cross-validation MAE: {avg_mae:.4f} ± {std_mae:.4f}")
print(f"Cross-validation R²: {avg_r2:.4f} ± {std_r2:.4f}")

Cross-validation MAE: 2509227.6511 ± 308442.1968
Cross-validation R²: 0.1111 ± 0.1595


#### Random Forest Model

In [15]:
# Convert categorical variables to numerical using one-hot encoding
df_encoded = pd.get_dummies(df.drop(columns=["internally_displaced_persons"]), drop_first=True)

# Fill missing values with the median
df_encoded = df_encoded.fillna(df_encoded.median(numeric_only=True))
y_filled = df["internally_displaced_persons"].fillna(df["internally_displaced_persons"].median())

# Split into train and test sets
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(df_encoded, y_filled, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Predict on test set
y_pred_rf = rf_model.predict(X_test_rf)

# Evaluate model performance
mae_rf = mean_absolute_error(y_test_rf, y_pred_rf)
r2_rf = r2_score(y_test_rf, y_pred_rf)

mae_rf, r2_rf

(177688.49065934066, 0.9658082671660114)

#### Cross Validation for the Random Forest Model

In [16]:
# Setup K-fold cross-validation
k = 5  # adjust?? i think this is already pretty solid
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Prepare data
X = df_encoded
y = y_filled

# Initialize lists to store performance metrics
mae_scores = []
r2_scores = []

# Perform cross-validation
for train_index, test_index in kf.split(X):
    # Split data for this fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Train model
    rf_model.fit(X_train_fold, y_train_fold)
    
    # Make predictions
    y_pred_fold = rf_model.predict(X_test_fold)
    
    # Calculate and store performance metrics
    mae = mean_absolute_error(y_test_fold, y_pred_fold)
    r2 = r2_score(y_test_fold, y_pred_fold)
    
    mae_scores.append(mae)
    r2_scores.append(r2)

# Calculate average performance
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)
std_mae = np.std(mae_scores)
std_r2 = np.std(r2_scores)

print(f"Cross-validation MAE: {avg_mae:.4f} ± {std_mae:.4f}")
print(f"Cross-validation R²: {avg_r2:.4f} ± {std_r2:.4f}")

Cross-validation MAE: 202551.1214 ± 38278.7988
Cross-validation R²: 0.9639 ± 0.0342


### Cross Validation Random Forest Results

- Cross-validation MAE (Mean Absolute Error): 167201.4094 ± 73869.9895: Much lower MAE than the OLS model predicted (10x smaller)
- Predictions with Rnadom Forest are much closer to the actual values (WUUHUU)

- Cross-validation R²: 0.9785 ± 0.0204: Very high R² value (0.9785 compared to ~0.57 for OLS)
- Our model explains nearly 98% of the variance in our data

#### Cross Validation for Ridge and Lasso

In [18]:
# Create a pipeline that first standardizes, then applies Ridge regression
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])

# Create scorers
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Run cross-validation with the pipeline
ridge_mae_scores = -cross_val_score(ridge_pipeline, X, y, cv=5, scoring=mae_scorer)
ridge_r2_scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring=r2_scorer)

print("Ridge Cross-validation Results (with Pipeline):")
print(f"Cross-validation MAE: {ridge_mae_scores.mean():.4f} ± {ridge_mae_scores.std():.4f}")
print(f"Cross-validation R²: {ridge_r2_scores.mean():.4f} ± {ridge_r2_scores.std():.4f}")

Ridge Cross-validation Results (with Pipeline):
Cross-validation MAE: 3216859.9696 ± 967432.1526
Cross-validation R²: -9.4510 ± 18.8342


In [19]:
#Save models 

joblib.dump(model, 'models/ols_model.pkl')
joblib.dump(ridge_pipeline, 'models/ridge_model.pkl')
joblib.dump(rf_model, 'models/rf_model.pkl')


['models/rf_model.pkl']