In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('data/merged_climate_iom_data.csv')

In [3]:
df

Unnamed: 0,year,month,country_name,country_code,internally_displaced_persons,temperature_2m,total_precipitation_sum,potential_evaporation_sum,start_year,start_month,...,en.ghg.n2o.mt.ce.ar5,en.ghg.n2o.pi.mt.ce.ar5,en.ghg.n2o.tr.mt.ce.ar5,en.ghg.n2o.wa.mt.ce.ar5,er.fsh.aqua.mt,er.fsh.capt.mt,er.fsh.prod.mt,er.h2o.fwtl.k3,er.h2o.intr.k3,sp.pop.totl
0,2011,1,Haiti,HTI,1612754,297.774782,0.000533,-0.369376,2010.0,10.0,...,261.2350,32.000,1.029,5.197,2000.00000,71008.000000,73008.000000,62.939919,12.626417,9914904.0
1,2011,2,Sudan,SDN,98298,297.503557,0.154594,-0.253921,,,...,1.3652,17.000,141.000,792.000,600.00000,16530.000000,17130.000000,1.450000,13.007000,36140806.0
2,2011,3,Haiti,HTI,1360988,298.800977,0.175626,-0.253900,2010.0,10.0,...,1.3652,17.000,141.000,792.000,600.00000,16530.000000,17130.000000,1.450000,13.007000,9914904.0
3,2011,3,Sudan,SDN,268848,298.619636,0.136928,-0.221418,,,...,1.3652,17.000,141.000,792.000,600.00000,16530.000000,17130.000000,1.450000,13.007000,36140806.0
4,2011,5,Haiti,HTI,1269614,302.280907,0.031463,-0.252683,2010.0,10.0,...,261.2350,32.000,1.029,5.197,2000.00000,71008.000000,73008.000000,63.578309,13.058291,9914904.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1055,2023,12,Mozambique,MOZ,2128587,296.770002,0.097072,-0.224189,2023.0,10.0,...,9.7108,48.000,492.000,3.932,331829.27764,250906.910133,582736.245727,53.902150,195.872109,33635160.0
1056,2023,12,Nigeria,NGA,2184392,274.393817,0.065047,-0.027804,,,...,220.0240,1.776,2.496,6.177,316286.29493,266449.991167,582736.273807,629.887780,51.182190,227882945.0
1057,2023,12,Sudan,SDN,95451576,,,,,,...,,,,,,,,,,50042791.0
1058,2023,12,Uganda,UGA,5577,,,,,,...,,,,,,,,,,48656601.0


In [4]:
# Sorting by country and time to ensure proper filling
df.sort_values(by=["country_code", "year", "month"], inplace=True)

columns_to_fill_extended = [
    "AG.LND.FRST.K2", "AG.LND.PRCP.MM", "AG.LND.TOTL.K2", "AG.SRF.TOTL.K2",
    "EG.CFT.ACCS.RU.ZS", "EG.CFT.ACCS.UR.ZS", "EG.CFT.ACCS.ZS", "EG.EGY.PRIM.PP.KD", "EG.ELC.ACCS.ZS", "EG.FEC.RNEW.ZS",
    "EN.GHG.ALL.MT.CE.AR5", "EN.GHG.CH4.AG.MT.CE.AR5", "EN.GHG.CH4.BU.MT.CE.AR5", "EN.GHG.CH4.FE.MT.CE.AR5",
    "EN.GHG.CH4.IC.MT.CE.AR5", "EN.GHG.CH4.MT.CE.AR5", "EN.GHG.CH4.PI.MT.CE.AR5", "EN.GHG.CH4.TR.MT.CE.AR5",
    "EN.GHG.CH4.WA.MT.CE.AR5", "EN.GHG.CO2.BU.MT.CE.AR5", "EN.GHG.CO2.IC.MT.CE.AR5", "EN.GHG.CO2.IP.MT.CE.AR5",
    "EN.GHG.CO2.LU.MT.CE.AR5", "EN.GHG.CO2.MT.CE.AR5", "EN.GHG.CO2.PI.MT.CE.AR5", "EN.GHG.CO2.TR.MT.CE.AR5",
    "EN.GHG.FGAS.IP.MT.CE.AR5", "EN.GHG.N2O.AG.MT.CE.AR5", "EN.GHG.N2O.BU.MT.CE.AR5", "EN.GHG.N2O.FE.MT.CE.AR5",
    "EN.GHG.N2O.IC.MT.CE.AR5", "EN.GHG.N2O.IP.MT.CE.AR5", "EN.GHG.N2O.MT.CE.AR5", "EN.GHG.N2O.PI.MT.CE.AR5",
    "EN.GHG.N2O.TR.MT.CE.AR5", "EN.GHG.N2O.WA.MT.CE.AR5", "ER.FSH.AQUA.MT", "ER.FSH.CAPT.MT", "ER.FSH.PROD.MT",
    "ER.H2O.FWTL.K3", "ER.H2O.INTR.K3", "SP.POP.TOTL"
]

columns_to_fill_extended = [col.lower() for col in columns_to_fill_extended]

# Filtering columns that exist in the dataset
existing_columns = [col for col in columns_to_fill_extended if col in df.columns]

df[existing_columns] = df.groupby("country_code")[existing_columns].transform(lambda x: x.ffill().bfill())

In [5]:
df['total_idp_over_pop'] = (df['internally_displaced_persons'] / df['sp.pop.totl'])*100
df['total_affected_over_pop'] = (df['total_affected'] / df['sp.pop.totl'])*100

In [6]:
weird_values = df[df['total_idp_over_pop'] > 100]

In [7]:
len(weird_values)

8

In [8]:
weird_values[['year','country_name', 'sp.pop.totl','internally_displaced_persons',  'total_idp_over_pop', 'total_affected_over_pop']]

Unnamed: 0,year,country_name,sp.pop.totl,internally_displaced_persons,total_idp_over_pop,total_affected_over_pop
1024,2023,Sudan,50042791.0,63571047,127.033376,
1032,2023,Sudan,50042791.0,63441327,126.774158,
1040,2023,Sudan,50042791.0,55837788,111.580084,
1046,2023,Sudan,50042791.0,76711659,153.292128,
1057,2023,Sudan,50042791.0,95451576,190.739913,
872,2021,South Sudan,10865780.0,12058016,110.972392,7.684676
873,2021,South Sudan,10865780.0,12058016,110.972392,71.082518
887,2021,South Sudan,10865780.0,11148285,102.599951,71.082518


In [9]:
df = df[df['total_idp_over_pop'] < 100]

In [10]:
pd.options.display.float_format = '{:.2f}'.format
df[['total_idp_over_pop', 'internally_displaced_persons']].describe()

Unnamed: 0,total_idp_over_pop,internally_displaced_persons
count,909.0,909.0
mean,8.61,2918213.12
std,10.65,3994012.43
min,0.0,166.0
25%,1.69,318591.0
50%,4.7,1002159.0
75%,11.59,4454218.0
max,77.73,38898405.0


In [11]:
grouped_df = df.groupby(['country_code', 'year'])[['total_idp_over_pop', 'total_affected_over_pop']].mean().reset_index()

In [13]:
df

Unnamed: 0,year,month,country_name,country_code,internally_displaced_persons,temperature_2m,total_precipitation_sum,potential_evaporation_sum,start_year,start_month,...,en.ghg.n2o.tr.mt.ce.ar5,en.ghg.n2o.wa.mt.ce.ar5,er.fsh.aqua.mt,er.fsh.capt.mt,er.fsh.prod.mt,er.h2o.fwtl.k3,er.h2o.intr.k3,sp.pop.totl,total_idp_over_pop,total_affected_over_pop
340,2017,3,Afghanistan,AFG,739086,297.96,0.16,-0.46,,,...,139.08,201.67,317303.86,265432.89,582736.78,117.90,107.28,35688935.00,2.07,
380,2017,6,Afghanistan,AFG,2835546,306.43,0.00,-0.37,,,...,2.79,4.08,31814.00,35220.00,67034.00,43.06,35.20,35688935.00,7.95,
459,2017,12,Afghanistan,AFG,5184471,297.58,0.15,-0.42,,,...,139.32,201.45,317303.10,265433.58,582736.71,117.90,107.11,35688935.00,14.53,
490,2018,3,Afghanistan,AFG,5531499,301.56,0.00,-0.35,,,...,127.00,2.00,105.00,107000.00,107105.00,8.80,15.00,36743039.00,15.05,
521,2018,6,Afghanistan,AFG,5600775,298.52,0.12,-0.23,2018.00,4.00,...,2.00,402.00,204.00,29000.00,29204.00,725.00,141.00,36743039.00,15.24,36.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,2019,5,Zimbabwe,ZWE,75900,293.23,0.03,-0.20,2017.00,2.00,...,59.00,807.00,495.00,20372.00,20867.00,2.80,10.06,15271368.00,0.50,45.18
769,2020,7,Zimbabwe,ZWE,105498,303.72,0.00,-0.29,,,...,1.54,917.00,10.00,31803.55,31813.55,5.72,0.70,15526888.00,0.68,
795,2020,11,Zimbabwe,ZWE,58548,301.41,0.00,-0.47,,,...,25.00,412.00,204.00,27500.00,27704.00,725.00,141.00,15526888.00,0.38,
822,2021,3,Zimbabwe,ZWE,122526,297.90,0.10,-0.26,,,...,874.00,1.41,740.00,73140.00,73880.00,10.55,122.00,15797210.00,0.78,


In [14]:
env_factors = [
'ag.lnd.frst.k2', 'ag.lnd.prcp.mm',
       'ag.lnd.totl.k2', 'ag.srf.totl.k2', 'eg.cft.accs.ru.zs',
       'eg.cft.accs.ur.zs', 'eg.cft.accs.zs', 'eg.egy.prim.pp.kd',
       'eg.elc.accs.zs', 'eg.fec.rnew.zs', 'en.ghg.all.mt.ce.ar5',
       'en.ghg.ch4.ag.mt.ce.ar5', 'en.ghg.ch4.bu.mt.ce.ar5',
       'en.ghg.ch4.fe.mt.ce.ar5', 'en.ghg.ch4.ic.mt.ce.ar5',
       'en.ghg.ch4.mt.ce.ar5', 'en.ghg.ch4.pi.mt.ce.ar5',
       'en.ghg.ch4.tr.mt.ce.ar5', 'en.ghg.ch4.wa.mt.ce.ar5',
       'en.ghg.co2.bu.mt.ce.ar5', 'en.ghg.co2.ic.mt.ce.ar5',
       'en.ghg.co2.ip.mt.ce.ar5', 'en.ghg.co2.lu.mt.ce.ar5',
       'en.ghg.co2.mt.ce.ar5', 'en.ghg.co2.pi.mt.ce.ar5',
       'en.ghg.co2.tr.mt.ce.ar5', 'en.ghg.fgas.ip.mt.ce.ar5',
       'en.ghg.n2o.ag.mt.ce.ar5', 'en.ghg.n2o.bu.mt.ce.ar5',
       'en.ghg.n2o.fe.mt.ce.ar5', 'en.ghg.n2o.ic.mt.ce.ar5',
       'en.ghg.n2o.ip.mt.ce.ar5', 'en.ghg.n2o.mt.ce.ar5',
       'en.ghg.n2o.pi.mt.ce.ar5', 'en.ghg.n2o.tr.mt.ce.ar5',
       'en.ghg.n2o.wa.mt.ce.ar5', 'er.fsh.aqua.mt', 'er.fsh.capt.mt',
       'er.fsh.prod.mt', 'er.h2o.fwtl.k3', 'er.h2o.intr.k3', 'sp.pop.totl',
       'cpi_value', 'total_affected']

#impute mean to nas in env_factors

for col in env_factors:
       if col == 'total_affected':
              df[col].fillna(0, inplace=True)
       else:
              df[col].fillna(df[col].mean(), inplace=True)

# Drop rows with missing values in the selected columns
df_clean = df[["internally_displaced_persons"] + env_factors].dropna()

# Define independent (X) and dependent (y) variables
X = df_clean[env_factors]  # Environmental factors
y = df_clean["internally_displaced_persons"]  # Displacement

# Add a constant term for the regression model
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Display model summary
model.summary()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) inst

0,1,2,3
Dep. Variable:,internally_displaced_persons,R-squared:,0.209
Model:,OLS,Adj. R-squared:,0.169
Method:,Least Squares,F-statistic:,5.203
Date:,"Sat, 03 May 2025",Prob (F-statistic):,3.9399999999999996e-23
Time:,17:58:11,Log-Likelihood:,-15000.0
No. Observations:,909,AIC:,30090.0
Df Residuals:,864,BIC:,30310.0
Df Model:,44,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.211e+06,3.1e+06,0.391,0.696,-4.87e+06,7.3e+06
ag.lnd.frst.k2,18.2729,5.851,3.123,0.002,6.789,29.757
ag.lnd.prcp.mm,138.8308,1269.784,0.109,0.913,-2353.392,2631.054
ag.lnd.totl.k2,58.8345,40.713,1.445,0.149,-21.073,138.742
ag.srf.totl.k2,-60.7259,40.155,-1.512,0.131,-139.539,18.087
eg.cft.accs.ru.zs,1.031e+05,8.2e+04,1.257,0.209,-5.79e+04,2.64e+05
eg.cft.accs.ur.zs,-4482.0706,3.57e+04,-0.125,0.900,-7.46e+04,6.57e+04
eg.cft.accs.zs,-4.467e+04,1.01e+05,-0.444,0.657,-2.42e+05,1.53e+05
eg.egy.prim.pp.kd,-2.666e+05,1.3e+05,-2.057,0.040,-5.21e+05,-1.22e+04

0,1,2,3
Omnibus:,596.306,Durbin-Watson:,0.462
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10692.654
Skew:,2.705,Prob(JB):,0.0
Kurtosis:,18.908,Cond. No.,5490000000.0


#### OLS Regression

In [15]:
# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the regression model on the training set
model_train = sm.OLS(y_train, X_train).fit()

# Predict on the test set
y_pred = model_train.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, r2

(2406618.867375053, 0.11262590848843468)

#### Cross Validation for the OLS Model:
- First intent with K fold
- Note for fixing:  Do cross validation in the proportion of affected / total

In [16]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

# 1. Set up K-Fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# 2. Initialize lists to store metrics
mae_scores = []
r2_scores = []

# 3. Perform manual cross-validation with statsmodels
for train_index, test_index in kf.split(X):
    # Split data for this fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Train statsmodels OLS
    model_fold = sm.OLS(y_train_fold, X_train_fold).fit()
    
    # Make predictions
    y_pred_fold = model_fold.predict(X_test_fold)
    
    # Calculate and store performance metrics
    mae = mean_absolute_error(y_test_fold, y_pred_fold)
    r2 = r2_score(y_test_fold, y_pred_fold)
    
    mae_scores.append(mae)
    r2_scores.append(r2)

# 4. Calculate average performance
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)
std_mae = np.std(mae_scores)
std_r2 = np.std(r2_scores)

print(f"Cross-validation MAE: {avg_mae:.4f} ± {std_mae:.4f}")
print(f"Cross-validation R²: {avg_r2:.4f} ± {std_r2:.4f}")

Cross-validation MAE: 2503322.8063 ± 311360.1343
Cross-validation R²: 0.1143 ± 0.1665


#### Random Forest Model

In [17]:
# Convert categorical variables to numerical using one-hot encoding
df_encoded = pd.get_dummies(df.drop(columns=["internally_displaced_persons"]), drop_first=True)

# Fill missing values with the median
df_encoded = df_encoded.fillna(df_encoded.median(numeric_only=True))
y_filled = df["internally_displaced_persons"].fillna(df["internally_displaced_persons"].median())

# Split into train and test sets
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(df_encoded, y_filled, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Predict on test set
y_pred_rf = rf_model.predict(X_test_rf)

# Evaluate model performance
mae_rf = mean_absolute_error(y_test_rf, y_pred_rf)
r2_rf = r2_score(y_test_rf, y_pred_rf)

mae_rf, r2_rf

(170964.6009340659, 0.9670834377427312)

#### Cross Validation for the Random Forest Model

In [None]:
# Setup K-fold cross-validation
k = 5  # adjust?? i think this is already pretty solid
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Prepare data
X = df_encoded
y = y_filled

# Initialize lists to store performance metrics
mae_scores = []
r2_scores = []

# Perform cross-validation
for train_index, test_index in kf.split(X):
    # Split data for this fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Train model
    rf_model.fit(X_train_fold, y_train_fold)
    
    # Make predictions
    y_pred_fold = rf_model.predict(X_test_fold)
    
    # Calculate and store performance metrics
    mae = mean_absolute_error(y_test_fold, y_pred_fold)
    r2 = r2_score(y_test_fold, y_pred_fold)
    
    mae_scores.append(mae)
    r2_scores.append(r2)

# Calculate average performance
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)
std_mae = np.std(mae_scores)
std_r2 = np.std(r2_scores)

print(f"Cross-validation MAE: {avg_mae:.4f} ± {std_mae:.4f}")
print(f"Cross-validation R²: {avg_r2:.4f} ± {std_r2:.4f}")

Cross-validation MAE: 167201.4094 ± 73869.9895
Cross-validation R²: 0.9785 ± 0.0204


### Cross Validation Random Forest Results

- Cross-validation MAE (Mean Absolute Error): 167201.4094 ± 73869.9895: Much lower MAE than the OLS model predicted (10x smaller)
- Predictions with Rnadom Forest are much closer to the actual values (WUUHUU)

- Cross-validation R²: 0.9785 ± 0.0204: Very high R² value (0.9785 compared to ~0.57 for OLS)
- Our model explains nearly 98% of the variance in our data

#### Cross Validation for Ridge and Lasso

In [None]:
# Create a pipeline that first standardizes, then applies Ridge regression
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])

# Create scorers
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Run cross-validation with the pipeline
ridge_mae_scores = -cross_val_score(ridge_pipeline, X, y, cv=5, scoring=mae_scorer)
ridge_r2_scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring=r2_scorer)

print("Ridge Cross-validation Results (with Pipeline):")
print(f"Cross-validation MAE: {ridge_mae_scores.mean():.4f} ± {ridge_mae_scores.std():.4f}")
print(f"Cross-validation R²: {ridge_r2_scores.mean():.4f} ± {ridge_r2_scores.std():.4f}")

Ridge Cross-validation Results (with Pipeline):
Cross-validation MAE: 2862005.5804 ± 393197.3814
Cross-validation R²: -4.8681 ± 9.4525
