In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv('data/merged_climate_iom_data.csv')

In [4]:
df

Unnamed: 0,year,month,country_name,country_code,internally_displaced_persons,ag.lnd.frst.k2,ag.lnd.prcp.mm,ag.lnd.totl.k2,ag.srf.totl.k2,eg.cft.accs.ru.zs,...,er.h2o.intr.k3,sp.pop.totl,start_year,start_month,end_year,end_month,disaster_type,cpi,total_affected,climate_catastrophe
0,2010,6,Sudan,SDN,30933,,,,,,...,,,,,,,,,,0
1,2010,11,Haiti,HTI,2137764,,,,,,...,,,2010.0,10.0,2011.0,12.0,Epidemic,71.563596,513997.0,1
2,2010,11,Haiti,HTI,2137764,,,,,,...,,,2010.0,11.0,2010.0,11.0,Storm,71.563596,5020.0,1
3,2011,1,Haiti,HTI,1612754,3752.74,1440.0,27560.0,27750.0,1.0,...,13.007,9914904.0,2010.0,10.0,2011.0,12.0,Epidemic,71.563596,513997.0,1
4,2011,2,Sudan,SDN,98298,,,,2505810.0,25.9,...,,36140806.0,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139,2025,1,Lebanon,LBN,1333317,,,,,,...,,,,,,,,,,0
1140,2025,1,Sudan,SDN,69461010,,,,,,...,,,2024.0,8.0,2025.0,3.0,Epidemic,,57447.0,1
1141,2025,2,Lebanon,LBN,1236267,,,,,,...,,,,,,,,,,0
1142,2025,2,Syrian Arab Republic,SYR,20983938,,,,,,...,,,,,,,,,,0


In [5]:
# Sorting by country and time to ensure proper filling
df.sort_values(by=["country_code", "year", "month"], inplace=True)

columns_to_fill_extended = [
    "AG.LND.FRST.K2", "AG.LND.PRCP.MM", "AG.LND.TOTL.K2", "AG.SRF.TOTL.K2",
    "EG.CFT.ACCS.RU.ZS", "EG.CFT.ACCS.UR.ZS", "EG.CFT.ACCS.ZS", "EG.EGY.PRIM.PP.KD", "EG.ELC.ACCS.ZS", "EG.FEC.RNEW.ZS",
    "EN.GHG.ALL.MT.CE.AR5", "EN.GHG.CH4.AG.MT.CE.AR5", "EN.GHG.CH4.BU.MT.CE.AR5", "EN.GHG.CH4.FE.MT.CE.AR5",
    "EN.GHG.CH4.IC.MT.CE.AR5", "EN.GHG.CH4.MT.CE.AR5", "EN.GHG.CH4.PI.MT.CE.AR5", "EN.GHG.CH4.TR.MT.CE.AR5",
    "EN.GHG.CH4.WA.MT.CE.AR5", "EN.GHG.CO2.BU.MT.CE.AR5", "EN.GHG.CO2.IC.MT.CE.AR5", "EN.GHG.CO2.IP.MT.CE.AR5",
    "EN.GHG.CO2.LU.MT.CE.AR5", "EN.GHG.CO2.MT.CE.AR5", "EN.GHG.CO2.PI.MT.CE.AR5", "EN.GHG.CO2.TR.MT.CE.AR5",
    "EN.GHG.FGAS.IP.MT.CE.AR5", "EN.GHG.N2O.AG.MT.CE.AR5", "EN.GHG.N2O.BU.MT.CE.AR5", "EN.GHG.N2O.FE.MT.CE.AR5",
    "EN.GHG.N2O.IC.MT.CE.AR5", "EN.GHG.N2O.IP.MT.CE.AR5", "EN.GHG.N2O.MT.CE.AR5", "EN.GHG.N2O.PI.MT.CE.AR5",
    "EN.GHG.N2O.TR.MT.CE.AR5", "EN.GHG.N2O.WA.MT.CE.AR5", "ER.FSH.AQUA.MT", "ER.FSH.CAPT.MT", "ER.FSH.PROD.MT",
    "ER.H2O.FWTL.K3", "ER.H2O.INTR.K3", "SP.POP.TOTL"
]

columns_to_fill_extended = [col.lower() for col in columns_to_fill_extended]

# Filtering columns that exist in the dataset
existing_columns = [col for col in columns_to_fill_extended if col in df.columns]

df[existing_columns] = df.groupby("country_code")[existing_columns].transform(lambda x: x.ffill().bfill())

In [6]:
df['total_idp_over_pop'] = (df['internally_displaced_persons'] / df['sp.pop.totl'])*100
df['total_affected_over_pop'] = (df['total_affected'] / df['sp.pop.totl'])*100

In [7]:
weird_values = df[df['total_idp_over_pop'] > 100]

In [8]:
len(weird_values)

23

In [9]:
weird_values[['year','country_name', 'sp.pop.totl','internally_displaced_persons',  'total_idp_over_pop', 'total_affected_over_pop']]

Unnamed: 0,year,country_name,sp.pop.totl,internally_displaced_persons,total_idp_over_pop,total_affected_over_pop
1123,2024,Lebanon,5773493.0,21095955,365.393272,
1127,2024,Lebanon,5773493.0,18569637,321.636088,
1027,2023,Sudan,50042791.0,63571047,127.033376,
1035,2023,Sudan,50042791.0,63441327,126.774158,
1043,2023,Sudan,50042791.0,55837788,111.580084,
1049,2023,Sudan,50042791.0,76711659,153.292128,
1060,2023,Sudan,50042791.0,95451576,190.739913,
1068,2024,Sudan,50042791.0,72761718,145.399001,
1075,2024,Sudan,50042791.0,94069125,187.977375,
1081,2024,Sudan,50042791.0,78443607,156.753062,


In [10]:
df = df[df['total_idp_over_pop'] < 100]

In [11]:
pd.options.display.float_format = '{:.2f}'.format
df[['total_idp_over_pop', 'internally_displaced_persons']].describe()

Unnamed: 0,total_idp_over_pop,internally_displaced_persons
count,962.0,962.0
mean,8.98,2939246.39
std,10.97,4059056.5
min,0.0,166.0
25%,1.79,326019.0
50%,4.75,1027072.5
75%,12.16,4415163.0
max,77.73,38898405.0


In [12]:
grouped_df = df.groupby(['country_code', 'year'])[['total_idp_over_pop', 'total_affected_over_pop']].mean().reset_index()

In [16]:
env_factors = [
'ag.lnd.frst.k2', 'ag.lnd.prcp.mm',
       'ag.lnd.totl.k2', 'ag.srf.totl.k2', 'eg.cft.accs.ru.zs',
       'eg.cft.accs.ur.zs', 'eg.cft.accs.zs', 'eg.egy.prim.pp.kd',
       'eg.elc.accs.zs', 'eg.fec.rnew.zs', 'en.ghg.all.mt.ce.ar5',
       'en.ghg.ch4.ag.mt.ce.ar5', 'en.ghg.ch4.bu.mt.ce.ar5',
       'en.ghg.ch4.fe.mt.ce.ar5', 'en.ghg.ch4.ic.mt.ce.ar5',
       'en.ghg.ch4.mt.ce.ar5', 'en.ghg.ch4.pi.mt.ce.ar5',
       'en.ghg.ch4.tr.mt.ce.ar5', 'en.ghg.ch4.wa.mt.ce.ar5',
       'en.ghg.co2.bu.mt.ce.ar5', 'en.ghg.co2.ic.mt.ce.ar5',
       'en.ghg.co2.ip.mt.ce.ar5', 'en.ghg.co2.lu.mt.ce.ar5',
       'en.ghg.co2.mt.ce.ar5', 'en.ghg.co2.pi.mt.ce.ar5',
       'en.ghg.co2.tr.mt.ce.ar5', 'en.ghg.fgas.ip.mt.ce.ar5',
       'en.ghg.n2o.ag.mt.ce.ar5', 'en.ghg.n2o.bu.mt.ce.ar5',
       'en.ghg.n2o.fe.mt.ce.ar5', 'en.ghg.n2o.ic.mt.ce.ar5',
       'en.ghg.n2o.ip.mt.ce.ar5', 'en.ghg.n2o.mt.ce.ar5',
       'en.ghg.n2o.pi.mt.ce.ar5', 'en.ghg.n2o.tr.mt.ce.ar5',
       'en.ghg.n2o.wa.mt.ce.ar5', 'er.fsh.aqua.mt', 'er.fsh.capt.mt',
       'er.fsh.prod.mt', 'er.h2o.fwtl.k3', 'er.h2o.intr.k3', 'sp.pop.totl',
       'cpi', 'total_affected']

#impute mean to nas in env_factors

for col in env_factors:
       if col == 'total_affected':
              df[col].fillna(0, inplace=True)
       else:
              df[col].fillna(df[col].mean(), inplace=True)

# Drop rows with missing values in the selected columns
df_clean = df[["internally_displaced_persons"] + env_factors].dropna()

# Define independent (X) and dependent (y) variables
X = df_clean[env_factors]  # Environmental factors
y = df_clean["internally_displaced_persons"]  # Displacement

# Add a constant term for the regression model
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Display model summary
model.summary()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) inst

0,1,2,3
Dep. Variable:,internally_displaced_persons,R-squared:,0.623
Model:,OLS,Adj. R-squared:,0.605
Method:,Least Squares,F-statistic:,34.5
Date:,"Sun, 06 Apr 2025",Prob (F-statistic):,4.45e-163
Time:,15:56:28,Log-Likelihood:,-15533.0
No. Observations:,962,AIC:,31160.0
Df Residuals:,917,BIC:,31380.0
Df Model:,44,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.857e+06,2.15e+06,0.864,0.388,-2.36e+06,6.07e+06
ag.lnd.frst.k2,3.0537,2.823,1.082,0.280,-2.487,8.594
ag.lnd.prcp.mm,-648.3568,594.658,-1.090,0.276,-1815.406,518.693
ag.lnd.totl.k2,6.6847,2.235,2.991,0.003,2.298,11.071
ag.srf.totl.k2,-5.9569,2.070,-2.878,0.004,-10.020,-1.894
eg.cft.accs.ru.zs,2.251e+05,3.8e+04,5.927,0.000,1.51e+05,3e+05
eg.cft.accs.ur.zs,1.048e+05,2.1e+04,4.996,0.000,6.36e+04,1.46e+05
eg.cft.accs.zs,-2.703e+05,5.54e+04,-4.876,0.000,-3.79e+05,-1.61e+05
eg.egy.prim.pp.kd,-3.035e+05,8.25e+04,-3.678,0.000,-4.66e+05,-1.42e+05

0,1,2,3
Omnibus:,862.379,Durbin-Watson:,0.627
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58487.915
Skew:,3.755,Prob(JB):,0.0
Kurtosis:,40.453,Cond. No.,2050000000.0


#### OLS Regression

In [32]:
# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the regression model on the training set
model_train = sm.OLS(y_train, X_train).fit()

# Predict on the test set
y_pred = model_train.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, r2

(1554611.431787336, 0.6596509992462433)

#### Cross Validation for the OLS Model:
- First intent with K fold
- Note for fixing:  Do cross validation in the proportion of affected / total

In [33]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

# 1. Set up K-Fold cross-validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# 2. Initialize lists to store metrics
mae_scores = []
r2_scores = []

# 3. Perform manual cross-validation with statsmodels
for train_index, test_index in kf.split(X):
    # Split data for this fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Train statsmodels OLS
    model_fold = sm.OLS(y_train_fold, X_train_fold).fit()
    
    # Make predictions
    y_pred_fold = model_fold.predict(X_test_fold)
    
    # Calculate and store performance metrics
    mae = mean_absolute_error(y_test_fold, y_pred_fold)
    r2 = r2_score(y_test_fold, y_pred_fold)
    
    mae_scores.append(mae)
    r2_scores.append(r2)

# 4. Calculate average performance
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)
std_mae = np.std(mae_scores)
std_r2 = np.std(r2_scores)

print(f"Cross-validation MAE: {avg_mae:.4f} ± {std_mae:.4f}")
print(f"Cross-validation R²: {avg_r2:.4f} ± {std_r2:.4f}")

Cross-validation MAE: 1616452.0835 ± 137111.9160
Cross-validation R²: 0.5794 ± 0.1040


#### Random Forest Model

In [24]:
# Convert categorical variables to numerical using one-hot encoding
df_encoded = pd.get_dummies(df.drop(columns=["internally_displaced_persons"]), drop_first=True)

# Fill missing values with the median
df_encoded = df_encoded.fillna(df_encoded.median(numeric_only=True))
y_filled = df["internally_displaced_persons"].fillna(df["internally_displaced_persons"].median())

# Split into train and test sets
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(df_encoded, y_filled, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Predict on test set
y_pred_rf = rf_model.predict(X_test_rf)

# Evaluate model performance
mae_rf = mean_absolute_error(y_test_rf, y_pred_rf)
r2_rf = r2_score(y_test_rf, y_pred_rf)

mae_rf, r2_rf

(87824.00860103627, 0.997590950349355)

#### Cross Validation for the Random Forest Model

In [40]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Setup K-fold cross-validation
k = 5  # adjust?? i think this is already pretty solid
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize the Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Prepare data
X = df_encoded
y = y_filled

# Initialize lists to store performance metrics
mae_scores = []
r2_scores = []

# Perform cross-validation
for train_index, test_index in kf.split(X):
    # Split data for this fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    # Train model
    rf_model.fit(X_train_fold, y_train_fold)
    
    # Make predictions
    y_pred_fold = rf_model.predict(X_test_fold)
    
    # Calculate and store performance metrics
    mae = mean_absolute_error(y_test_fold, y_pred_fold)
    r2 = r2_score(y_test_fold, y_pred_fold)
    
    mae_scores.append(mae)
    r2_scores.append(r2)

# Calculate average performance
avg_mae = np.mean(mae_scores)
avg_r2 = np.mean(r2_scores)
std_mae = np.std(mae_scores)
std_r2 = np.std(r2_scores)

print(f"Cross-validation MAE: {avg_mae:.4f} ± {std_mae:.4f}")
print(f"Cross-validation R²: {avg_r2:.4f} ± {std_r2:.4f}")

Cross-validation MAE: 167201.4094 ± 73869.9895
Cross-validation R²: 0.9785 ± 0.0204


### Cross Validation Random Forest Results

- Cross-validation MAE (Mean Absolute Error): 167201.4094 ± 73869.9895: Much lower MAE than the OLS model predicted (10x smaller)
- Predictions with Rnadom Forest are much closer to the actual values (WUUHUU)

- Cross-validation R²: 0.9785 ± 0.0204: Very high R² value (0.9785 compared to ~0.57 for OLS)
- Our model explains nearly 98% of the variance in our data

In [25]:
# Initialize Ridge and Lasso models
ridge_model = Ridge(alpha=1.0)  # Alpha is the regularization strength
lasso_model = Lasso(alpha=0.1)  # Alpha for Lasso

# Train Ridge Regression
ridge_model.fit(X_train_rf, y_train_rf)
y_pred_ridge = ridge_model.predict(X_test_rf)
mae_ridge = mean_absolute_error(y_test_rf, y_pred_ridge)
r2_ridge = r2_score(y_test_rf, y_pred_ridge)

# Train Lasso Regression
lasso_model.fit(X_train_rf, y_train_rf)
y_pred_lasso = lasso_model.predict(X_test_rf)
mae_lasso = mean_absolute_error(y_test_rf, y_pred_lasso)
r2_lasso = r2_score(y_test_rf, y_pred_lasso)

# Output results
(mae_ridge, r2_ridge), (mae_lasso, r2_lasso)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  model = cd_fast.enet_coordinate_descent(


((821144.5480904968, 0.8999417595115521),
 (895220.2943734545, 0.7833936852467105))

#### Cross Validation for Ridge and Lasso

In [44]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Create a pipeline that first standardizes, then applies Ridge regression
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])

# Create scorers
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Run cross-validation with the pipeline
ridge_mae_scores = -cross_val_score(ridge_pipeline, X, y, cv=5, scoring=mae_scorer)
ridge_r2_scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring=r2_scorer)

print("Ridge Cross-validation Results (with Pipeline):")
print(f"Cross-validation MAE: {ridge_mae_scores.mean():.4f} ± {ridge_mae_scores.std():.4f}")
print(f"Cross-validation R²: {ridge_r2_scores.mean():.4f} ± {ridge_r2_scores.std():.4f}")

Ridge Cross-validation Results (with Pipeline):
Cross-validation MAE: 7886554.5981 ± 7106696.7234
Cross-validation R²: -50.1702 ± 74.7025


In [43]:
# Lasso

# Create a pipeline that first standardizes, then applies Ridge regression
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Lasso(alpha=1.0))
])

# Create scorers
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# Run cross-validation for Lasso
print("Lasso Cross-validation Results:")
lasso_mae_scores = -cross_val_score(lasso_model, X, y, cv=5, scoring=mae_scorer)
lasso_r2_scores = cross_val_score(lasso_model, X, y, cv=5, scoring=r2_scorer)
print(f"Cross-validation MAE: {lasso_mae_scores.mean():.4f} ± {lasso_mae_scores.std():.4f}")
print(f"Cross-validation R²: {lasso_r2_scores.mean():.4f} ± {lasso_r2_scores.std():.4f}")

Lasso Cross-validation Results:


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Cross-validation MAE: 11904304.3910 ± 10302479.3946
Cross-validation R²: -127.2428 ± 189.8866


  model = cd_fast.enet_coordinate_descent(
