In [1]:
import pandas as pd

In [2]:
country_indicators_raw = pd.read_csv("data/environment_pop_data_all.csv")
inflation_data = pd.read_csv("data/economic/inflation_all_countries_sorted_cleaned.csv")
migration_data = pd.read_csv('data/migration/full_iom_dtm_data.csv')
climate_data = pd.read_csv(r"data\climate_data\ERA5_Monthly_Climate_Weather_FULL.csv")
emdat_cc_expanded = pd.read_excel('data/climate_catastrophes/emdat_full.xlsx') 

In [3]:
def standardize_headers(df, func=None):
    df.columns = df.columns.str.replace(' ', '_').str.lower()
    if func:
      df = df.apply(func)
    return df

def remove_high_na_columns(df, threshold=0.3):
    """
    Removes columns from a DataFrame if more than `threshold` proportion of values are NA or 0.
    
    Parameters:
    df (pd.DataFrame): The input DataFrame.
    threshold (float): The proportion of missing (or zero) values above which a column is removed.
    
    Returns:
    pd.DataFrame: The cleaned DataFrame.
    """
    # Treat 0s as NA
    df_replaced = df.replace(0, pd.NA)
    
    # Compute the proportion of missing values per column
    missing_ratio = df_replaced.isna().mean()
    
    # Keep columns where the missing ratio is below the threshold
    return df.loc[:, missing_ratio <= threshold]

In [4]:
country_indicators_long = pd.melt(country_indicators_raw, id_vars= ['Country Code', 'Indicator Name', 'Indicator Code'], var_name = 'year')
country_indicators_long[['year', 'value']] = country_indicators_long[['year', 'value']].apply(pd.to_numeric, errors='coerce')
country_indicators_long = country_indicators_long[country_indicators_long['year'] > 2010]
country_indicators_long.drop(columns = ['Indicator Name'], inplace=True)
country_indicators = country_indicators_long.pivot_table(index=['Country Code', 'year'], columns='Indicator Code', values='value').reset_index()
country_indicators = remove_high_na_columns(country_indicators)

In [5]:
migration_data['date'] = pd.to_datetime(migration_data['reportingDate'])
migration_data['month'] = migration_data['date'].dt.month
migration_data['year'] = migration_data['date'].dt.year

# Define a more descriptive column mapping for the columns we want to keep
column_mapping = {
    'admin0Name': 'country_name',  # Keep country name
    'admin0Pcode': 'country_code', # In case you need to map country codes to names
    'numPresentIdpInd': 'internally_displaced_persons',
    'date': 'date',
    'year': 'year',
    'month': 'month'
    
}

# List of columns to keep (everything else will be dropped)
columns_to_keep = list(column_mapping.keys())

# Keep only the columns we need
migration_data = migration_data[columns_to_keep]
# Rename the columns
migration_data.rename(columns=column_mapping, inplace=True)

total_idps_per_month = migration_data[['year', 'month', 'country_name','country_code', 'internally_displaced_persons']].groupby(['year', 'month', 'country_name', 'country_code']).sum().reset_index()

In [6]:
# extract year and month, and convert to numeric
climate_data = standardize_headers(climate_data)

climate_data['year'] = climate_data['date'].str.split('-').str[0].astype(int)
climate_data['month'] = climate_data['date'].str.split('-').str[1].astype(int)

climate_data.drop(columns=['date', 'system:time_start'], inplace=True)

# Create a mapping dictionary for country names
country_mapping = {
    'Chad': 'Chad',
    'Malawi': 'Malawi',
    'Zambia': 'Zambia',
    'Zimbabwe': 'Zimbabwe',
    'Burundi': 'Burundi',
    'South Africa': 'South Africa',
    'Niger': 'Niger',
    'Sudan': 'Sudan',
    'Libya': 'Libya',
    'Ethiopia': 'Ethiopia',
    'Djibouti': 'Djibouti',
    'Somalia': 'Somalia',
    'South Sudan': 'South Sudan',
    'Kenya': 'Kenya',
    'Uganda': 'Uganda',
    'Mozambique': 'Mozambique',
    'Madagascar': 'Madagascar',
    'Mali': 'Mali',
    'Burkina Faso': 'Burkina Faso',
    'Benin': 'Benin',
    'Nigeria': 'Nigeria',
    'Cameroon': 'Cameroon',
    'Ukraine': 'Ukraine',
    'Mongolia': 'Mongolia',
    'Afghanistan': 'Afghanistan',
    'Pakistan': 'Pakistan',
    'Nepal': 'Nepal',
    'Sri Lanka': 'Sri Lanka',  # or 'Sri lanka'
    'Papua New Guinea': 'Papua New Guinea',
    'Fiji': 'Fiji',
    'Vanuatu': 'Vanuatu',
    'Indonesia': 'Indonesia',
    'Iraq': 'Iraq',
    'Lebanon': 'Lebanon',
    'Yemen': 'Yemen',
    'Dominica': 'Dominica',
    'Grenada': 'Grenada',
    'Haiti': 'Haiti',
    'Ecuador': 'Ecuador',
    'Peru': 'Peru',
    'El Salvador': 'El Salvador',
    'Guatemala': 'Guatemala',
    'Honduras': 'Honduras',
    'Philippines': 'Philippines (the)',
    'Laos': "Lao People's Democratic Republic",
    'Armenia': 'Republic of Armenia',
    'Syria': 'Syrian Arab Republic',
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Democratic Republic of the Congo': 'Democratic Republic of the Congo',
    'Central African Republic': 'Central African Republic',
    'Antigua and Barbuda': 'Antigua and Barbuda',
    'Bahamas': 'Bahamas (the)',
    'Saint Vincent and the Grenadines': 'Saint Vincent and the Grenadines'
}

# Create a new column in climate_data with the mapped names
climate_data['country_name'] = climate_data['country_na'].map(country_mapping)
climate_data.drop(columns=['country_na'], inplace=True)

In [7]:
climate_data

Unnamed: 0,temperature_2m,total_precipitation_sum,potential_evaporation_sum,year,month,country_name
0,295.270399,0.000029,-0.274852,2010,1,Chad
1,296.915585,0.248372,-0.210955,2010,1,Malawi
2,295.522374,0.237523,-0.234153,2010,1,Zambia
3,297.442308,0.137571,-0.270994,2010,1,Zimbabwe
4,293.489225,0.242564,-0.201473,2010,1,Burundi
...,...,...,...,...,...,...
10255,298.044362,0.133405,-0.514691,2024,12,Democratic Republic of the Congo
10256,301.261619,0.000119,-0.530106,2024,12,Central African Republic
10257,299.504880,0.052297,-0.322775,2024,12,Antigua and Barbuda
10258,296.698045,0.044134,-0.303525,2024,12,Bahamas (the)


In [8]:
#Merge IDPs with climate data
total_idps_per_month = pd.merge(total_idps_per_month, climate_data, how='left', 
                                on=['country_name', 'year', 'month'])

In [9]:
# Step 3: Create a list of countries to filter by
target_countries = total_idps_per_month['country_name'].unique().tolist()

# Step 4: Filter for only those countries
emdat_cc_expanded = emdat_cc_expanded[emdat_cc_expanded['Country'].isin(target_countries)]

# Step 5: fill NA for end year and assign it to the start month or year so it is joinable with the climate data
climate_catastrophes_data_subset = emdat_cc_expanded[['Country', 'Start Year', 'Start Month', 'End Year', 'End Month',  'Disaster Type','Total Affected']]
climate_catastrophes_data_subset["End Year"] = climate_catastrophes_data_subset["End Year"].fillna(climate_catastrophes_data_subset["Start Year"])
climate_catastrophes_data_subset["End Month"] = climate_catastrophes_data_subset["End Month"].fillna(climate_catastrophes_data_subset["Start Month"])

# Step 6: create a month/year column that is joinable

expanded_rows = []
for _, row in climate_catastrophes_data_subset.iterrows():
    for year in range(row["Start Year"], row["End Year"] + 1):  # Iterate through years
        # Handle potential NaN values in Start Month and End Month and ensure values are integers
        start_month = (
            int(row["Start Month"]) if year == row["Start Year"] and not pd.isna(row["Start Month"]) else 1
        )
        end_month = (
            int(row["End Month"]) if year == row["End Year"] and not pd.isna(row["End Month"]) else 12
        )
        
        # Iterate through months and create the expanded rows
        for m in range(start_month, end_month + 1):
            new_row = row.copy()
            new_row["month"] = m  # Assign the month
            new_row["year"] = year  # Assign the correct year
            expanded_rows.append(new_row)

climate_catastrophes_data = pd.DataFrame(expanded_rows)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  climate_catastrophes_data_subset["End Year"] = climate_catastrophes_data_subset["End Year"].fillna(climate_catastrophes_data_subset["Start Year"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  climate_catastrophes_data_subset["End Month"] = climate_catastrophes_data_subset["End Month"].fillna(climate_catastrophes_data_subset["Start Month"])


In [10]:
# Merge IDP, climate data and climate catastrophes data
merged_df = pd.merge(
    total_idps_per_month,
    climate_catastrophes_data,  
    left_on=['year', 'month', 'country_name'],
    right_on=['year', 'month', 'Country'],
    how='left'          # Keep all climate data rows, even without IDP data
)

#drop 'Country' column
merged_df.drop(columns=['Country'], inplace=True)

In [11]:
# Create column 'climate_catastrophe' to indicate if a climate catastrophe occurred

merged_df['climate_catastrophe'] = merged_df['Disaster Type'].notnull().astype(int)
merged_df['climate_catastrophe'].value_counts()

climate_catastrophe
0    589
1    555
Name: count, dtype: int64

In [12]:
merged_df=pd.merge(merged_df,
    inflation_data,
    how='left')

In [13]:
merged_df = pd.merge(merged_df,
    country_indicators,
    how='left',
    left_on=['country_code', 'year'],
    right_on=['Country Code', 'year']
)

In [14]:
merged_df = merged_df[(merged_df['year'] < 2024) & (merged_df['year'] > 2010)]

In [15]:
merged_df = standardize_headers(merged_df)

In [16]:
na_counts = merged_df.isna().sum()
print(na_counts)

year                              0
month                             0
country_name                      0
country_code                      0
internally_displaced_persons      0
temperature_2m                    5
total_precipitation_sum           5
potential_evaporation_sum         5
start_year                      536
start_month                     606
end_year                        536
end_month                       543
disaster_type                   536
total_affected                  564
climate_catastrophe               0
cpi_value                       194
country_code                    143
ag.lnd.frst.k2                  232
ag.lnd.prcp.mm                  293
ag.lnd.totl.k2                  209
ag.srf.totl.k2                  203
eg.cft.accs.ru.zs               247
eg.cft.accs.ur.zs               247
eg.cft.accs.zs                  247
eg.egy.prim.pp.kd               268
eg.elc.accs.zs                  203
eg.fec.rnew.zs                  266
en.ghg.all.mt.ce.ar5        

In [17]:
# Count NA's per column grouped by country
na_counts_by_country = merged_df.groupby('country_name').apply(lambda x: x.isna().sum())

  na_counts_by_country = merged_df.groupby('country_name').apply(lambda x: x.isna().sum())


In [18]:
# select numeric columns
numeric_columns = merged_df.select_dtypes(include=['number']).columns
numeric_columns = [col for col in numeric_columns if col not in ['year','month', 'internally_displaced_persons','start_month', 'end_year', 'end_month', 'total_affected', 'climate_catastrophe', 'sp.pop.totl']]
cols_to_impute = merged_df[numeric_columns]

In [19]:
cols_to_impute

Unnamed: 0,temperature_2m,total_precipitation_sum,potential_evaporation_sum,start_year,cpi_value,ag.lnd.frst.k2,ag.lnd.prcp.mm,ag.lnd.totl.k2,ag.srf.totl.k2,eg.cft.accs.ru.zs,...,en.ghg.n2o.ip.mt.ce.ar5,en.ghg.n2o.mt.ce.ar5,en.ghg.n2o.pi.mt.ce.ar5,en.ghg.n2o.tr.mt.ce.ar5,en.ghg.n2o.wa.mt.ce.ar5,er.fsh.aqua.mt,er.fsh.capt.mt,er.fsh.prod.mt,er.h2o.fwtl.k3,er.h2o.intr.k3
3,295.712678,0.013111,-0.212113,2010.0,106.332795,3752.74,1440.0,27560.0,27750.0,1.0,...,1.6150,1.3652,17.000,141.000,792.0000,600.0,16530.0,17130.0,1.45,13.007
4,296.814473,0.000284,-0.290418,,118.096617,,,,2505810.0,25.9,...,2.7562,261.2350,32.000,1.029,5.1970,2000.0,71008.0,73008.0,,
5,295.952629,0.022040,-0.301257,2010.0,106.332795,3752.74,1440.0,27560.0,27750.0,1.0,...,1.6150,1.3652,17.000,141.000,792.0000,600.0,16530.0,17130.0,1.45,13.007
6,297.774782,0.000533,-0.369376,,118.096617,,,,2505810.0,25.9,...,2.7562,261.2350,32.000,1.029,5.1970,2000.0,71008.0,73008.0,,
7,297.503557,0.154594,-0.253921,2010.0,106.332795,3752.74,1440.0,27560.0,27750.0,1.0,...,1.6150,1.3652,17.000,141.000,792.0000,600.0,16530.0,17130.0,1.45,13.007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1058,300.285488,0.149403,-0.378732,2023.0,237.222314,,,,,,...,1.9595,4.3774,37.000,479.000,2.6580,,,,,
1059,299.092440,0.001979,-0.482268,,524.905365,,,,,,...,3.7712,361.5830,64.000,5.872,2.1644,,,,,
1060,295.735278,0.000683,-0.261452,,,,,,,,...,2.1937,291.2340,88.000,152.000,8.0520,,,,,
1061,296.770002,0.097072,-0.224189,,209.931348,,,,,,...,6.2370,9.7108,48.000,492.000,3.9320,,,,,


In [20]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=100, random_state=0)
merged_df_imputed = pd.DataFrame(imp.fit_transform(cols_to_impute), columns=numeric_columns)

In [21]:
merged_df[numeric_columns] = merged_df_imputed

In [22]:
na_counts = merged_df.isna().sum()
print(na_counts)

year                              0
month                             0
country_name                      0
country_code                      0
internally_displaced_persons      0
temperature_2m                    3
total_precipitation_sum           3
potential_evaporation_sum         3
start_year                        3
start_month                     606
end_year                        536
end_month                       543
disaster_type                   536
total_affected                  564
climate_catastrophe               0
cpi_value                         3
country_code                    143
ag.lnd.frst.k2                    3
ag.lnd.prcp.mm                    3
ag.lnd.totl.k2                    3
ag.srf.totl.k2                    3
eg.cft.accs.ru.zs                 3
eg.cft.accs.ur.zs                 3
eg.cft.accs.zs                    3
eg.egy.prim.pp.kd                 3
eg.elc.accs.zs                    3
eg.fec.rnew.zs                    3
en.ghg.all.mt.ce.ar5        

In [23]:
# save it to an csv file
merged_df.to_csv('data/merged_climate_iom_data.csv', index=False)