In [1]:
import pandas as pd

In [2]:
# Load every raw input used by this notebook. Paths are relative to the
# notebook's working directory and use forward slashes throughout: a
# backslash-separated path only resolves on Windows, while forward slashes
# resolve on Windows, macOS and Linux alike.
country_indicators_raw = pd.read_csv("data/environment_pop_data_all.csv")
inflation_data = pd.read_csv("data/economic/inflation_all_countries_sorted_cleaned.csv")
migration_data = pd.read_csv("data/migration/full_iom_dtm_data.csv")
climate_data = pd.read_csv("data/climate_data/ERA5_Monthly_Climate_Weather_FULL.csv")
emdat_cc_expanded = pd.read_excel("data/climate_catastrophes/emdat_full.xlsx")

In [3]:
def standardize_headers(df, func=None):
    """
    Return a copy of `df` whose column labels are lower-case with spaces replaced by underscores.

    Parameters:
    df (pd.DataFrame): The input DataFrame. It is not modified.
    func (callable, optional): If given, it is applied to the renamed frame
        with `DataFrame.apply` (once per column) and that result is returned.

    Returns:
    pd.DataFrame: The frame with standardized headers (and `func` applied, if any).
    """
    # Work on a copy: assigning to `df.columns` on the argument itself would
    # silently rename the caller's frame as a side effect.
    df = df.copy()
    df.columns = df.columns.str.replace(' ', '_').str.lower()
    if func:
        df = df.apply(func)
    return df

def remove_high_na_columns(df, threshold=0.3):
    """
    Drop every column whose share of empty cells exceeds `threshold`.

    A cell counts as empty when it is NA or holds the value 0.

    Parameters:
    df (pd.DataFrame): The frame to filter.
    threshold (float): Largest tolerated share of empty cells. A column whose
        share is exactly equal to the threshold is kept.

    Returns:
    pd.DataFrame: The columns of `df` that pass the check.
    """
    # Fold zeros into NA first so a single isna() pass counts both kinds.
    empty_share = df.replace(0, pd.NA).isna().mean()

    # Boolean mask over the columns: True where the column is dense enough.
    keep_column = empty_share.le(threshold)
    return df.loc[:, keep_column]

In [4]:
# Reshape the wide indicator table into one row per country/year with one
# column per indicator code.
country_indicators_raw = standardize_headers(country_indicators_raw)

# Every non-id column is melted into `year`/`value`. Column labels that are
# not numeric become NaN in `year` and therefore fail the year filter below.
country_indicators_long = (
    pd.melt(
        country_indicators_raw,
        id_vars=['country_code', 'indicator_name', 'indicator_code'],
        var_name='year',
    )
    .assign(
        year=lambda frame: pd.to_numeric(frame['year'], errors='coerce'),
        value=lambda frame: pd.to_numeric(frame['value'], errors='coerce'),
    )
    .loc[lambda frame: frame['year'] > 2010]  # keep 2011 onwards
    # Dropped as part of the chain (new frame) rather than in place: an
    # in-place drop on a filtered slice raises SettingWithCopyWarning.
    .drop(columns=['indicator_name'])
)

# One row per (country_code, year); pivot_table averages duplicate entries.
country_indicators = (
    country_indicators_long
    .pivot_table(index=['country_code', 'year'], columns='indicator_code', values='value')
    .reset_index()
)

# Discard indicators that are mostly missing or zero.
country_indicators = remove_high_na_columns(country_indicators)

In [5]:
# Derive calendar fields from the reporting date so records can be grouped by month.
# NOTE(review): this cell overwrites `migration_data` with a frame that no
# longer has 'reportingDate', so re-running it without reloading the CSV fails.
migration_data['date'] = pd.to_datetime(migration_data['reportingDate'])
migration_data['month'] = migration_data['date'].dt.month
migration_data['year'] = migration_data['date'].dt.year

# Source column -> descriptive name, for the columns we want to keep
column_mapping = {
    'admin0Name': 'country_name',  # Keep country name
    'admin0Pcode': 'country_code', # In case you need to map country codes to names
    'numPresentIdpInd': 'internally_displaced_persons',
    'date': 'date',
    'year': 'year',
    'month': 'month'
}

# List of columns to keep (everything else will be dropped)
columns_to_keep = list(column_mapping.keys())

# Select and rename in one chained expression. Renaming a column subset with
# inplace=True operates on a slice and raises SettingWithCopyWarning.
migration_data = migration_data[columns_to_keep].rename(columns=column_mapping)

# Sum the IDP counts of all records that share a country and calendar month.
# NOTE(review): presumably each record is a per-location snapshot; if several
# reporting rounds fall into the same month they are all added together,
# which would overstate the monthly total — verify against the DTM schema.
total_idps_per_month = (
    migration_data
    .groupby(['year', 'month', 'country_name', 'country_code'])['internally_displaced_persons']
    .sum()
    .reset_index()
)

In [None]:
# Normalise the headers, then derive numeric year/month columns from `date`.
climate_data = standardize_headers(climate_data)

# `date` is a dash-separated string: first field is the year, second the month.
date_parts = climate_data['date'].str.split('-')
climate_data['year'] = date_parts.str[0].astype(int)
climate_data['month'] = date_parts.str[1].astype(int)

# The raw date columns are no longer needed once year/month exist.
climate_data = climate_data.drop(columns=['date', 'system:time_start'])

# Country names as spelled in the climate table -> the spelling used for
# `country_name`. Most names are identical on both sides; the explicit
# entries after the comprehension list the ones that differ.
country_mapping = {
    **{
        name: name
        for name in [
            'Chad', 'Malawi', 'Zambia', 'Zimbabwe', 'Burundi', 'South Africa',
            'Niger', 'Sudan', 'Libya', 'Ethiopia', 'Djibouti', 'Somalia',
            'South Sudan', 'Kenya', 'Uganda', 'Mozambique', 'Madagascar',
            'Mali', 'Burkina Faso', 'Benin', 'Nigeria', 'Cameroon', 'Ukraine',
            'Mongolia', 'Afghanistan', 'Pakistan', 'Nepal',
            'Sri Lanka',  # or 'Sri lanka'
            'Papua New Guinea', 'Fiji', 'Vanuatu', 'Indonesia', 'Iraq',
            'Lebanon', 'Yemen', 'Dominica', 'Grenada', 'Haiti', 'Ecuador',
            'Peru', 'El Salvador', 'Guatemala', 'Honduras',
            'Democratic Republic of the Congo', 'Central African Republic',
            'Antigua and Barbuda', 'Saint Vincent and the Grenadines',
        ]
    },
    'Philippines': 'Philippines (the)',
    'Laos': "Lao People's Democratic Republic",
    'Armenia': 'Republic of Armenia',
    'Syria': 'Syrian Arab Republic',
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Bahamas': 'Bahamas (the)',
}

# Replace `country_na` with the harmonised `country_name`. Names that are
# missing from the mapping become NaN.
climate_data = (
    climate_data
    .assign(country_name=climate_data['country_na'].map(country_mapping))
    .drop(columns=['country_na'])
)

KeyError: "['CPI'] not found in axis"

In [7]:
# Display climate_data to inspect the result of the preceding transformations.
# NOTE(review): the recorded output still lists `country_na`, which the
# previous cell drops — the output presumably predates that drop; re-run to refresh.
climate_data

Unnamed: 0,country_na,temperature_2m,total_precipitation_sum,potential_evaporation_sum,year,month,country_name
0,Chad,295.270399,0.000029,-0.274852,2010,1,Chad
1,Malawi,296.915585,0.248372,-0.210955,2010,1,Malawi
2,Zambia,295.522374,0.237523,-0.234153,2010,1,Zambia
3,Zimbabwe,297.442308,0.137571,-0.270994,2010,1,Zimbabwe
4,Burundi,293.489225,0.242564,-0.201473,2010,1,Burundi
...,...,...,...,...,...,...,...
10255,Democratic Republic of the Congo,298.044362,0.133405,-0.514691,2024,12,Democratic Republic of the Congo
10256,Central African Republic,301.261619,0.000119,-0.530106,2024,12,Central African Republic
10257,Antigua and Barbuda,299.504880,0.052297,-0.322775,2024,12,Antigua and Barbuda
10258,Bahamas,296.698045,0.044134,-0.303525,2024,12,Bahamas (the)


In [None]:
# Attach the monthly climate measurements to each IDP row. The left join
# keeps every IDP row; rows without a climate match get NaN climate values.
total_idps_per_month = total_idps_per_month.merge(
    climate_data,
    on=['country_name', 'year', 'month'],
    how='left',
)

In [None]:
# Step 3: Countries that appear in the IDP table
target_countries = total_idps_per_month['country_name'].unique().tolist()

# Step 4: Keep only the catastrophe records of those countries
emdat_cc_expanded = emdat_cc_expanded[emdat_cc_expanded['Country'].isin(target_countries)]

# Step 5: Select the relevant columns and fill a missing end year/month with
# the start year/month so every event has a usable end date.
# `.copy()` makes the subset an independent frame; assigning columns on a
# bare slice raises SettingWithCopyWarning.
catastrophe_columns = ['Country', 'Start Year', 'Start Month', 'End Year', 'End Month', 'Disaster Type', 'Total Affected']
climate_catastrophes_data_subset = emdat_cc_expanded[catastrophe_columns].copy()
climate_catastrophes_data_subset["End Year"] = climate_catastrophes_data_subset["End Year"].fillna(climate_catastrophes_data_subset["Start Year"])
climate_catastrophes_data_subset["End Month"] = climate_catastrophes_data_subset["End Month"].fillna(climate_catastrophes_data_subset["Start Month"])

# Step 6: Expand each event into one row per (year, month) it covers, so it
# can be joined on year/month.
expanded_rows = []
for event in climate_catastrophes_data_subset.to_dict('records'):
    if pd.isna(event["Start Year"]):
        # Without a start year (and hence no filled end year) the event
        # cannot be placed on the calendar.
        continue

    # range() needs real integers; the year columns are float whenever the
    # source column contained missing values.
    first_year = int(event["Start Year"])
    last_year = int(event["End Year"])

    for year in range(first_year, last_year + 1):
        # The first year starts at the event's start month and the last year
        # ends at its end month; a missing month or any in-between year
        # falls back to the full January-December span.
        start_month = (
            int(event["Start Month"]) if year == first_year and not pd.isna(event["Start Month"]) else 1
        )
        end_month = (
            int(event["End Month"]) if year == last_year and not pd.isna(event["End Month"]) else 12
        )

        for m in range(start_month, end_month + 1):
            expanded_rows.append({**event, "month": m, "year": year})

# Passing `columns` keeps the expected schema even when no event was expanded,
# so a later join on 'Country'/'year'/'month' still finds its key columns.
climate_catastrophes_data = pd.DataFrame(expanded_rows, columns=catastrophe_columns + ["month", "year"])

In [None]:
# Attach the expanded catastrophe events to the IDP/climate table. The left
# join keeps every IDP row; months without an event get NaN event columns.
# NOTE(review): a country-month with several events produces one output row
# per event, so its IDP and climate values are repeated on each of them.
merged_df = total_idps_per_month.merge(
    climate_catastrophes_data,
    how='left',
    left_on=['year', 'month', 'country_name'],
    right_on=['year', 'month', 'Country'],
).drop(columns=['Country'])  # right-hand join key, redundant next to country_name

In [15]:
# Flag rows that matched a catastrophe event: 1 where 'Disaster Type' is
# present, 0 where it is missing. The last line shows how many rows fall in each class.
merged_df = merged_df.assign(
    climate_catastrophe=merged_df['Disaster Type'].notna().astype(int)
)
merged_df['climate_catastrophe'].value_counts()

climate_catastrophe
0    589
1    555
Name: count, dtype: int64

In [16]:
# Number of missing values in each column of the merged table
na_counts = merged_df.isna().sum(axis=0)
print(na_counts)

year                              0
month                             0
country_name                      0
country_code                      0
internally_displaced_persons      0
temperature_2m                   11
total_precipitation_sum          11
potential_evaporation_sum        11
Start Year                      589
Start Month                     659
End Year                        589
End Month                       596
Disaster Type                   589
CPI                             617
Total Affected                  618
climate_catastrophe               0
dtype: int64


In [33]:
# Preview a left join of the monthly inflation figures onto the merged table.
# NOTE(review): the result is only displayed, never assigned, so `merged_df`
# itself gains no inflation column. In the rows of the recorded output every
# `country`/`inflation` value is empty, i.e. no key matched there — presumably
# the country spellings or the year/month dtypes differ; verify before use.
merged_df.merge(
    inflation_data[['country', 'year', 'month', 'inflation']],
    left_on=['country_name', 'year', 'month'],
    right_on=['country', 'year', 'month'],
    how='left',
)

Unnamed: 0,year,month,country_name,country_code,internally_displaced_persons,temperature_2m,total_precipitation_sum,potential_evaporation_sum,Start Year,Start Month,End Year,End Month,Disaster Type,CPI,Total Affected,climate_catastrophe,country,inflation
0,2010,6,Sudan,SDN,30933,305.414844,0.017494,-0.314517,,,,,,,,0,,
1,2010,11,Haiti,HTI,2137764,295.938353,0.111989,-0.212747,2010.0,10.0,2011.0,12.0,Epidemic,71.563596,513997.0,1,,
2,2010,11,Haiti,HTI,2137764,295.938353,0.111989,-0.212747,2010.0,11.0,2010.0,11.0,Storm,71.563596,5020.0,1,,
3,2011,1,Haiti,HTI,1612754,295.712678,0.013111,-0.212113,2010.0,10.0,2011.0,12.0,Epidemic,71.563596,513997.0,1,,
4,2011,2,Sudan,SDN,98298,296.814473,0.000284,-0.290418,,,,,,,,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139,2025,1,Lebanon,LBN,1333317,,,,,,,,,,,0,,
1140,2025,1,Sudan,SDN,69461010,,,,2024.0,8.0,2025.0,3.0,Epidemic,,57447.0,1,,
1141,2025,2,Lebanon,LBN,1236267,,,,,,,,,,,0,,
1142,2025,2,Syrian Arab Republic,SYR,20983938,,,,,,,,,,,0,,


In [20]:
# Write the merged table to a CSV file, without the row index
merged_df.to_csv(path_or_buf='data/merged_climate_iom_data.csv', index=False)