In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import pearsonr, f_oneway, ttest_ind

In [32]:
# Load the three raw CSV inputs. Judging by the file names these are the
# World Bank PM2.5 indicator, an IHME GBD mortality export, and the World Bank
# per-country metadata that accompanies the indicator.
PM25_CSV = "WorldBank/API_EN.ATM.PM25.MC.M3_DS2_en_csv_v2_6084.csv"
MORTALITY_CSV = "mortality data/IHME-GBD_2023_DATA-fac2f874-1.csv"
METADATA_CSV = "WorldBank/Metadata_Country_API_EN.ATM.PM25.MC.M3_DS2_en_csv_v2_6084.csv"

# The first 4 lines of the PM2.5 file are skipped before the header is parsed
# (presumably a preamble in the export -- confirm against the raw file).
pm25_data = pd.read_csv(PM25_CSV, skiprows=4)
mortality_data = pd.read_csv(MORTALITY_CSV)
metadata = pd.read_csv(METADATA_CSV)

In [None]:
# Clean the PM2.5 table: keep the country identifiers plus the per-year
# columns, reshape from wide to long, and take the 2020 slice.
id_cols = ['Country Name', 'Country Code']

# Year columns are those whose header consists only of digits; any other
# column (besides the two identifiers) is left out.
year_cols = [name for name in pm25_data.columns if name.isdigit()]
pm25_clean = pm25_data[id_cols + year_cols]

# One row per (country, year); the year labels start out as strings, so cast
# them to int to allow numeric comparison.
pm25_long = (
    pm25_clean
    .melt(id_vars=id_cols, var_name='Year', value_name='PM25')
    .astype({'Year': int})
)

# Only 2020 rows, dropping any row that has a missing value in any column.
pm25_2020 = pm25_long.loc[lambda frame: frame['Year'] == 2020].dropna()

In [28]:
# Clean the mortality table: keep only all-cause death rates for both sexes
# and all ages, then rename the columns used by the later merge.
# NOTE(review): no year filter is applied here although the PM2.5 side is
# restricted to 2020 -- confirm the export contains a single, matching year.
# NOTE(review): 'All ages' looks like a crude (not age-standardised) rate --
# confirm that is the intended measure for cross-country comparison.
is_deaths = mortality_data['measure'].eq('Deaths')
is_both_sexes = mortality_data['sex'].eq('Both')
is_all_ages = mortality_data['age'].eq('All ages')
is_all_causes = mortality_data['cause'].eq('All causes')
is_rate = mortality_data['metric'].eq('Rate')

selected_rows = is_deaths & is_both_sexes & is_all_ages & is_all_causes & is_rate

# Copy so the filtered frame is independent of the raw one.
mortality_clean = mortality_data.loc[selected_rows].copy()
mortality_clean = mortality_clean.rename(
    columns={'location': 'Country Name', 'val': 'Mortality_Rate'}
)


In [33]:
# Clean the metadata: keep the country code and income group, and drop rows
# where either is missing (presumably aggregates that have no income group --
# confirm against the metadata file).
metadata_clean = metadata.loc[:, ['Country Code', 'IncomeGroup']].dropna()

In [34]:
# Combine the three cleaned tables into one analysis frame.
# PM2.5 and metadata are joined on the country code; mortality is joined on
# the country name. Both joins are inner joins (the pandas default), so rows
# without a match on either side are discarded.
# NOTE(review): the name-based join drops any country whose name is spelled
# differently in the two sources -- verify which countries are lost.
mortality_cols = mortality_clean[['Country Name', 'Mortality_Rate']]
df = (
    pm25_2020
    .merge(metadata_clean, on='Country Code')
    .merge(mortality_cols, on='Country Name')
    .dropna()
)

# Reports the number of rows in the merged frame.
print(f"Final dataset: {len(df)} countries")

Final dataset: 172 countries


In [35]:
# Exploratory data analysis: summary statistics (count, mean, std, quartiles,
# min/max) for the two variables of interest.
summary_stats = df[['PM25', 'Mortality_Rate']].describe()
print(summary_stats)


             PM25  Mortality_Rate
count  172.000000      172.000000
mean    23.342152      788.792872
std     15.335680      267.191443
min      4.895181      103.608219
25%     11.880621      622.770200
50%     19.649015      745.218012
75%     30.111063      968.122204
max     85.122346     1471.835407


In [36]:
# Mean PM2.5 and mean mortality rate within each income group.
print("\n=== BY INCOME GROUP ===")
income_group_means = df.groupby('IncomeGroup')[['PM25', 'Mortality_Rate']].mean()
print(income_group_means)


=== BY INCOME GROUP ===
                          PM25  Mortality_Rate
IncomeGroup                                   
High income          15.477283      869.494132
Low income           40.120445      798.494714
Lower middle income  29.919009      661.875990
Upper middle income  21.056104      787.238204
