In [57]:
import pandas as pd

# Read the five processed source tables, one DataFrame per data provider.
oecd = pd.read_csv("../data/processed/oecd_years.csv")
cpi = pd.read_csv("../data/processed/CPI.csv")
gdp = pd.read_csv("../data/processed/gdp.csv")
happiness = pd.read_csv("../data/processed/happiness.csv")
world_bank = pd.read_csv("../data/processed/world_bank_years.csv")

# Optional sanity check: print the leading rows of every table under its heading.
for heading, frame in (
    ("OECD DataFrame:", oecd),
    ("\nCPI DataFrame:", cpi),
    ("\nGDP DataFrame:", gdp),
    ("\nHappiness DataFrame:", happiness),
    ("\nWorld Bank DataFrame:", world_bank),
):
    print(heading)
    print(frame.head())

# Bring the key columns to the shared names 'country' and 'year'.
# Only the columns listed in each mapping are renamed; a table with no entry
# for one of the keys keeps whatever column name it already has for it.
oecd = oecd.rename(columns={'cname': 'country'})
happiness = happiness.rename(columns={'Country Name': 'country', 'Year': 'year'})
gdp = gdp.rename(columns={'CountryName': 'country'})
cpi = cpi.rename(columns={'Economy Name': 'country', 'Year': 'year'})
world_bank = world_bank.rename(columns={'date': 'year'})




OECD DataFrame:
     cname    year Civil Justice is Free of Corruption  \
0  Albania  2003.0                                 NaN   
1  Albania  2004.0                                 NaN   
2  Albania  2005.0                                 NaN   
3  Albania  2006.0                                 NaN   
4  Albania  2007.0                                 NaN   

  Criminal System is Free of Corruption  \
0                                   NaN   
1                                   NaN   
2                                   NaN   
3                                   NaN   
4                                   NaN   

  Corruption Commission Present in Constitution Civil Rights  \
0                                          2,00          NaN   
1                                          2,00          NaN   
2                                          2,00         8,00   
3                                          2,00          NaN   
4                                          2,00         

In [59]:
# Placeholder written wherever a year value is missing.
# NOTE(review): every missing year becomes the same value, so any later join
# on 'year' will treat all of them as one and the same year — confirm that
# this is intended.
MISSING_YEAR = 1900

# The source tables, in the order used for the 1-based progress messages below.
dataframes = [oecd, cpi, gdp, happiness, world_bank]

# Coerce each 'year' column to whole-number integers, filling gaps with the
# placeholder. pd.to_numeric parses numeric text such as "2003.0" and raises
# on non-numeric text; astype('int64') then drops the fractional part.
for position, df in enumerate(dataframes, start=1):
    if 'year' not in df.columns:
        # Nothing to convert for a table without a 'year' column.
        continue
    df['year'] = pd.to_numeric(df['year']).fillna(MISSING_YEAR).astype('int64')
    print(f"Updated 'year' column in DataFrame {position}")

# Verify changes (optional). The same column guard as above is applied so a
# table without 'year' is reported instead of raising KeyError.
for position, df in enumerate(dataframes, start=1):
    print(f"DataFrame {position} sample:")
    if 'year' in df.columns:
        print(df[['year']].head())
    else:
        print("(no 'year' column)")


Updated 'year' column in DataFrame 1
Updated 'year' column in DataFrame 2
Updated 'year' column in DataFrame 3
Updated 'year' column in DataFrame 4
Updated 'year' column in DataFrame 5
DataFrame 1 sample:
   year
0  2003
1  2004
2  2005
3  2006
4  2007
DataFrame 2 sample:
   year
0  2012
1  2013
2  2014
3  2015
4  2016
DataFrame 3 sample:
   year
0  2005
1  2006
2  2007
3  2008
4  2009
DataFrame 4 sample:
   year
0  2007
1  2009
2  2010
3  2011
4  2012
DataFrame 5 sample:
   year
0  2005
1  2006
2  2007
3  2008
4  2009


In [61]:


# Re-normalise the OECD years: present values go through float -> int,
# missing values are mapped to None.
oecd['year'] = oecd['year'].apply(
    lambda value: None if pd.isna(value) else int(float(value))
)

# Cast every table's 'year' to text so all five join keys share one dtype.
for frame in (oecd, happiness, cpi, gdp, world_bank):
    frame['year'] = frame['year'].astype(str)

# Outer-join the tables one after another on (country, year), starting from
# the OECD table, so rows present in any source are kept.
join_keys = ['country', 'year']
merged_data = oecd
for other in (cpi, gdp, happiness, world_bank):
    merged_data = merged_data.merge(other, on=join_keys, how='outer')

# Preview the first rows of the combined table as the cell output.
merged_data.head()


Unnamed: 0,country,year,Civil Justice is Free of Corruption,Criminal System is Free of Corruption,Corruption Commission Present in Constitution,Civil Rights,Anti Corruption Policy,The Bayesian Corruption Indicator,Economy ISO3,Corruption Perceptions Index Rank,...,government_expenditure_on_education%,government_health_expenditure%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,intentional_homicides
0,Albania,2003,,,200,,,5832,,,...,,,,,,,,,,
1,Albania,2004,,,200,,,5787,,,...,,,,,,,,,,
2,Albania,2005,,,200,800.0,400.0,5798,,,...,3.28155,2.665318,30.6,13.235,6.306,76.621,3011487.0,1604189.0,0.003672,5.078094
3,Albania,2006,,,200,,,5775,,,...,3.19146,2.559311,,12.539,6.477,76.816,2992547.0,1561661.0,0.07589,3.163096
4,Albania,2007,,,200,800.0,400.0,5744,,,...,3.27593,2.435714,,12.07,6.318,77.549,2970017.0,1517619.0,0.113008,3.528127


In [67]:

# Write the merged table to the processed-data folder without the row index.
merged_data.to_csv(
    path_or_buf="../data/processed/merged_.csv",
    index=False,
    index_label=False,
)



In [70]:
# Bare expression on the cell's last line: the notebook renders merged_data as the cell output.
merged_data

Unnamed: 0,country,year,Civil Justice is Free of Corruption,Criminal System is Free of Corruption,Corruption Commission Present in Constitution,Civil Rights,Anti Corruption Policy,The Bayesian Corruption Indicator,Economy ISO3,Corruption Perceptions Index Rank,...,government_expenditure_on_education%,government_health_expenditure%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,intentional_homicides
0,Albania,2003,,,200,,,5832,,,...,,,,,,,,,,
1,Albania,2004,,,200,,,5787,,,...,,,,,,,,,,
2,Albania,2005,,,200,800,400,5798,,,...,3.281550,2.665318,30.6,13.235,6.306,76.62100,3011487.0,1604189.0,0.003672,5.078094
3,Albania,2006,,,200,,,5775,,,...,3.191460,2.559311,,12.539,6.477,76.81600,2992547.0,1561661.0,0.075890,3.163096
4,Albania,2007,,,200,800,400,5744,,,...,3.275930,2.435714,,12.070,6.318,77.54900,2970017.0,1517619.0,0.113008,3.528127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,United Kingdom,2020,,,,,,,GBR,11.0,...,5.497700,10.024661,32.6,10.100,10.100,80.35122,67081234.0,10798066.0,1.231142,1.003587
1024,United Kingdom,2021,,,,,,,GBR,11.0,...,5.327368,,32.4,10.100,9.700,80.70000,67026292.0,10622327.0,1.264685,
1025,United Kingdom,2022,,,,,,,GBR,18.0,...,,,,,,,66971395.0,10448877.0,1.232193,
1026,United Kingdom,2023,,,,,,,GBR,20.0,...,,,,,,,,,,
