In [2]:
import pandas as pd

# Read the five processed source tables; the targets on the left are bound
# in the same order as the paths listed on the right.
oecd, cpi, gdp, happiness, world_bank = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/oecd_years.csv",
        "../data/processed/CPI.csv",
        "../data/processed/gdp.csv",
        "../data/processed/happiness.csv",
        "../data/processed/world_bank_years.csv",
    )
]

# Optional inspection: print a heading followed by the first rows of each table.
previews = (
    ("OECD DataFrame:", oecd),
    ("\nCPI DataFrame:", cpi),
    ("\nGDP DataFrame:", gdp),
    ("\nHappiness DataFrame:", happiness),
    ("\nWorld Bank DataFrame:", world_bank),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())

# Give every table the same key column names ('country' and 'year').
# Each frame is renamed in place; names missing from a mapping are left as-is.
# NOTE(review): world_bank only has 'date' renamed, so it presumably already
# has a 'country' column -- confirm against the file.
key_column_renames = (
    (oecd, {'cname': 'country'}),
    (happiness, {'Country Name': 'country', 'Year': 'year'}),
    (gdp, {'CountryName': 'country'}),
    (cpi, {'Economy Name': 'country', 'Year': 'year'}),
    (world_bank, {'date': 'year'}),
)
for frame, renames in key_column_renames:
    frame.rename(columns=renames, inplace=True)




OECD DataFrame:
     cname    year Civil Justice is Free of Corruption  \
0  Albania  2003.0                                 NaN   
1  Albania  2004.0                                 NaN   
2  Albania  2005.0                                 NaN   
3  Albania  2006.0                                 NaN   
4  Albania  2007.0                                 NaN   

  Criminal System is Free of Corruption  \
0                                   NaN   
1                                   NaN   
2                                   NaN   
3                                   NaN   
4                                   NaN   

  Corruption Commission Present in Constitution Civil Rights  \
0                                          2,00          NaN   
1                                          2,00          NaN   
2                                          2,00         8,00   
3                                          2,00          NaN   
4                                          2,00         

In [3]:
# All source tables whose 'year' column must be normalised.
dataframes = [oecd, cpi, gdp, happiness, world_bank]

# Placeholder written into rows that have no year. It keeps the column a plain
# integer dtype; 1900 lies outside the 2012-2022 window that is kept later in
# this notebook, so such rows do not survive that filter.
MISSING_YEAR_PLACEHOLDER = 1900

# Coerce each 'year' column to whole-number integers in place.
# Vectorised equivalent of int(float(x)) per element: parse to numeric,
# fill the gaps with the placeholder, then truncate to int64.
for i, df in enumerate(dataframes):
    if 'year' in df.columns:  # skip frames without a 'year' column
        df['year'] = (
            pd.to_numeric(df['year'])
            .fillna(MISSING_YEAR_PLACEHOLDER)
            .astype('int64')
        )
        print(f"Updated 'year' column in DataFrame {i + 1}")

# Verify changes (optional): show the first normalised years of every frame.
for i, df in enumerate(dataframes):
    print(f"DataFrame {i + 1} sample:")
    print(df[['year']].head())

Updated 'year' column in DataFrame 1
Updated 'year' column in DataFrame 2
Updated 'year' column in DataFrame 3
Updated 'year' column in DataFrame 4
Updated 'year' column in DataFrame 5
DataFrame 1 sample:
   year
0  2003
1  2004
2  2005
3  2006
4  2007
DataFrame 2 sample:
   year
0  2012
1  2013
2  2014
3  2015
4  2016
DataFrame 3 sample:
   year
0  2005
1  2006
2  2007
3  2008
4  2009
DataFrame 4 sample:
   year
0  2007
1  2009
2  2010
3  2011
4  2012
DataFrame 5 sample:
   year
0  2005
1  2006
2  2007
3  2008
4  2009


In [4]:


def _year_or_none(raw):
    """Return the year as an int, or None when the value is missing."""
    if pd.isna(raw):
        return None
    return int(float(raw))


# NOTE(review): if the previous cell has already converted every 'year' to
# int, this pass changes nothing. If it ever did produce None, the cast below
# would turn it into the literal key 'None' -- confirm whether it is needed.
oecd['year'] = oecd['year'].apply(_year_or_none)

# Cast the year key to str in every table so all merge keys share one dtype.
for frame in (oecd, happiness, cpi, gdp, world_bank):
    frame['year'] = frame['year'].astype(str)

# Outer-join the tables one after another on ('country', 'year'), starting
# from the OECD table, so no row of any source is dropped.
merged_data = oecd
for other in (cpi, gdp, happiness, world_bank):
    merged_data = merged_data.merge(other, on=['country', 'year'], how='outer')

# Show the first rows of the merged table.
merged_data.head()

Unnamed: 0,country,year,Civil Justice is Free of Corruption,Criminal System is Free of Corruption,Corruption Commission Present in Constitution,Civil Rights,Anti Corruption Policy,The Bayesian Corruption Indicator,Economy ISO3,Corruption Perceptions Index Rank,...,government_expenditure_on_education%,government_health_expenditure%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,intentional_homicides
0,Albania,2003,,,200,,,5832,,,...,,,,,,,,,,
1,Albania,2004,,,200,,,5787,,,...,,,,,,,,,,
2,Albania,2005,,,200,800.0,400.0,5798,,,...,3.28155,2.665318,30.6,13.235,6.306,76.621,3011487.0,1604189.0,0.003672,5.078094
3,Albania,2006,,,200,,,5775,,,...,3.19146,2.559311,,12.539,6.477,76.816,2992547.0,1561661.0,0.07589,3.163096
4,Albania,2007,,,200,800.0,400.0,5744,,,...,3.27593,2.435714,,12.07,6.318,77.549,2970017.0,1517619.0,0.113008,3.528127


In [12]:
# Spot-check a single country: first rows of the merged table for Italy.
italy_rows = merged_data["country"] == "Italy"
merged_data[italy_rows].head()

Unnamed: 0,country,year,Corruption Commission Present in Constitution,The Bayesian Corruption Indicator,Corruption Perceptions Index Rank,Corruption Perceptions Index Score,Corruption Perceptions Index Sources,Corruption Perceptions Index Standard Error,gdp,Regional Indicator,...,government_expenditure_on_education%,government_health_expenditure%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,intentional_homicides
444,Italy,2012,200,3982,72.0,42.0,7.0,2.4,29367.5,Western Europe,...,4.05602,6.664661,35.2,9.0,10.3,82.239024,59539717.0,18645458.0,0.916809,0.880543
445,Italy,2013,200,3998,69.0,43.0,7.0,2.5,29412.8,Western Europe,...,4.14367,6.648142,34.9,8.5,10.0,82.690244,60233948.0,18685173.0,0.954217,0.83233
446,Italy,2014,200,4047,69.0,43.0,7.0,2.26,29381.3,Western Europe,...,4.06128,6.688622,34.7,8.3,9.8,83.090244,60789140.0,18679287.0,0.999287,0.78743
447,Italy,2015,200,4044,61.0,44.0,7.0,2.5,29857.5,Western Europe,...,4.07363,6.592664,35.4,8.0,10.7,82.543902,60730582.0,18483353.0,1.03479,0.778644
448,Italy,2016,200,4006,60.0,47.0,7.0,3.34,32030.9,Western Europe,...,3.81579,6.491736,35.2,7.8,10.1,83.243902,60627498.0,18276159.0,1.034978,0.672005


First, we remove countries that have only NaN values in every column.

Furthermore, countries that appear under more than one name are renamed to a single, consistently used name.

Because the Corruption Perceptions Index is only available from 2012 onwards, we restrict the data to the period 2012–2022.

Furthermore, we will remove some columns which we do not need for analysis.

In [6]:
# Alternative spellings -> the single name used for that country in this notebook.
country_mapping = {
    "Turkiye": "Turkey",
    "Russian Federation": "Russia",
    "Slovak Republic": "Slovakia",
    "Republic of Moldova": "Moldova"
}

# Countries excluded from the analysis (hard-coded list).
# NOTE(review): every target name in country_mapping ("Turkey", "Russia",
# "Slovakia", "Moldova") is also listed here, so the countries that were just
# renamed are dropped entirely -- confirm that this is intended.
countries_to_remove = ["Andorra", "Israel", "Kosovo", "Liechtenstein", "Monaco", "Moldova", "Slovakia", "Russia", "Turkey"]

# Inclusive year window kept for the analysis.
FIRST_YEAR = 2012
LAST_YEAR = 2022

# Columns that are not needed for the analysis.
columns_to_remove = [
    "Unnamed: 0", "Economy ISO3", "Alpha3Code", "control_of_corruption_std",
    "agricultural_land%", "forest_land%", "avg_precipitation", "Civil Rights",
    "Civil Justice is Free of Corruption", "Criminal System is Free of Corruption",
    "central_goverment_debt%", "Positive Affect", "Negative Affect", "Anti Corruption Policy", "Log GDP Per Capita"
]

# Each step below builds a new frame with .assign/.loc instead of assigning a
# column on a filtered slice, which avoids pandas' SettingWithCopyWarning.

# 1. Unify country names.
renamed = merged_data.assign(country=merged_data["country"].replace(country_mapping))

# 2. Drop the excluded countries.
kept = renamed.loc[~renamed["country"].isin(countries_to_remove)]

# 3. Turn the (string) year key back into a number so it can be range-filtered.
kept = kept.assign(year=pd.to_numeric(kept["year"]))

# 4. Keep only rows inside the year window, then drop the unneeded columns.
#    errors='ignore' tolerates listed columns that are absent from the frame.
in_window = (kept["year"] >= FIRST_YEAR) & (kept["year"] <= LAST_YEAR)
merged_data = kept.loc[in_window].drop(columns=columns_to_remove, errors='ignore')

# Save the cleaned table without the row index (index_label has no effect
# once index=False, so it is omitted).
merged_data.to_csv("../data/processed/merged_.csv", index=False)

In [8]:
# Preview the first five rows of the cleaned table.
merged_data.head(n=5)

Unnamed: 0,country,year,Corruption Commission Present in Constitution,The Bayesian Corruption Indicator,Corruption Perceptions Index Rank,Corruption Perceptions Index Score,Corruption Perceptions Index Sources,Corruption Perceptions Index Standard Error,gdp,Regional Indicator,...,government_expenditure_on_education%,government_health_expenditure%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,intentional_homicides
9,Albania,2012,200,5765,113.0,33.0,7.0,2.0,9348.1,Central and Eastern Europe,...,3.309061,2.455187,29.0,12.227,7.573,78.064,2900401.0,1324613.0,0.022064,5.428407
10,Albania,2013,200,5779,116.0,31.0,7.0,2.1,9624.9,Central and Eastern Europe,...,3.5393,2.577751,,12.279,7.819,78.123,2895092.0,1291587.0,0.048893,4.295095
11,Albania,2014,200,5747,110.0,33.0,7.0,1.51,10384.7,Central and Eastern Europe,...,3.216967,2.593627,34.6,12.15,7.868,78.407,2889104.0,1258985.0,0.143777,4.64616
12,Albania,2015,200,5697,88.0,36.0,7.0,3.58,10702.7,Central and Eastern Europe,...,3.43797,2.731354,32.8,11.718,7.947,78.644,2880703.0,1226200.0,0.157162,2.22031
13,Albania,2016,200,5652,83.0,39.0,7.0,1.99,11139.9,Central and Eastern Europe,...,3.96209,2.803707,33.7,11.289,8.035,78.86,2876101.0,1195854.0,0.170949,2.742043


In [10]:
# Write the cleaned, merged table to disk; the row index is not written.
complete_data_path = "../data/processed/complete_data.csv"
merged_data.to_csv(complete_data_path, index=False)