In [1]:
import pandas as pd

# Read every CSV source into its own DataFrame. The paths are listed in the
# same order as the variables they are unpacked into below.
csv_sources = (
    'Resources/Population.csv',
    'Resources/HappinessIndex.csv',
    'Resources/Unemployment_rate.csv',
    'Resources/Gini Index coefficient - distribution of family income.csv',
    'Resources/Median age.csv',
)
(
    population_df,
    happinessindex_df,
    unemployment_rate_df,
    gini_index_df,
    median_age_df,
) = map(pd.read_csv, csv_sources)

# The temperature data is stored as an Excel workbook rather than a CSV.
avg_temp_df = pd.read_excel('Resources/Avg_Temperature.xlsx')

In [2]:

# Give every source a common 'Country' key (plus 'Region' and a descriptive
# value/ranking column where applicable). Each frame is paired with its own
# old-name -> new-name mapping and renamed in place.
column_renames = (
    (population_df, {'name': 'Country', 'value': 'Population', 'region': 'Region'}),
    # 'Ladder score' maps to itself, so only 'Country name' actually changes.
    (happinessindex_df, {'Country name': 'Country', 'Ladder score': 'Ladder score'}),
    (unemployment_rate_df, {'name': 'Country', 'ranking': 'Ranking_unemployment', 'region': 'Region'}),
    (gini_index_df, {'name': 'Country', 'region': 'Region', 'value': 'Gini coefficient'}),
    (median_age_df, {'name': 'Country', 'ranking': 'Ranking_median_age', 'region': 'Region'}),
    # 'Average Temperature' maps to itself as well.
    (avg_temp_df, {'name': 'Country', 'Average Temperature': 'Average Temperature'}),
)
for frame, mapping in column_renames:
    frame.rename(columns=mapping, inplace=True)


In [3]:

# Trim surrounding whitespace from the 'Country' key of every source so that
# the later merges on 'Country' compare clean values.
dataframes = [
    population_df,
    happinessindex_df,
    unemployment_rate_df,
    gini_index_df,
    median_age_df,
    avg_temp_df,
]
for frame in dataframes:
    stripped_country = frame['Country'].str.strip()
    frame['Country'] = stripped_country

In [4]:
# Parse the temperature column as numbers; anything that cannot be parsed
# becomes NaN (errors='coerce') instead of raising.
temperature_numeric = pd.to_numeric(avg_temp_df['Average Temperature'], errors='coerce')
avg_temp_df['Average Temperature'] = temperature_numeric

In [5]:

# Debugging: report how many missing 'Country' values each dataset contains.
# Each frame is paired with its variable name so every output line identifies
# the dataset it refers to. (Labelling with `df.columns[0]` printed the first
# column name -- 'Country' -- for every frame, making the lines
# indistinguishable.)
named_dataframes = {
    'population_df': population_df,
    'happinessindex_df': happinessindex_df,
    'unemployment_rate_df': unemployment_rate_df,
    'gini_index_df': gini_index_df,
    'median_age_df': median_age_df,
    'avg_temp_df': avg_temp_df,
}
for frame_name, frame in named_dataframes.items():
    print(f"NaN values in 'Country' column of {frame_name}: {frame['Country'].isna().sum()}")

NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0


In [6]:

# Show the distinct country names of the happiness and temperature datasets
# so spelling differences between the two can be spotted by eye.
country_listings = (
    ("\nUnique country names in Happiness Index data:", happinessindex_df),
    ("\nUnique country names in Temperature data:", avg_temp_df),
)
for heading, frame in country_listings:
    print(heading)
    print(frame['Country'].unique())


Unique country names in Happiness Index data:
['Finland' 'Denmark' 'Iceland' 'Israel' 'Netherlands' 'Sweden' 'Norway'
 'Switzerland' 'Luxembourg' 'New Zealand' 'Austria' 'Australia' 'Canada'
 'Ireland' 'United States' 'Germany' 'Belgium' 'Czechia' 'United Kingdom'
 'Lithuania' 'France' 'Slovenia' 'Costa Rica' 'Romania' 'Singapore'
 'United Arab Emirates' 'Taiwan Province of China' 'Uruguay' 'Slovakia'
 'Saudi Arabia' 'Estonia' 'Spain' 'Italy' 'Kosovo' 'Chile' 'Mexico'
 'Malta' 'Panama' 'Poland' 'Nicaragua' 'Latvia' 'Bahrain' 'Guatemala'
 'Kazakhstan' 'Serbia' 'Cyprus' 'Japan' 'Croatia' 'Brazil' 'El Salvador'
 'Hungary' 'Argentina' 'Honduras' 'Uzbekistan' 'Malaysia' 'Portugal'
 'South Korea' 'Greece' 'Mauritius' 'Thailand' 'Mongolia' 'Kyrgyzstan'
 'Moldova' 'China' 'Vietnam' 'Paraguay' 'Montenegro' 'Jamaica' 'Bolivia'
 'Russia' 'Bosnia and Herzegovina' 'Colombia' 'Dominican Republic'
 'Ecuador' 'Peru' 'Philippines' 'Bulgaria' 'Nepal' 'Armenia' 'Tajikistan'
 'Algeria' 'Hong Kong S.A.R. 

In [7]:
# Build the combined table by left-joining every other source onto the
# happiness data, keyed on 'Country'. A preview is printed after each step.

# Step 1: population supplies 'Population' and the primary 'Region' column.
merged_df = happinessindex_df.merge(
    population_df[['Country', 'Population', 'Region']],
    on='Country',
    how='left',
)
print("\nAfter merging happiness and population data:")
print(merged_df.head())

# Remaining steps: (right-hand frame, suffix for clashing right-hand columns,
# heading printed above the preview). The left side keeps its column names
# (suffix ''), so each extra 'Region' arrives as 'Region<suffix>'.
merge_steps = (
    (
        unemployment_rate_df[['Country', 'Ranking_unemployment', 'Region']],
        '_unemployment',
        "\nAfter merging unemployment data:",
    ),
    (
        gini_index_df[['Country', 'Gini coefficient', 'Region']],
        '_gini',
        "\nAfter merging gini index data:",
    ),
    (
        median_age_df[['Country', 'Ranking_median_age', 'Region']],
        '_median_age',
        "\nAfter merging median age data:",
    ),
    # The temperature frame is merged with all of its columns.
    (
        avg_temp_df,
        '_avg_temp',
        "\nAfter merging temperature data:",
    ),
)
for right_frame, right_suffix, heading in merge_steps:
    merged_df = merged_df.merge(
        right_frame,
        on='Country',
        how='left',
        suffixes=('', right_suffix),
    )
    print(heading)
    print(merged_df.head())



After merging happiness and population data:
       Country  Ladder score  Standard error of ladder score  upperwhisker  \
0      Finland         7.804                           0.036         7.875   
1      Denmark         7.586                           0.041         7.667   
2      Iceland         7.530                           0.049         7.625   
3       Israel         7.473                           0.032         7.535   
4  Netherlands         7.403                           0.029         7.460   

   lowerwhisker  Logged GDP per capita  Social support  \
0         7.733                 10.792           0.969   
1         7.506                 10.962           0.954   
2         7.434                 10.896           0.983   
3         7.411                 10.639           0.943   
4         7.346                 10.942           0.930   

   Healthy life expectancy  Freedom to make life choices  Generosity  ...  \
0                   71.150                         0.961   

In [8]:
# Collapse the four region columns into a single 'Region': keep the existing
# value where present, otherwise fall back to the unemployment, gini and
# median-age regions, in that priority order. Then drop the helper columns.
#
# This is done column-wise with fillna instead of a row-by-row
# apply(axis=1): the result is the same first-non-null value per row, but it
# avoids a Python-level loop over rows and also behaves on an empty frame.
region_fallbacks = ['Region_unemployment', 'Region_gini', 'Region_median_age']
region = merged_df['Region']
for fallback in region_fallbacks:
    # Only rows that are still null take a value from the fallback column.
    region = region.fillna(merged_df[fallback])
merged_df['Region'] = region
merged_df = merged_df.drop(columns=region_fallbacks)

In [9]:

# List the columns of the merged frame, e.g. to confirm that
# 'Average Temperature' came through the merge.
print("\nColumns in merged DataFrame:", merged_df.columns, sep="\n")


Columns in merged DataFrame:
Index(['Country', 'Ladder score', 'Standard error of ladder score',
       'upperwhisker', 'lowerwhisker', 'Logged GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual', 'Population', 'Region', 'Ranking_unemployment',
       'Gini coefficient', 'Ranking_median_age', 'Average Temperature',
       'https://tradingeconomics.com/country-list/temperature'],
      dtype='object')


In [10]:

# Render the temperature column as text, writing 'n/a' where the value was
# missing (a missing float becomes the string 'nan' after astype(str)).
temperature_col = 'Average Temperature'
if temperature_col in merged_df.columns:
    temperature_text = merged_df[temperature_col].astype(str)
    merged_df[temperature_col] = temperature_text.replace('nan', 'n/a')

# Every other column: replace remaining NaN values with 'n/a'. The temperature
# column is excluded because it was already handled above.
other_columns = merged_df.columns.difference([temperature_col])
filled_others = merged_df[other_columns].fillna('n/a')
merged_df[other_columns] = filled_others


In [11]:
# Columns wanted in the final output, in output order.
columns_to_keep = [
    # identification
    'Country', 'Region',
    # happiness report measures
    'Ladder score', 'Logged GDP per capita', 'Social support',
    'Healthy life expectancy', 'Freedom to make life choices',
    'Generosity', 'Perceptions of corruption',
    # measures merged in from the other sources
    'Population', 'Ranking_unemployment', 'Ranking_median_age',
    'Gini coefficient', 'Average Temperature',
]

# Warn (without failing) about any wanted column the merged frame lacks.
available_columns = set(merged_df.columns)
missing_cols = [wanted for wanted in columns_to_keep if wanted not in available_columns]
if len(missing_cols) > 0:
    print(f"Warning: Column(s) {missing_cols} are missing from the DataFrame.")

In [12]:
# Keep only the wanted columns that are actually present in the merged frame
# (preserving the order of columns_to_keep), then write the result to CSV
# without the index column.
present_in_merged = set(merged_df.columns)
existing_columns_to_keep = [wanted for wanted in columns_to_keep if wanted in present_in_merged]

cleaned_df = merged_df.loc[:, existing_columns_to_keep]

cleaned_df.to_csv('final_output.csv', index=False)