In [7]:
import pandas as pd
from pathlib import Path 
import sqlite3

# Load the raw input tables. Every source file lives in the same directory, so that
# directory is named once here instead of being repeated inside each path literal.
RESOURCES_DIR = Path('Resources')

# CSV inputs
population_df = pd.read_csv(RESOURCES_DIR / 'Population.csv')
happinessindex_df = pd.read_csv(RESOURCES_DIR / 'HappinessIndex.csv')
unemployment_rate_df = pd.read_csv(RESOURCES_DIR / 'Unemployment_rate.csv')
gini_index_df = pd.read_csv(RESOURCES_DIR / 'Gini Index coefficient - distribution of family income.csv')
median_age_df = pd.read_csv(RESOURCES_DIR / 'Median age.csv')

# Excel input for temperature data
avg_temp_df = pd.read_excel(RESOURCES_DIR / 'Avg_Temperature.xlsx')

In [8]:

# Rename columns so that every source frame exposes the join key as 'Country' and the
# per-source columns used later get distinct names.
# rename() returns a new frame, which is bound back to the same variable instead of
# mutating with inplace=True. Labels that a frame does not have are skipped silently
# (rename's default errors='ignore'), so a misspelled source label will not raise here.
population_df = population_df.rename(columns={'name': 'Country', 'value': 'Population', 'region': 'Region'})
happinessindex_df = happinessindex_df.rename(columns={'Country name': 'Country'})
unemployment_rate_df = unemployment_rate_df.rename(columns={'name': 'Country', 'ranking': 'Ranking_unemployment', 'region': 'Region'})
gini_index_df = gini_index_df.rename(columns={'name': 'Country', 'region': 'Region'})
median_age_df = median_age_df.rename(columns={'name': 'Country', 'ranking': 'Ranking_median_age', 'region': 'Region'})
avg_temp_df = avg_temp_df.rename(columns={'name': 'Country'})


In [9]:

# Normalise the join key: trim surrounding whitespace from 'Country' in every source frame
dataframes = [
    population_df,
    happinessindex_df,
    unemployment_rate_df,
    gini_index_df,
    median_age_df,
    avg_temp_df,
]
for df in dataframes:
    country_names = df['Country']
    df['Country'] = country_names.str.strip()

In [10]:

# Parse 'Average Temperature' as numbers; values that cannot be parsed become NaN
temperature_column = 'Average Temperature'
avg_temp_df[temperature_column] = pd.to_numeric(
    avg_temp_df[temperature_column],
    errors='coerce',
)

In [11]:

# Debugging: report how many missing 'Country' values each source DataFrame contains.
# Frames are labelled by their variable name. df.columns[0] is only a column label
# (it is 'Country' for every frame here), so it cannot identify which frame a count
# belongs to.
named_dataframes = {
    'population_df': population_df,
    'happinessindex_df': happinessindex_df,
    'unemployment_rate_df': unemployment_rate_df,
    'gini_index_df': gini_index_df,
    'median_age_df': median_age_df,
    'avg_temp_df': avg_temp_df,
}
for frame_name, df in named_dataframes.items():
    print(f"NaN values in 'Country' column of {frame_name}: {df['Country'].isna().sum()}")


NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0


In [12]:
# Left-join every source onto the happiness table by 'Country'.
# Each later source that also carries a 'Region' column gets its copy suffixed with the
# source name, so the already-present 'Region' column keeps its plain name.
merged_df = (
    happinessindex_df
    .merge(
        population_df[['Country', 'Population', 'Region']],
        on='Country',
        how='left',
    )
    .merge(
        unemployment_rate_df[['Country', 'Ranking_unemployment', 'Region']],
        on='Country',
        how='left',
        suffixes=('', '_unemployment'),
    )
    .merge(
        gini_index_df[['Country', 'Region']],
        on='Country',
        how='left',
        suffixes=('', '_gini'),
    )
    .merge(
        median_age_df[['Country', 'Ranking_median_age', 'Region']],
        on='Country',
        how='left',
        suffixes=('', '_median_age'),
    )
    .merge(
        avg_temp_df,
        on='Country',
        how='left',
        suffixes=('', '_avg_temp'),
    )
)


In [13]:

# Coalesce the per-source Region columns into the single 'Region' column, then drop the
# suffixed helper columns. For each row the first non-null value wins, in this order:
# 'Region', 'Region_unemployment', 'Region_gini', 'Region_median_age'.
# fillna() is vectorised and index-aligned, replacing a row-by-row apply().
# Only helper columns that are still present are used, so re-running this cell after the
# helpers were already dropped is a no-op instead of a KeyError.
region_fallback_columns = [
    column
    for column in ('Region_unemployment', 'Region_gini', 'Region_median_age')
    if column in merged_df.columns
]
for column in region_fallback_columns:
    merged_df['Region'] = merged_df['Region'].fillna(merged_df[column])
merged_df = merged_df.drop(columns=region_fallback_columns)


In [14]:
# Render 'Average Temperature' as text and show missing values (the string 'nan' after
# the cast) as 'n/a'. Skipped entirely when the column is absent.
if 'Average Temperature' in merged_df.columns:
    merged_df['Average Temperature'] = (
        merged_df['Average Temperature']
        .astype(str)
        .replace('nan', 'n/a')
    )

In [15]:

# Replace remaining NaN values with 'n/a' in every column except 'Average Temperature',
# which was already handled separately.
other_columns = merged_df.columns.difference(['Average Temperature'])
filled_other_columns = merged_df[other_columns].fillna('n/a')
merged_df[other_columns] = filled_other_columns


In [16]:

# Columns that make up the final dataset, in output order
columns_to_keep = [
    'Country',
    'Region',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Population',
    'Ranking_unemployment',
    'Ranking_median_age',
    'Average Temperature',
]

# Warn (without failing) about any requested column the merged frame does not contain
missing_cols = []
for col in columns_to_keep:
    if col not in merged_df.columns:
        missing_cols.append(col)
if len(missing_cols) > 0:
    print(f"Warning: Column(s) {missing_cols} are missing from the DataFrame.")


In [17]:
# Keep only the requested columns that are actually present, preserving the order given
# in columns_to_keep
existing_columns_to_keep = []
for col in columns_to_keep:
    if col in merged_df.columns:
        existing_columns_to_keep.append(col)

# Select those columns and write the result to disk without the index column
cleaned_df = merged_df.loc[:, existing_columns_to_keep]
cleaned_df.to_csv('final_output.csv', index=False)
