In [68]:
import pandas as pd
import sqlite3
from pathlib import Path
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [69]:
# Read every source table from the Resources folder into its own DataFrame.
resources_dir = Path('Resources')

population_df = pd.read_csv(resources_dir / 'Population.csv')
happinessindex_df = pd.read_csv(resources_dir / 'HappinessIndex.csv')
unemployment_rate_df = pd.read_csv(resources_dir / 'Unemployment_rate.csv')
gini_index_df = pd.read_csv(
    resources_dir / 'Gini Index coefficient - distribution of family income.csv'
)
median_age_df = pd.read_csv(resources_dir / 'Median age.csv')
avg_temp_df = pd.read_csv(resources_dir / 'avg_temperature.csv')
alc_df = pd.read_csv(resources_dir / 'alcohol consumption per capita.csv')

In [70]:
# Collect the source frames once so the cleaning steps can loop over them,
# then trim stray whitespace around every column header.
dataframes = [
    population_df,
    happinessindex_df,
    unemployment_rate_df,
    gini_index_df,
    median_age_df,
    avg_temp_df,
    alc_df,
]
for frame in dataframes:
    frame.columns = frame.columns.str.strip()

In [71]:
# Give the key and value columns the same names in every table so the tables
# can be merged on 'Country'.
# The rename is done in place on purpose: the `dataframes` list holds
# references to these same objects and is iterated again afterwards.
column_renames = [
    (population_df, {'name': 'Country', 'value': 'Population', 'region': 'Region'}),
    (happinessindex_df, {'Country name': 'Country', 'Ladder score': 'Ladder score'}),
    (unemployment_rate_df, {'name': 'Country', 'ranking': 'Ranking_unemployment', 'region': 'Region'}),
    (gini_index_df, {'name': 'Country', 'region': 'Region', 'value': 'Gini coefficient'}),
    (median_age_df, {'name': 'Country', 'ranking': 'Ranking_median_age', 'region': 'Region'}),
    (avg_temp_df, {'name': 'Country', 'Average Temperature': 'Average Temperature'}),
    (alc_df, {'name': 'Country', 'liters of pure alcohol': 'Liters of pure alcohol'}),
]
for frame, mapping in column_renames:
    frame.rename(columns=mapping, inplace=True)

In [72]:
# Trim whitespace around the country names in every table so that the merge
# keys compare equal.
for frame in dataframes:
    country_names = frame['Country']
    frame['Country'] = country_names.str.strip()

In [73]:
# 'Population' is read as text with thousands separators; remove the commas
# and store the column as integers.
population_text = population_df['Population'].str.replace(',', '')
population_df['Population'] = population_text.astype(int)

In [74]:
# Parse 'Average Temperature' as numbers; entries that cannot be parsed
# become NaN instead of raising.
temperature_raw = avg_temp_df['Average Temperature']
avg_temp_df['Average Temperature'] = pd.to_numeric(temperature_raw, errors='coerce')

In [75]:
# Debugging: count missing values in the 'Country' column of each DataFrame.
# Each line is labelled with the DataFrame's variable name. The previous label,
# df.columns[0], is 'Country' for every frame once the columns are renamed, so
# the lines of output could not be told apart.
# The labels must stay in the same order as the `dataframes` list.
dataframe_labels = [
    'population_df',
    'happinessindex_df',
    'unemployment_rate_df',
    'gini_index_df',
    'median_age_df',
    'avg_temp_df',
    'alc_df',
]
for label, df in zip(dataframe_labels, dataframes):
    print(f"NaN values in 'Country' column of {label}: {df['Country'].isna().sum()}")

NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0
NaN values in 'Country' column of Country: 0


In [76]:
# Left-join every table onto the happiness index, keyed on 'Country', so the
# result keeps one row per happiness-index row plus whatever matches exist.
# 'Region' is taken from the population, unemployment, Gini and median-age
# tables; the later copies receive a suffix and are reconciled afterwards.
# The temperature and alcohol tables are merged with all of their columns.
merged_df = (
    happinessindex_df
    .merge(
        population_df[['Country', 'Population', 'Region']],
        on='Country', how='left',
    )
    .merge(
        unemployment_rate_df[['Country', 'Ranking_unemployment', 'Region']],
        on='Country', how='left', suffixes=('', '_unemployment'),
    )
    .merge(
        gini_index_df[['Country', 'Gini coefficient', 'Region']],
        on='Country', how='left', suffixes=('', '_gini'),
    )
    .merge(
        median_age_df[['Country', 'Ranking_median_age', 'Region']],
        on='Country', how='left', suffixes=('', '_median_age'),
    )
    .merge(avg_temp_df, on='Country', how='left', suffixes=('', '_avg_temp'))
    .merge(alc_df, on='Country', how='left', suffixes=('', '_alc_df'))
)

In [77]:
# Collapse the duplicated region columns into a single 'Region' column.
# Priority order: 'Region' (from the population table), then the unemployment,
# Gini and median-age copies. The first non-null value wins; a row stays null
# only when every source is null.
# This is a vectorised coalesce rather than a row-wise apply: same result,
# no Python-level loop over rows, and it also works on an empty frame.
region_fallbacks = ['Region_unemployment', 'Region_gini', 'Region_median_age']
region = merged_df['Region']
for fallback in region_fallbacks:
    # Keep values already chosen; fill only the rows that are still missing.
    region = region.where(region.notna(), merged_df[fallback])
merged_df['Region'] = region

# The helper columns are no longer needed once 'Region' is consolidated.
merged_df = merged_df.drop(columns=region_fallbacks)

In [78]:
# Show which columns the merged DataFrame ended up with.
print("\nColumns in merged DataFrame:", merged_df.columns, sep="\n")


Columns in merged DataFrame:
Index(['Country', 'Ladder score', 'Standard error of ladder score',
       'upperwhisker', 'lowerwhisker', 'Logged GDP per capita',
       'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual', 'Population', 'Region', 'Ranking_unemployment',
       'Gini coefficient', 'Ranking_median_age', 'Average Temperature', 'slug',
       'Liters of pure alcohol', 'date_of_information', 'ranking', 'region'],
      dtype='object')


In [79]:
# Columns that make up the final dataset, in output order.
columns_to_keep = [
    'Country',
    'Region',
    'Ladder score',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Population',
    'Ranking_unemployment',
    'Ranking_median_age',
    'Gini coefficient',
    'Average Temperature',
    'Liters of pure alcohol',
]

# Warn about any requested column that the merge did not produce.
available_columns = set(merged_df.columns)
missing_cols = [name for name in columns_to_keep if name not in available_columns]
if missing_cols:
    print(f"Warning: Column(s) {missing_cols} are missing from the DataFrame.")

# Keep only the requested columns that actually exist, preserving their order.
existing_columns_to_keep = [name for name in columns_to_keep if name in available_columns]

# Turn the 'n/a' placeholder into a missing value, then drop every row that has
# any missing value. This removes the 'n/a' rows and also rows with values
# that are missing for any other reason.
cleaned_df = (
    merged_df[existing_columns_to_keep]
    .replace('n/a', pd.NA)
    .dropna()
)

# Persist the cleaned dataset for later use.
cleaned_df.to_csv('Resources/final_output.csv', index=False)

In [80]:
# Display the cleaned dataset as the cell's output
cleaned_df

Unnamed: 0,Country,Region,Ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Population,Ranking_unemployment,Ranking_median_age,Gini coefficient,Average Temperature,Liters of pure alcohol
0,Finland,Europe,7.804,10.792,0.969,71.150,0.961,-0.019,0.182,5614571.0,141.0,36.0,27.7,3.24,8.23
1,Denmark,Europe,7.586,10.962,0.954,71.250,0.934,0.134,0.196,5946984.0,100.0,44.0,27.7,9.77,9.16
2,Iceland,Europe,7.530,10.896,0.983,72.050,0.936,0.211,0.668,360872.0,64.0,77.0,26.1,2.11,7.72
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,9043387.0,59.0,137.0,38.6,20.23,3.07
4,Netherlands,Europe,7.403,10.942,0.930,71.550,0.887,0.213,0.379,17463930.0,65.0,43.0,29.2,11.72,8.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,Botswana,Africa,3.435,9.629,0.753,54.725,0.742,-0.215,0.830,2417596.0,211.0,159.0,53.3,22.00,5.98
133,Zimbabwe,Africa,3.204,7.641,0.690,54.050,0.654,-0.046,0.766,15418674.0,158.0,191.0,50.3,21.83,3.11
134,Sierra Leone,Africa,3.138,7.394,0.555,54.900,0.660,0.105,0.858,8908040.0,55.0,208.0,35.7,26.64,3.22
135,Lebanon,Middle East,2.392,9.478,0.530,66.149,0.474,-0.141,0.891,5331203.0,178.0,89.0,31.8,15.66,1.14


In [81]:
# Write the cleaned dataset to a SQLite file as table 'final_output',
# replacing the table if it already exists.
conn = sqlite3.connect('Resources/HappinessIndexScore.sqlite')
try:
    cleaned_df.to_sql('final_output', conn, if_exists='replace', index=False)
finally:
    # Close the connection even when the write fails, so the database file
    # handle is not left open.
    conn.close()

In [82]:
# Build a SQLAlchemy engine that points at the SQLite file and open a
# connection through it.
happiness_path = Path('Resources/HappinessIndexScore.sqlite')
database_url = f'sqlite:///{happiness_path}'
engine = create_engine(database_url)

# NOTE(review): the read that follows passes `engine` rather than `conn`, so
# this connection looks unused and is never closed in the visible cells.
# Confirm before removing it.
conn = engine.connect()

In [83]:
# Load the whole 'final_output' table back from SQLite into a DataFrame.
final_output_query = 'SELECT * FROM final_output'
happiness_df = pd.read_sql(final_output_query, con=engine)

# Preview the first rows.
happiness_df.head()

Unnamed: 0,Country,Region,Ladder score,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Population,Ranking_unemployment,Ranking_median_age,Gini coefficient,Average Temperature,Liters of pure alcohol
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,5614571.0,141.0,36.0,27.7,3.24,8.23
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,5946984.0,100.0,44.0,27.7,9.77,9.16
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,360872.0,64.0,77.0,26.1,2.11,7.72
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,9043387.0,59.0,137.0,38.6,20.23,3.07
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,17463930.0,65.0,43.0,29.2,11.72,8.23


In [84]:
# Threshold for the binary label: the midpoint between the lowest and highest
# ladder score (half of min + max, not the mean or the median).
ladder_scores = happiness_df['Ladder score']
mid_point = (ladder_scores.min() + ladder_scores.max()) * 0.5
mid_point

4.8315

In [85]:
# Binary target: 1 when the ladder score is at or above the midpoint, else 0.
at_or_above_mid = happiness_df['Ladder score'] >= mid_point
happiness_df['happiness'] = at_or_above_mid.astype('int64')

In [86]:
# Drop the columns that must not be used as features: the two text
# identifiers and the ladder score that the target was derived from.
happiness_df.drop(columns=['Country', 'Region', 'Ladder score'], inplace=True)

In [87]:
# Split the frame into the target vector (y) and the feature matrix (X).
target_column = 'happiness'

# y: the binary label
y = happiness_df[target_column]

# X: every remaining column
X = happiness_df.drop(target_column, axis=1)

In [88]:
# Preview the first rows of the target vector y

y.head()

0    1
1    1
2    1
3    1
4    1
Name: happiness, dtype: int64

In [89]:
# Preview the first rows of the feature matrix X

X.head()

Unnamed: 0,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Population,Ranking_unemployment,Ranking_median_age,Gini coefficient,Average Temperature,Liters of pure alcohol
0,10.792,0.969,71.15,0.961,-0.019,0.182,5614571.0,141.0,36.0,27.7,3.24,8.23
1,10.962,0.954,71.25,0.934,0.134,0.196,5946984.0,100.0,44.0,27.7,9.77,9.16
2,10.896,0.983,72.05,0.936,0.211,0.668,360872.0,64.0,77.0,26.1,2.11,7.72
3,10.639,0.943,72.697,0.809,-0.023,0.708,9043387.0,59.0,137.0,38.6,20.23,3.07
4,10.942,0.93,71.55,0.887,0.213,0.379,17463930.0,65.0,43.0,29.2,11.72,8.23


In [90]:
# Hold out part of the data for testing. random_state=1 makes the shuffle
# reproducible. No test_size is given, so scikit-learn's default split
# proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [91]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted scaler then transforms both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [92]:
# Logistic regression with the lbfgs solver; random_state=1 is set for
# reproducibility.
classifier = LogisticRegression(random_state=1, solver='lbfgs')

# Train on the scaled training split. The fitted estimator is the last
# expression, so the notebook displays it.
classifier.fit(X_train_scaled, y_train)

In [93]:
# Mean accuracy of the fitted model on the data it was trained on.
training_score = classifier.score(X_train_scaled, y_train)
print(f"Training Data Score: {training_score}")

Training Data Score: 0.8586956521739131


In [94]:
# Predict labels for the held-out split and show them next to the true labels.
predictions = classifier.predict(X_test_scaled)
comparison_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
comparison_df.head()

Unnamed: 0,Prediction,Actual
48,1,1
113,0,0
73,1,1
105,0,0
45,1,1


In [95]:
# Accuracy on the held-out split.
# Arguments follow scikit-learn's documented (y_true, y_pred) order, matching
# the confusion matrix and classification report calls. Accuracy is symmetric
# in its two arguments, so the value is the same as before.
accuracy_score(y_test, predictions)

0.8709677419354839

In [96]:
# Confusion matrix for the held-out split: rows are the true classes and
# columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[ 9,  1],
       [ 3, 18]])

In [97]:
# Per-class precision, recall and F1 on the held-out split.
# To label the classes by name instead of 0/1, pass
# target_names=['Unhappy', 'Happy'] to classification_report.
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.90      0.82        10
           1       0.95      0.86      0.90        21

    accuracy                           0.87        31
   macro avg       0.85      0.88      0.86        31
weighted avg       0.88      0.87      0.87        31

