## DEPENDENCIES

In [1]:
import pandas as pd
from pathlib import Path
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## IMPORT DATA

In [2]:
# Locate the SQLite database and build a SQLAlchemy engine for it
happiness_path = Path('Resources/HappinessIndexScore.sqlite')

# Fail fast on a wrong path: SQLite would otherwise create a new, empty
# database file on connect, and the problem would only show up later as a
# confusing "no such table" error when querying.
if not happiness_path.is_file():
    raise FileNotFoundError(f'SQLite database not found: {happiness_path.resolve()}')

engine = create_engine(f'sqlite:///{happiness_path}')

# Smoke-test connectivity; the context manager closes the connection
# even if opening it partially fails.
with engine.connect() as conn:
    pass

In [3]:
# Load every row and column of the final_output table into a DataFrame
select_all_query = 'SELECT * FROM final_output'
happiness_df = pd.read_sql(sql=select_all_query, con=engine)

# Preview the first rows
happiness_df.head(n=5)

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23


## LOGISTIC REGRESSION MODEL

In [4]:
# The median ladder score is the cut-off used to split countries into two classes
ladder_scores = happiness_df.loc[:, 'ladder_score']
mid_point = ladder_scores.median()
mid_point

5.722

In [5]:
# Binary target: 1 when the ladder score is at or above the median, otherwise 0
at_or_above_median = happiness_df['ladder_score'] >= mid_point
happiness_df['happiness'] = at_or_above_median.astype('int64')

In [6]:
# Drop unnecessary columns: the country/region labels and the raw ladder
# score that the 'happiness' target was derived from.
# Reassign instead of mutating with inplace=True so the step reads as an
# explicit transformation of the frame.
happiness_df = happiness_df.drop(columns=['country', 'region', 'ladder_score'])

In [7]:
target_column = 'happiness'

# Features (X): every remaining column except the target
X = happiness_df.drop(columns=[target_column])

# Target (y): the binary happiness label
y = happiness_df[target_column]

In [8]:
# Preview the first five target labels
y.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: happiness, dtype: int64

In [9]:
# Preview the first five feature rows
X.head(n=5)

Unnamed: 0,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita
0,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23
1,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16
2,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72
3,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07
4,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23


In [10]:
# Hold out a test set with scikit-learn's default split proportions;
# random_state=1 makes the shuffle reproducible.
# NOTE(review): the split is not stratified, so class balance can differ
# between train and test — consider stratify=y if that matters here.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit on the training features only, so the scaling parameters are not
# derived from the test set
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaling to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [12]:
# Build the logistic-regression classifier (lbfgs solver, random_state=1)
classifier = LogisticRegression(random_state=1, solver='lbfgs')

# Train on the scaled training features; the call's return value is the
# cell's displayed output
classifier.fit(X=X_train_scaled, y=y_train)

In [13]:
# Mean accuracy of the fitted model on the (scaled) training data
training_score = classifier.score(X_train_scaled, y_train)
print(f"Training Data Score: {training_score}")

Training Data Score: 0.9347826086956522


In [14]:
# Predict labels for the held-out (scaled) test features
predictions = classifier.predict(X_test_scaled)

# Side-by-side view of predicted vs. actual labels for the first few test rows
prediction_vs_actual = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
prediction_vs_actual.head()

Unnamed: 0,Prediction,Actual
48,0,1
113,0,0
73,0,0
105,0,0
45,0,1


In [15]:
# Get the accuracy score on the test set.
# Arguments follow scikit-learn's (y_true, y_pred) order, matching the
# confusion-matrix and classification-report cells; the value is the same
# either way because accuracy is symmetric in its two arguments.
accuracy_score(y_test, predictions)

0.8064516129032258

In [16]:
# Confusion matrix for the test set: rows are actual labels, columns are predicted labels
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[17,  4],
       [ 2,  8]])

In [17]:
# Per-class precision/recall/F1 for the test set; label 0 is shown as
# 'Unhappy' and label 1 as 'Happy'
target_names = ['Unhappy', 'Happy']
report = classification_report(y_test, predictions, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     Unhappy       0.89      0.81      0.85        21
       Happy       0.67      0.80      0.73        10

    accuracy                           0.81        31
   macro avg       0.78      0.80      0.79        31
weighted avg       0.82      0.81      0.81        31



In [18]:
# Rank features by the absolute size of their fitted coefficient
# (largest first); each entry is a (coefficient, feature_name) pair.
importances = sorted(
    zip(classifier.coef_[0], X.columns),
    key=lambda coef_and_name: abs(coef_and_name[0]),
    reverse=True,
)
importances


[(1.5027339642727484, 'social_support'),
 (1.2110747140987579, 'logged_GPD_per_capita'),
 (0.8573516598257989, 'freedom_life_choices'),
 (0.6418046110823283, 'healthy_life_expectancy'),
 (0.5434415629435844, 'median_age'),
 (-0.5415229920342285, 'perceptions_corruption'),
 (0.5018829887007995, 'generosity'),
 (0.21119515480230724, 'lt_alcohol_per_capita'),
 (0.1771083389494039, 'avg_temperature'),
 (-0.11724563802047769, 'unemployment_rate'),
 (-0.11707003180174987, 'gini_coefficient'),
 (-0.10440315705067321, 'population_density')]