<a href="https://colab.research.google.com/github/crystdang/DABC-Final-G5/blob/main/notebooks/Output_decisionTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn import tree

In [2]:
url = "https://uoft-dabc-final-g5.s3.us-east-2.amazonaws.com/transformed_table.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,date,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,bmi_range,...,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,age,gender,work_life_balance_score
0,2015-07-07,3,2,2,5,0,5,2,0,1,...,7,5,5,1,4,0,5,36 to 50,Female,609.5
1,2015-07-07,2,3,4,3,8,10,5,2,2,...,8,2,2,2,3,2,6,36 to 50,Female,655.6
2,2015-07-07,2,3,3,4,4,10,3,2,2,...,8,10,2,2,4,8,3,36 to 50,Female,631.6
3,2015-07-07,3,3,10,3,10,7,2,5,2,...,5,7,5,1,5,2,0,51 or more,Female,622.7
4,2015-07-07,5,1,3,3,10,4,2,4,2,...,7,0,0,2,8,1,5,51 or more,Female,663.9


In [3]:
# Drop the non-beneficial ID columns, 'date', 'bmi_range', 'age', 'gender'.
reduced_df = df.drop(['date', 'bmi_range', 'age', 'gender'], axis=1)
reduced_df.head()

Unnamed: 0,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,todo_completed,flow,daily_steps,live_vision,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,work_life_balance_score
0,3,2,2,5,0,5,2,0,6,4,5,0,7,5,5,1,4,0,5,609.5
1,2,3,4,3,8,10,5,2,5,2,5,5,8,2,2,2,3,2,6,655.6
2,2,3,3,4,4,10,3,2,2,2,4,5,8,10,2,2,4,8,3,631.6
3,3,3,10,3,10,7,2,5,3,5,5,0,5,7,5,1,5,2,0,622.7
4,5,1,3,3,10,4,2,4,5,0,5,0,7,0,0,2,8,1,5,663.9


In [4]:
# Determine the number of unique values in each column.
reduced_df.nunique()

fruits_veggies                6
daily_stress                  6
places_visited               11
core_circle                  11
supporting_others            11
social_network               11
achievement                  11
donation                      6
todo_completed               11
flow                         11
daily_steps                  10
live_vision                  11
sleep_hours                  10
lost_vacation                11
daily_shouting               11
sufficient_income             2
personal_awards              11
time_for_passion             11
weekly_meditation            11
work_life_balance_score    1696
dtype: int64

In [5]:
# Look at work_life_balance_score value counts for binning
WLB_count = reduced_df.work_life_balance_score.value_counts()
WLB_count

641.4    75
660.5    57
670.7    37
696.4    37
675.8    36
         ..
794.8     1
562.1     1
555.6     1
531.0     1
790.5     1
Name: work_life_balance_score, Length: 1696, dtype: int64

In [6]:
# Determine which values to replace if counts are less than ...?
replace_WLB = list(WLB_count.index)

# Replace in dataframe
for amt in replace_WLB:
    if amt <= 700:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("needs_improvement"))
    else:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("excellent"))

# Check to make sure binning was successful
reduced_df.work_life_balance_score.value_counts()

needs_improvement    12185
excellent             3786
Name: work_life_balance_score, dtype: int64

In [7]:
# Split our preprocessed data into our features and target arrays
y = reduced_df["work_life_balance_score"].values
X = reduced_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [8]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Resample the training data with Decision Tree
model = tree.DecisionTreeClassifier().fit(X_train_scaled, y_train)

In [10]:
# Calculated the balanced accuracy score
pred = model.predict(X_test_scaled)
accuracy_score(y_test, pred)

0.8665164037064863

In [11]:
# Display the confusion matrix
cm = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Excellent", "Actual Needs Improvement"], columns=["Predicted Excellent", "Predicted Needs Improvement"])
cm_df

Unnamed: 0,Predicted Excellent,Predicted Needs Improvement
Actual Excellent,698,249
Actual Needs Improvement,284,2762


In [12]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.71      0.74      0.91      0.72      0.82      0.66       947
needs_improvement       0.92      0.91      0.74      0.91      0.82      0.68      3046

      avg / total       0.87      0.87      0.78      0.87      0.82      0.67      3993



In [13]:
# No feature importance, live_vision and flow hypothesized as top features
liveflow_df = reduced_df[['flow', 'live_vision', 'work_life_balance_score']]

In [14]:
# Split our preprocessed data into our features and target arrays
y = liveflow_df["work_life_balance_score"].values
X = liveflow_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [15]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Resample the training data with Decision Tree
model = tree.DecisionTreeClassifier().fit(X_train_scaled, y_train)

In [17]:
# Calculate the balanced accuracy score
pred = model.predict(X_test_scaled)
accuracy_score(y_test, pred)

0.8001502629601803

In [18]:
# Display the confusion matrix
cm = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Excellent", "Actual Needs Improvement"], columns=["Predicted Excellent", "Predicted Needs Improvement"])
cm_df

Unnamed: 0,Predicted Excellent,Predicted Needs Improvement
Actual Excellent,319,628
Actual Needs Improvement,170,2876


In [19]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.65      0.34      0.94      0.44      0.56      0.30       947
needs_improvement       0.82      0.94      0.34      0.88      0.56      0.34      3046

      avg / total       0.78      0.80      0.48      0.78      0.56      0.33      3993

