<a href="https://colab.research.google.com/github/crystdang/DABC-Final-G5/blob/main/notebooks/Output_BRF_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Cleaning and Binning Data**

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
import numpy as np
from pathlib import Path
from collections import Counter

In [2]:
url = 'https://uoft-dabc-final-g5.s3.us-east-2.amazonaws.com/transformed_table.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,date,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,bmi_range,...,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,age,gender,work_life_balance_score
0,2015-07-07,3,2,2,5,0,5,2,0,1,...,7,5,5,1,4,0,5,36 to 50,Female,609.5
1,2015-07-07,2,3,4,3,8,10,5,2,2,...,8,2,2,2,3,2,6,36 to 50,Female,655.6
2,2015-07-07,2,3,3,4,4,10,3,2,2,...,8,10,2,2,4,8,3,36 to 50,Female,631.6
3,2015-07-07,3,3,10,3,10,7,2,5,2,...,5,7,5,1,5,2,0,51 or more,Female,622.7
4,2015-07-07,5,1,3,3,10,4,2,4,2,...,7,0,0,2,8,1,5,51 or more,Female,663.9


In [3]:
# Look at submission dates
date_count = df.date.value_counts()
date_count

2018-07-23    162
2016-03-22    144
2018-05-29    112
2019-08-05    101
2018-08-13     48
             ... 
2018-01-18      1
2018-04-11      1
2016-08-14      1
2018-04-04      1
2018-06-06      1
Name: date, Length: 1974, dtype: int64

In [4]:
# Drop the non-beneficial ID columns, 'date', 'bmi_range', 'age', 'gender'.
reduced_df = df.drop(['date', 'bmi_range', 'age', 'gender'], axis=1)
reduced_df.head()

Unnamed: 0,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,todo_completed,flow,daily_steps,live_vision,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,work_life_balance_score
0,3,2,2,5,0,5,2,0,6,4,5,0,7,5,5,1,4,0,5,609.5
1,2,3,4,3,8,10,5,2,5,2,5,5,8,2,2,2,3,2,6,655.6
2,2,3,3,4,4,10,3,2,2,2,4,5,8,10,2,2,4,8,3,631.6
3,3,3,10,3,10,7,2,5,3,5,5,0,5,7,5,1,5,2,0,622.7
4,5,1,3,3,10,4,2,4,5,0,5,0,7,0,0,2,8,1,5,663.9


In [5]:
# Determine the number of unique values in each column.
reduced_df.nunique()

fruits_veggies                6
daily_stress                  6
places_visited               11
core_circle                  11
supporting_others            11
social_network               11
achievement                  11
donation                      6
todo_completed               11
flow                         11
daily_steps                  10
live_vision                  11
sleep_hours                  10
lost_vacation                11
daily_shouting               11
sufficient_income             2
personal_awards              11
time_for_passion             11
weekly_meditation            11
work_life_balance_score    1696
dtype: int64

In [6]:
# Look at work_life_balance_score value counts for binning
WLB_count = reduced_df.work_life_balance_score.value_counts()
WLB_count

641.4    75
660.5    57
670.7    37
696.4    37
675.8    36
         ..
794.8     1
562.1     1
555.6     1
531.0     1
790.5     1
Name: work_life_balance_score, Length: 1696, dtype: int64

In [7]:
# Determine which values to replace
replace_WLB = list(WLB_count.index)

# Replace in dataframe
for amt in replace_WLB:
    if amt <= 700:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("needs_improvement"))
    else:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("excellent"))

# Check to make sure binning was successful
reduced_df.work_life_balance_score.value_counts()

needs_improvement    12185
excellent             3786
Name: work_life_balance_score, dtype: int64

**Random Forest Classifier**

In [8]:
# Split our preprocessed data into our features and target arrays
y = reduced_df["work_life_balance_score"].values
X = reduced_df.drop(["work_life_balance_score"], axis=1)
# .values returns a numpy array, not a Pandas dataframe. An array does not have a columns attribute

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Resample the training data with the BalancedRandomForestClassifier
# Used Balanced to manage input data imbalance: https://imbalanced-learn.org/stable/references/generated/imblearn.ensemble.BalancedRandomForestClassifier.html
from imblearn.ensemble import BalancedRandomForestClassifier
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

In [11]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.9398315931500172

In [12]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,926,21
Actual needs_improvement,299,2747


In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.76      0.98      0.90      0.85      0.94      0.89       947
needs_improvement       0.99      0.90      0.98      0.94      0.94      0.88      3046

      avg / total       0.94      0.92      0.96      0.92      0.94      0.88      3993



In [14]:
# Calculate feature importance in the Random Forest model.
importances = clf.feature_importances_
importances


array([0.04646439, 0.03948106, 0.07258066, 0.0702032 , 0.10738067,
       0.02973986, 0.09513296, 0.04249159, 0.0692937 , 0.03436454,
       0.05122611, 0.05494553, 0.01687931, 0.03359228, 0.0291178 ,
       0.02495852, 0.07516504, 0.06313838, 0.04384439])

In [15]:
# List the features sorted in descending order by feature importance
features = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
features

[(0.10738067252267391, 'supporting_others'),
 (0.09513296106514967, 'achievement'),
 (0.07516503637036515, 'personal_awards'),
 (0.07258066367516036, 'places_visited'),
 (0.07020320030817044, 'core_circle'),
 (0.0692936993422422, 'todo_completed'),
 (0.06313838339265122, 'time_for_passion'),
 (0.05494553254354476, 'live_vision'),
 (0.051226112920385924, 'daily_steps'),
 (0.04646438642244319, 'fruits_veggies'),
 (0.043844389373568604, 'weekly_meditation'),
 (0.04249158867241939, 'donation'),
 (0.03948106457160489, 'daily_stress'),
 (0.0343645362032892, 'flow'),
 (0.033592279882269135, 'lost_vacation'),
 (0.029739860006606795, 'social_network'),
 (0.029117803010919267, 'daily_shouting'),
 (0.02495852319107609, 'sufficient_income'),
 (0.016879306525459714, 'sleep_hours')]

In [16]:
# Split our preprocessed data into our features and target arrays removing low importance
minimal_df = reduced_df.drop(['daily_stress', 'flow', 'lost_vacation', 'social_network', 'daily_shouting', 'sufficient_income', 'sleep_hours'], axis=1)

y = minimal_df["work_life_balance_score"].values
X = minimal_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [19]:
# Define model using 128 maximum n_estimators for better accuracy
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)


In [20]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.9026831456560822

In [21]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,883,64
Actual needs_improvement,387,2659


In [22]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.70      0.93      0.87      0.80      0.90      0.82       947
needs_improvement       0.98      0.87      0.93      0.92      0.90      0.81      3046

      avg / total       0.91      0.89      0.92      0.89      0.90      0.81      3993



In [23]:
# Calculate feature importance in the Random Forest model.
importances = clf.feature_importances_

# List the features sorted in descending order by feature importance
features = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
features

[(0.11564106010872416, 'supporting_others'),
 (0.10939025982904808, 'achievement'),
 (0.10579896021181058, 'places_visited'),
 (0.08926834959790948, 'personal_awards'),
 (0.08784903922786051, 'todo_completed'),
 (0.0870881893230704, 'time_for_passion'),
 (0.08505967018679736, 'core_circle'),
 (0.07470178726137762, 'live_vision'),
 (0.07000190479233563, 'daily_steps'),
 (0.06203094626040235, 'weekly_meditation'),
 (0.058508483144500335, 'fruits_veggies'),
 (0.054661350056163446, 'donation')]

In [24]:
# Split our preprocessed data into our features and target arrays removing low importance
min_df = minimal_df.drop(['daily_steps', 'fruits_veggies', 'weekly_meditation','donation'], axis=1)

y = min_df["work_life_balance_score"].values
X = min_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [28]:
# Define model
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

In [29]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.8666041846214434

In [30]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,847,100
Actual needs_improvement,491,2555


In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.63      0.89      0.84      0.74      0.87      0.75       947
needs_improvement       0.96      0.84      0.89      0.90      0.87      0.75      3046

      avg / total       0.88      0.85      0.88      0.86      0.87      0.75      3993



In [32]:
# Calculate feature importance in the Random Forest model.
importances = clf.feature_importances_

# List the features sorted in descending order by feature importance
features = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
features

[(0.15700974819968694, 'supporting_others'),
 (0.13508537022472322, 'achievement'),
 (0.13372246200433613, 'places_visited'),
 (0.12627716664041042, 'todo_completed'),
 (0.11806302544927944, 'time_for_passion'),
 (0.11455458208645332, 'personal_awards'),
 (0.11346817589628982, 'core_circle'),
 (0.10181946949882084, 'live_vision')]

In [33]:
# Split our preprocessed data into our features and target arrays removing low importance
liveflow_df = reduced_df[['flow', 'live_vision', 'work_life_balance_score']]

y = liveflow_df["work_life_balance_score"].values
X = liveflow_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [37]:
# Define model
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

In [38]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.725251528654957

In [39]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,694,253
Actual needs_improvement,860,2186


In [40]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.45      0.73      0.72      0.55      0.73      0.53       947
needs_improvement       0.90      0.72      0.73      0.80      0.73      0.53      3046

      avg / total       0.79      0.72      0.73      0.74      0.73      0.53      3993



In [41]:
# Split our preprocessed data into our 2 features and target arrays
support_achieve_df = reduced_df[['supporting_others', 'achievement', 'work_life_balance_score']]

y = support_achieve_df["work_life_balance_score"].values
X = support_achieve_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [42]:
# Define model
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

In [43]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.759616017960439

In [44]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,788,159
Actual needs_improvement,953,2093


In [45]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.45      0.83      0.69      0.59      0.76      0.58       947
needs_improvement       0.93      0.69      0.83      0.79      0.76      0.56      3046

      avg / total       0.82      0.72      0.80      0.74      0.76      0.57      3993



**XGBoost**

In [46]:
# Split our preprocessed data into our features and target arrays
y = reduced_df["work_life_balance_score"].values
X = reduced_df.drop(["work_life_balance_score"], axis=1)

In [47]:
# Suggested by Ankush: https://xgboost.readthedocs.io/en/stable/index.html
from xgboost import XGBClassifier

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
Counter(y_train)

Counter({'needs_improvement': 9776, 'excellent': 3000})

In [48]:
# Define model
# use maximum n_estimators
bst = XGBClassifier(n_estimators=128, max_depth=2, learning_rate=1, objective='binary:logistic')

# Fit model
bst.fit(X_train, y_train)
preds = bst.predict(X_test)

In [49]:
balanced_accuracy_score(y_test, preds)

0.9194345948241169

In [50]:
# Display the confusion matrix
cm = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,691,95
Actual needs_improvement,97,2312


In [51]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, preds))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.88      0.88      0.96      0.88      0.92      0.84       786
needs_improvement       0.96      0.96      0.88      0.96      0.92      0.85      2409

      avg / total       0.94      0.94      0.90      0.94      0.92      0.85      3195



In [52]:
# Calculate feature importance in the XGBoost model.
importances = bst.feature_importances_
importances

array([0.09454258, 0.06029003, 0.05669009, 0.0719367 , 0.08035199,
       0.03554204, 0.05940647, 0.03617318, 0.0514455 , 0.01968969,
       0.04933583, 0.04040269, 0.01675281, 0.05468821, 0.03157371,
       0.08567499, 0.05176474, 0.06570591, 0.0380328 ], dtype=float32)

In [53]:
# List the features sorted in descending order by feature importance
features = sorted(zip(bst.feature_importances_, X.columns), reverse=True)
features

[(0.09454258, 'fruits_veggies'),
 (0.085674986, 'sufficient_income'),
 (0.08035199, 'supporting_others'),
 (0.0719367, 'core_circle'),
 (0.06570591, 'time_for_passion'),
 (0.060290035, 'daily_stress'),
 (0.05940647, 'achievement'),
 (0.056690086, 'places_visited'),
 (0.054688208, 'lost_vacation'),
 (0.051764738, 'personal_awards'),
 (0.0514455, 'todo_completed'),
 (0.049335826, 'daily_steps'),
 (0.04040269, 'live_vision'),
 (0.038032804, 'weekly_meditation'),
 (0.036173183, 'donation'),
 (0.035542045, 'social_network'),
 (0.031573713, 'daily_shouting'),
 (0.019689688, 'flow'),
 (0.016752811, 'sleep_hours')]

In [54]:
# Split our preprocessed data into our features and target arrays removing low importance
minimal_df = reduced_df.drop(['sleep_hours', 'flow', 'daily_shouting','social_network', 'todo_completed', 'weekly_meditation', 'achievement'], axis=1)

y = minimal_df["work_life_balance_score"].values
X = minimal_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
Counter(y_train)

Counter({'needs_improvement': 9770, 'excellent': 3006})

In [55]:
# Define model
# use maximum n_estimators
bst = XGBClassifier(n_estimators=128, max_depth=2, learning_rate=1, objective='binary:logistic')

# Fit model
bst.fit(X_train, y_train)
preds = bst.predict(X_test)

In [56]:
balanced_accuracy_score(y_test, preds)

0.8730848861283644

In [57]:
# Display the confusion matrix
cm = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,624,156
Actual needs_improvement,130,2285


In [58]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, preds))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.83      0.80      0.95      0.81      0.87      0.75       780
needs_improvement       0.94      0.95      0.80      0.94      0.87      0.77      2415

      avg / total       0.91      0.91      0.84      0.91      0.87      0.76      3195



In [59]:
# Calculate feature importance in the XGBoost model.
importances = bst.feature_importances_
importances

array([0.0507406 , 0.09136637, 0.07100479, 0.06064844, 0.12264714,
       0.05882475, 0.07132175, 0.07454678, 0.04984586, 0.19452673,
       0.09219467, 0.06233211], dtype=float32)

In [60]:
# List the features sorted in descending order by feature importance
features = sorted(zip(bst.feature_importances_, X.columns), reverse=True)
features

[(0.19452673, 'sufficient_income'),
 (0.122647144, 'supporting_others'),
 (0.09219467, 'personal_awards'),
 (0.091366366, 'daily_stress'),
 (0.074546784, 'live_vision'),
 (0.07132175, 'daily_steps'),
 (0.07100479, 'places_visited'),
 (0.06233211, 'time_for_passion'),
 (0.06064844, 'core_circle'),
 (0.058824748, 'donation'),
 (0.050740603, 'fruits_veggies'),
 (0.049845863, 'lost_vacation')]

In [61]:
# Split our preprocessed data into our features and target arrays removing low importance
min_df = minimal_df.drop(['daily_steps', 'core_circle', 'daily_stress', 'donation'], axis=1)

y = min_df["work_life_balance_score"].values
X = min_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
Counter(y_train)

Counter({'excellent': 3044, 'needs_improvement': 9732})

In [62]:
# Define model
# use maximum n_estimators
bst = XGBClassifier(n_estimators=128, max_depth=2, learning_rate=1, objective='binary:logistic')

# Fit model
bst.fit(X_train, y_train)
preds = bst.predict(X_test)

In [63]:
balanced_accuracy_score(y_test, preds)

0.8255442205649499

In [64]:
# Display the confusion matrix
cm = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,543,199
Actual needs_improvement,198,2255


In [65]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, preds))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.73      0.73      0.92      0.73      0.82      0.66       742
needs_improvement       0.92      0.92      0.73      0.92      0.82      0.69      2453

      avg / total       0.88      0.88      0.78      0.88      0.82      0.68      3195



In [66]:
# Calculate feature importance in the XGBoost model.
importances = bst.feature_importances_
importances

array([0.11717314, 0.09291126, 0.1900498 , 0.1336077 , 0.0891895 ,
       0.17726727, 0.09548921, 0.10431217], dtype=float32)

In [67]:
# List the features sorted in descending order by feature importance
features = sorted(zip(bst.feature_importances_, X.columns), reverse=True)
features

[(0.1900498, 'supporting_others'),
 (0.17726727, 'sufficient_income'),
 (0.1336077, 'live_vision'),
 (0.11717314, 'fruits_veggies'),
 (0.104312174, 'time_for_passion'),
 (0.09548921, 'personal_awards'),
 (0.09291126, 'places_visited'),
 (0.0891895, 'lost_vacation')]

In [68]:
# use flow and live_vision only
y = liveflow_df["work_life_balance_score"].values
X = liveflow_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
Counter(y_train)

Counter({'needs_improvement': 9739, 'excellent': 3037})

In [69]:
# Define model
# use maximum n_estimators
bst = XGBClassifier(n_estimators=128, max_depth=2, learning_rate=1, objective='binary:logistic')

# Fit model
bst.fit(X_train, y_train)
preds = bst.predict(X_test)

In [70]:
balanced_accuracy_score(y_test, preds)

0.6201926908267987

In [71]:
# Display the confusion matrix
cm = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,222,527
Actual needs_improvement,137,2309


In [72]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, preds))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.62      0.30      0.94      0.40      0.53      0.26       749
needs_improvement       0.81      0.94      0.30      0.87      0.53      0.30      2446

      avg / total       0.77      0.79      0.45      0.76      0.53      0.29      3195



In [73]:
# Split our preprocessed data into our 2 features and target arrays
support_income_df = reduced_df[['supporting_others', 'sufficient_income', 'work_life_balance_score']]

y = support_income_df["work_life_balance_score"].values
X = support_income_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
Counter(y_train)

Counter({'needs_improvement': 9742, 'excellent': 3034})

In [74]:
# Define model
# use maximum n_estimators
bst = XGBClassifier(n_estimators=128, max_depth=2, learning_rate=1, objective='binary:logistic')

# Fit model
bst.fit(X_train, y_train)
preds = bst.predict(X_test)

In [75]:
balanced_accuracy_score(y_test, preds)

0.7011832548053056

In [76]:
# Display the confusion matrix
cm = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "Predicted needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,Predicted needs_improvement
Actual excellent,390,362
Actual needs_improvement,284,2159


In [77]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, preds))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.58      0.52      0.88      0.55      0.68      0.44       752
needs_improvement       0.86      0.88      0.52      0.87      0.68      0.48      2443

      avg / total       0.79      0.80      0.60      0.79      0.68      0.47      3195

