<a href="https://colab.research.google.com/github/crystdang/DABC-Final-G5/blob/Crystina/notebooks/Output_BRF_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Cleaning and Binning Data**

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2023-02-02 01:01:43--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar.2’


2023-02-02 01:01:44 (5.16 MB/s) - ‘postgresql-42.2.16.jar.2’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [4]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://uoft-dabc-final-g5.s3.us-east-2.amazonaws.com/transformed_table.csv"
spark.sparkContext.addFile(url)
data_df = spark.read.csv(SparkFiles.get("transformed_table.csv"), sep=",", header=True, inferSchema=True)

In [5]:
data_df.show()

+-------------------+--------------+------------+--------------+-----------+-----------------+--------------+-----------+--------+---------+--------------+----+-----------+-----------+-----------+-------------+--------------+-----------------+---------------+----------------+-----------------+----------+------+-----------------------+
|               date|fruits_veggies|daily_stress|places_visited|core_circle|supporting_others|social_network|achievement|donation|bmi_range|todo_completed|flow|daily_steps|live_vision|sleep_hours|lost_vacation|daily_shouting|sufficient_income|personal_awards|time_for_passion|weekly_meditation|       age|gender|work_life_balance_score|
+-------------------+--------------+------------+--------------+-----------+-----------------+--------------+-----------+--------+---------+--------------+----+-----------+-----------+-----------+-------------+--------------+-----------------+---------------+----------------+-----------------+----------+------+--------------

In [6]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
import numpy as np
from pathlib import Path
from collections import Counter

In [7]:
url = 'https://uoft-dabc-final-g5.s3.us-east-2.amazonaws.com/transformed_table.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,date,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,bmi_range,...,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,age,gender,work_life_balance_score
0,2015-07-07,3,2,2,5,0,5,2,0,1,...,7,5,5,1,4,0,5,36 to 50,Female,609.5
1,2015-07-07,2,3,4,3,8,10,5,2,2,...,8,2,2,2,3,2,6,36 to 50,Female,655.6
2,2015-07-07,2,3,3,4,4,10,3,2,2,...,8,10,2,2,4,8,3,36 to 50,Female,631.6
3,2015-07-07,3,3,10,3,10,7,2,5,2,...,5,7,5,1,5,2,0,51 or more,Female,622.7
4,2015-07-07,5,1,3,3,10,4,2,4,2,...,7,0,0,2,8,1,5,51 or more,Female,663.9


In [8]:
# Drop the non-beneficial ID columns, 'date', 'bmi_range', 'age', 'gender'.
reduced_df = df.drop(['date', 'bmi_range', 'age', 'gender'], axis=1)
reduced_df.head()

Unnamed: 0,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,todo_completed,flow,daily_steps,live_vision,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,work_life_balance_score
0,3,2,2,5,0,5,2,0,6,4,5,0,7,5,5,1,4,0,5,609.5
1,2,3,4,3,8,10,5,2,5,2,5,5,8,2,2,2,3,2,6,655.6
2,2,3,3,4,4,10,3,2,2,2,4,5,8,10,2,2,4,8,3,631.6
3,3,3,10,3,10,7,2,5,3,5,5,0,5,7,5,1,5,2,0,622.7
4,5,1,3,3,10,4,2,4,5,0,5,0,7,0,0,2,8,1,5,663.9


In [9]:
# Determine the number of unique values in each column.
reduced_df.nunique()

fruits_veggies                6
daily_stress                  6
places_visited               11
core_circle                  11
supporting_others            11
social_network               11
achievement                  11
donation                      6
todo_completed               11
flow                         11
daily_steps                  10
live_vision                  11
sleep_hours                  10
lost_vacation                11
daily_shouting               11
sufficient_income             2
personal_awards              11
time_for_passion             11
weekly_meditation            11
work_life_balance_score    1696
dtype: int64

In [10]:
# Look at work_life_balance_score value counts for binning
WLB_count = reduced_df.work_life_balance_score.value_counts()
WLB_count

641.4    75
660.5    57
670.7    37
696.4    37
675.8    36
         ..
794.8     1
562.1     1
555.6     1
531.0     1
790.5     1
Name: work_life_balance_score, Length: 1696, dtype: int64

In [11]:
# Determine which values to replace if counts are less than ...?
replace_WLB = list(WLB_count.index)

# Replace in dataframe
for amt in replace_WLB:
    if amt <= 700:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("needs_improvement"))
    else:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("excellent"))

# Check to make sure binning was successful
reduced_df.work_life_balance_score.value_counts()

needs_improvement    12185
excellent             3786
Name: work_life_balance_score, dtype: int64

**Random Forest Classifier**

In [12]:
# Split our preprocessed data into our features and target arrays
y = reduced_df["work_life_balance_score"].values
X = reduced_df.drop(["work_life_balance_score"], axis=1)
# .values returns a numpy array, not a Pandas dataframe. An array does not have a columns attribute

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [13]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Resample the training data with the BalancedRandomForestClassifier
# Used Balanced to manage input data imbalance: https://imbalanced-learn.org/stable/references/generated/imblearn.ensemble.BalancedRandomForestClassifier.html
from imblearn.ensemble import BalancedRandomForestClassifier
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

In [15]:
# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.9398315931500172

In [16]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,needs_improvement
Actual excellent,926,21
Actual needs_improvement,299,2747


In [17]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.76      0.98      0.90      0.85      0.94      0.89       947
needs_improvement       0.99      0.90      0.98      0.94      0.94      0.88      3046

      avg / total       0.94      0.92      0.96      0.92      0.94      0.88      3993



In [18]:
# Calculate feature importance in the Random Forest model.
importances = clf.feature_importances_
importances


array([0.04646439, 0.03948106, 0.07258066, 0.0702032 , 0.10738067,
       0.02973986, 0.09513296, 0.04249159, 0.0692937 , 0.03436454,
       0.05122611, 0.05494553, 0.01687931, 0.03359228, 0.0291178 ,
       0.02495852, 0.07516504, 0.06313838, 0.04384439])

In [19]:
# List the features sorted in descending order by feature importance
features = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
features

[(0.10738067252267391, 'supporting_others'),
 (0.09513296106514967, 'achievement'),
 (0.07516503637036515, 'personal_awards'),
 (0.07258066367516036, 'places_visited'),
 (0.07020320030817044, 'core_circle'),
 (0.0692936993422422, 'todo_completed'),
 (0.06313838339265122, 'time_for_passion'),
 (0.05494553254354476, 'live_vision'),
 (0.051226112920385924, 'daily_steps'),
 (0.04646438642244319, 'fruits_veggies'),
 (0.043844389373568604, 'weekly_meditation'),
 (0.04249158867241939, 'donation'),
 (0.03948106457160489, 'daily_stress'),
 (0.0343645362032892, 'flow'),
 (0.033592279882269135, 'lost_vacation'),
 (0.029739860006606795, 'social_network'),
 (0.029117803010919267, 'daily_shouting'),
 (0.02495852319107609, 'sufficient_income'),
 (0.016879306525459714, 'sleep_hours')]

In [20]:
# Split our preprocessed data into our features and target arrays removing low importance
minimal_df = reduced_df.drop(['daily_stress', 'flow', 'lost_vacation', 'social_network', 'daily_shouting', 'sufficient_income', 'sleep_hours'], axis=1)

y = minimal_df["work_life_balance_score"].values
X = minimal_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [22]:
# Using 128 maximum n_estimators for better accuracy
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.9026831456560822

In [23]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,needs_improvement
Actual excellent,883,64
Actual needs_improvement,387,2659


In [24]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.70      0.93      0.87      0.80      0.90      0.82       947
needs_improvement       0.98      0.87      0.93      0.92      0.90      0.81      3046

      avg / total       0.91      0.89      0.92      0.89      0.90      0.81      3993



In [25]:
# Calculate feature importance in the Random Forest model.
importances = clf.feature_importances_

# List the features sorted in descending order by feature importance
features = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
features

[(0.11564106010872416, 'supporting_others'),
 (0.10939025982904808, 'achievement'),
 (0.10579896021181058, 'places_visited'),
 (0.08926834959790948, 'personal_awards'),
 (0.08784903922786051, 'todo_completed'),
 (0.0870881893230704, 'time_for_passion'),
 (0.08505967018679736, 'core_circle'),
 (0.07470178726137762, 'live_vision'),
 (0.07000190479233563, 'daily_steps'),
 (0.06203094626040235, 'weekly_meditation'),
 (0.058508483144500335, 'fruits_veggies'),
 (0.054661350056163446, 'donation')]

In [26]:
# Split our preprocessed data into our features and target arrays removing low importance
min_df = minimal_df.drop(['daily_steps', 'fruits_veggies', 'weekly_meditation','donation'], axis=1)

y = min_df["work_life_balance_score"].values
X = min_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [28]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.8666041846214434

In [29]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,needs_improvement
Actual excellent,847,100
Actual needs_improvement,491,2555


In [30]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.63      0.89      0.84      0.74      0.87      0.75       947
needs_improvement       0.96      0.84      0.89      0.90      0.87      0.75      3046

      avg / total       0.88      0.85      0.88      0.86      0.87      0.75      3993



In [31]:
# Calculate feature importance in the Random Forest model.
importances = clf.feature_importances_

# List the features sorted in descending order by feature importance
features = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
features

[(0.15700974819968694, 'supporting_others'),
 (0.13508537022472322, 'achievement'),
 (0.13372246200433613, 'places_visited'),
 (0.12627716664041042, 'todo_completed'),
 (0.11806302544927944, 'time_for_passion'),
 (0.11455458208645332, 'personal_awards'),
 (0.11346817589628982, 'core_circle'),
 (0.10181946949882084, 'live_vision')]

In [32]:
# Split our preprocessed data into our features and target arrays removing low importance
liveflow_df = reduced_df[['flow', 'live_vision', 'work_life_balance_score']]

y = liveflow_df["work_life_balance_score"].values
X = liveflow_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [34]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.725251528654957

In [36]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,needs_improvement
Actual excellent,694,253
Actual needs_improvement,860,2186


In [37]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.45      0.73      0.72      0.55      0.73      0.53       947
needs_improvement       0.90      0.72      0.73      0.80      0.73      0.53      3046

      avg / total       0.79      0.72      0.73      0.74      0.73      0.53      3993



In [38]:
# Split our preprocessed data into our 2 features and target arrays
support_achieve_df = reduced_df[['supporting_others', 'achievement', 'work_life_balance_score']]

y = support_achieve_df["work_life_balance_score"].values
X = support_achieve_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [39]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=128).fit(X_train_scaled, y_train)

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Calculated the balanced accuracy score
y_pred = clf.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.759616017960439

In [40]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,needs_improvement
Actual excellent,788,159
Actual needs_improvement,953,2093


**XGBoost**

In [41]:
# Split our preprocessed data into our features and target arrays
y = reduced_df["work_life_balance_score"].values
X = reduced_df.drop(["work_life_balance_score"], axis=1)

In [42]:
# Suggested by Ankush: https://xgboost.readthedocs.io/en/stable/index.html
from xgboost import XGBClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
Counter(y_train)

Counter({'needs_improvement': 9759, 'excellent': 3017})

In [43]:
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
bst.fit(X_train, y_train)
preds = bst.predict(X_test)

In [44]:
balanced_accuracy_score(y_test, preds)

0.7540370520059563

In [45]:
# Display the confusion matrix
cm = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(
    cm, index=["Actual excellent", "Actual needs_improvement"], columns=["Predicted excellent", "needs_improvement"])
cm_df

Unnamed: 0,Predicted excellent,needs_improvement
Actual excellent,482,287
Actual needs_improvement,288,2138


In [46]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, preds))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.63      0.63      0.88      0.63      0.74      0.54       769
needs_improvement       0.88      0.88      0.63      0.88      0.74      0.57      2426

      avg / total       0.82      0.82      0.69      0.82      0.74      0.56      3195



In [47]:
# Calculate feature importance in the XGBoost model.
importances = bst.feature_importances_
importances

array([0.        , 0.        , 0.1454096 , 0.04217844, 0.36627144,
       0.        , 0.09834345, 0.        , 0.22535226, 0.        ,
       0.        , 0.1224448 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ], dtype=float32)

In [48]:
# List the features sorted in descending order by feature importance
features = sorted(zip(bst.feature_importances_, X.columns), reverse=True)
features

[(0.36627144, 'supporting_others'),
 (0.22535226, 'todo_completed'),
 (0.1454096, 'places_visited'),
 (0.1224448, 'live_vision'),
 (0.09834345, 'achievement'),
 (0.042178437, 'core_circle'),
 (0.0, 'weekly_meditation'),
 (0.0, 'time_for_passion'),
 (0.0, 'sufficient_income'),
 (0.0, 'social_network'),
 (0.0, 'sleep_hours'),
 (0.0, 'personal_awards'),
 (0.0, 'lost_vacation'),
 (0.0, 'fruits_veggies'),
 (0.0, 'flow'),
 (0.0, 'donation'),
 (0.0, 'daily_stress'),
 (0.0, 'daily_steps'),
 (0.0, 'daily_shouting')]