In [2]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=7b39757ee010b215cee4ef7acf14fe960589f7966d2c3395e227065472dc8a37
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [3]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2023-02-07 00:30:25--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2023-02-07 00:30:27 (1.18 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [5]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://uoft-dabc-final-g5.s3.us-east-2.amazonaws.com/transformed_table.csv"
spark.sparkContext.addFile(url)
data_df = spark.read.csv(SparkFiles.get("transformed_table.csv"), sep=",", header=True, inferSchema=True)

In [6]:
data_df.show()

+-------------------+--------------+------------+--------------+-----------+-----------------+--------------+-----------+--------+---------+--------------+----+-----------+-----------+-----------+-------------+--------------+-----------------+---------------+----------------+-----------------+----------+------+-----------------------+
|               date|fruits_veggies|daily_stress|places_visited|core_circle|supporting_others|social_network|achievement|donation|bmi_range|todo_completed|flow|daily_steps|live_vision|sleep_hours|lost_vacation|daily_shouting|sufficient_income|personal_awards|time_for_passion|weekly_meditation|       age|gender|work_life_balance_score|
+-------------------+--------------+------------+--------------+-----------+-----------------+--------------+-----------+--------+---------+--------------+----+-----------+-----------+-----------+-------------+--------------+-----------------+---------------+----------------+-----------------+----------+------+--------------

In [7]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.metrics import classification_report_imbalanced
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn import tree

In [8]:
df = pd.read_csv(url)
df.head()

Unnamed: 0,date,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,bmi_range,...,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,age,gender,work_life_balance_score
0,2015-07-07,3,2,2,5,0,5,2,0,1,...,7,5,5,1,4,0,5,36 to 50,Female,609.5
1,2015-07-07,2,3,4,3,8,10,5,2,2,...,8,2,2,2,3,2,6,36 to 50,Female,655.6
2,2015-07-07,2,3,3,4,4,10,3,2,2,...,8,10,2,2,4,8,3,36 to 50,Female,631.6
3,2015-07-07,3,3,10,3,10,7,2,5,2,...,5,7,5,1,5,2,0,51 or more,Female,622.7
4,2015-07-07,5,1,3,3,10,4,2,4,2,...,7,0,0,2,8,1,5,51 or more,Female,663.9


In [9]:
# Drop the non-beneficial ID columns, 'date', 'bmi_range', 'age', 'gender'.
reduced_df = df.drop(['date', 'bmi_range', 'age', 'gender'], axis=1)
reduced_df.head()

Unnamed: 0,fruits_veggies,daily_stress,places_visited,core_circle,supporting_others,social_network,achievement,donation,todo_completed,flow,daily_steps,live_vision,sleep_hours,lost_vacation,daily_shouting,sufficient_income,personal_awards,time_for_passion,weekly_meditation,work_life_balance_score
0,3,2,2,5,0,5,2,0,6,4,5,0,7,5,5,1,4,0,5,609.5
1,2,3,4,3,8,10,5,2,5,2,5,5,8,2,2,2,3,2,6,655.6
2,2,3,3,4,4,10,3,2,2,2,4,5,8,10,2,2,4,8,3,631.6
3,3,3,10,3,10,7,2,5,3,5,5,0,5,7,5,1,5,2,0,622.7
4,5,1,3,3,10,4,2,4,5,0,5,0,7,0,0,2,8,1,5,663.9


In [10]:
# Determine the number of unique values in each column.
reduced_df.nunique()

fruits_veggies                6
daily_stress                  6
places_visited               11
core_circle                  11
supporting_others            11
social_network               11
achievement                  11
donation                      6
todo_completed               11
flow                         11
daily_steps                  10
live_vision                  11
sleep_hours                  10
lost_vacation                11
daily_shouting               11
sufficient_income             2
personal_awards              11
time_for_passion             11
weekly_meditation            11
work_life_balance_score    1696
dtype: int64

In [11]:
# Look at work_life_balance_score value counts for binning
WLB_count = reduced_df.work_life_balance_score.value_counts()
WLB_count

641.4    75
660.5    57
670.7    37
696.4    37
675.8    36
         ..
794.8     1
562.1     1
555.6     1
531.0     1
790.5     1
Name: work_life_balance_score, Length: 1696, dtype: int64

In [12]:
# Determine which values to replace if counts are less than ...?
replace_WLB = list(WLB_count.index)

# Replace in dataframe
for amt in replace_WLB:
    if amt <= 700:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("needs_improvement"))
    else:
        reduced_df.work_life_balance_score = reduced_df.work_life_balance_score.replace(amt,str("excellent"))

# Check to make sure binning was successful
reduced_df.work_life_balance_score.value_counts()

needs_improvement    12185
excellent             3786
Name: work_life_balance_score, dtype: int64

In [13]:
# Split our preprocessed data into our features and target arrays
y = reduced_df["work_life_balance_score"].values
X = reduced_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [14]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Resample the training data with Decision Tree
model = tree.DecisionTreeClassifier().fit(X_train_scaled, y_train)

In [16]:
# Calculated the balanced accuracy score
pred = model.predict(X_test_scaled)
accuracy_score(y_test, pred)

0.8680190333082896

In [17]:
# Display the confusion matrix
cm = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Excellent", "Actual Needs Improvement"], columns=["Predicted Excellent", "Predicted Needs Improvement"])
cm_df

Unnamed: 0,Predicted Excellent,Predicted Needs Improvement
Actual Excellent,710,237
Actual Needs Improvement,290,2756


In [18]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.71      0.75      0.90      0.73      0.82      0.67       947
needs_improvement       0.92      0.90      0.75      0.91      0.82      0.69      3046

      avg / total       0.87      0.87      0.79      0.87      0.82      0.68      3993



In [19]:
# Rerunning Model with features noted as 'important' from RandomForestClassifier
feature_drop_df = reduced_df.drop(['todo_completed', 'time_for_passion', 'live_vision', 'daily_steps','fruits_veggies','weekly_meditation','donation','daily_stress','flow','lost_vacation','social_network','daily_shouting','sufficient_income','sleep_hours'], axis=1)
feature_drop_df.head()

Unnamed: 0,places_visited,core_circle,supporting_others,achievement,personal_awards,work_life_balance_score
0,2,5,0,2,4,needs_improvement
1,4,3,8,5,3,needs_improvement
2,3,4,4,3,4,needs_improvement
3,10,3,10,2,5,needs_improvement
4,3,3,10,2,8,needs_improvement


In [20]:
# Split our preprocessed data into our features and target arrays
y = feature_drop_df["work_life_balance_score"].values
X = feature_drop_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [21]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
# Resample the training data with Decision Tree
model = tree.DecisionTreeClassifier().fit(X_train_scaled, y_train)

In [23]:
# Calculate the balanced accuracy score
pred = model.predict(X_test_scaled)
accuracy_score(y_test, pred)

0.7948910593538693

In [24]:
# Display the confusion matrix
cm = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Excellent", "Actual Needs Improvement"], columns=["Predicted Excellent", "Predicted Needs Improvement"])
cm_df

Unnamed: 0,Predicted Excellent,Predicted Needs Improvement
Actual Excellent,602,345
Actual Needs Improvement,474,2572


In [25]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.56      0.64      0.84      0.60      0.73      0.53       947
needs_improvement       0.88      0.84      0.64      0.86      0.73      0.55      3046

      avg / total       0.81      0.79      0.69      0.80      0.73      0.54      3993



In [28]:
# Rerunning Model with top 2 features noted as 'important' from RandomForestClassifier
top_2_df = feature_drop_df.drop(['places_visited', 'core_circle', 'personal_awards'], axis=1)
top_2_df.head()

Unnamed: 0,supporting_others,achievement,work_life_balance_score
0,0,2,needs_improvement
1,8,5,needs_improvement
2,4,3,needs_improvement
3,10,2,needs_improvement
4,10,2,needs_improvement


In [29]:
# Split our preprocessed data into our features and target arrays
y = top_2_df["work_life_balance_score"].values
X = top_2_df.drop(["work_life_balance_score"], axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, stratify=y)
Counter(y_train)

Counter({'needs_improvement': 9139, 'excellent': 2839})

In [30]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [31]:
# Resample the training data with Decision Tree
model = tree.DecisionTreeClassifier().fit(X_train_scaled, y_train)

In [32]:
# Calculate the balanced accuracy score
pred = model.predict(X_test_scaled)
accuracy_score(y_test, pred)

0.8176809416478839

In [33]:
# Display the confusion matrix
cm = confusion_matrix(y_test, pred)
cm_df = pd.DataFrame(
    cm, index=["Actual Excellent", "Actual Needs Improvement"], columns=["Predicted Excellent", "Predicted Needs Improvement"])
cm_df

Unnamed: 0,Predicted Excellent,Predicted Needs Improvement
Actual Excellent,458,489
Actual Needs Improvement,239,2807


In [34]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, pred))

                         pre       rec       spe        f1       geo       iba       sup

        excellent       0.66      0.48      0.92      0.56      0.67      0.43       947
needs_improvement       0.85      0.92      0.48      0.89      0.67      0.47      3046

      avg / total       0.81      0.82      0.59      0.81      0.67      0.46      3993

