## Preprocessing


In [1]:
# Imports here
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error, r2_score
from scikeras.wrappers import KerasClassifier, KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
# Dataset here
file = open("../dataset/processed_reviews.json", 'r', encoding='utf8')
dataset_dict = json.load(file)
df_raw = pd.DataFrame(dataset_dict)

In [3]:
print(df_raw.dtypes)
df_raw.head(10)

firm                 object
date_review          object
job_title            object
current              object
overall_rating        int64
work_life_balance     int64
culture_values        int64
career_opp            int64
comp_benefits         int64
senior_mgmt           int64
recommend            object
ceo_approv           object
outlook              object
headline             object
pros                 object
cons                 object
duration             object
dtype: object


Unnamed: 0,firm,date_review,job_title,current,overall_rating,work_life_balance,culture_values,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons,duration
0,AFH-Wealth-Management,2015-12-11,Office Administrator,Current Employee,2,3,1,2,1,4,x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...,more than 1 year
1,AFH-Wealth-Management,2016-01-28,Office Administrator,Current Employee,1,1,1,1,1,1,x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very...",less than 1 year
2,AFH-Wealth-Management,2016-04-23,Office Administrator,Current Employee,1,2,1,2,1,1,x,o,x,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr...",more than 1 year
3,AFH-Wealth-Management,2016-05-26,Office Administrator,Current Employee,3,4,2,2,3,2,o,r,r,Office administrator,Some good people to work with. Flexible worki...,Morale. Lack of managerial structure. Doesn'...,less than 1 year
4,AFH-Wealth-Management,2016-09-23,IFA,Former Employee,1,1,1,1,1,1,x,o,r,It horrible management,Good investment management strategy. Overall t...,The management and seniors are ruthless. No tr...,not mentioned
5,AFH-Wealth-Management,2016-09-25,Anonymous Employee,Current Employee,5,5,5,5,4,5,v,o,v,Good place to work,The people are great and the culture is very f...,Wouldn't necessarily say there are any cons to...,more than 5 years
6,AFH-Wealth-Management,2016-11-03,Anonymous Employee,Former Employee,4,4,4,4,4,4,v,o,v,I liked working for AFH,"Nice Staff, good HR Team. Feels vibrant and fo...",Can't really think of any obvious cons,more than 1 year
7,AFH-Wealth-Management,2017-02-21,Technician,Former Employee,1,1,1,1,3,1,x,x,x,Honest Review,Made some life time friends.,Was let go from the company just before Christ...,more than 3 years
8,AFH-Wealth-Management,2017-03-06,Administrative Support,Current Employee,1,3,1,2,1,1,x,x,x,Avoid at all cost,"I can't think of any obvious ones, although I'...",Disgustingly low wages. Staff are not valued. ...,less than 1 year
9,AFH-Wealth-Management,2017-03-10,Administrative Support,Former Employee,1,2,1,2,2,1,x,x,x,Worst place I have ever worked,Can do compressed hours if you qualify for the...,"Unsupportive HR team, worst I have ever come a...",more than 1 year


In [4]:
# Unused features
df = df_raw.copy(deep=True)
df.drop(columns=['firm','job_title'], inplace=True) #one hotting these would create too many features

# Split up Date
df['date'] = pd.to_datetime(df['date_review'])
df['month'] = df['date'].dt.month.astype(str)
df['year'] = df['date'].dt.year

# Encode 'current' as int
df['current'] = (df['current'] == 'Current Employee').astype(int)

# Min-max normalization
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Keep 'overall_rating' categorical for classification purpose
rating_to_class = {0: 0, 0.25: 1, 0.5: 2, 0.75: 3, 1: 4}
df['overall_rating'] = df['overall_rating'].map(rating_to_class)


# Text vectorization for "headline", "pros", and "cons"
tfidf_vectorizer = TfidfVectorizer(max_features=10, stop_words='english')

# Ensure that every entry is string for vectorization
df['headline'] = df['headline'].astype(str)
df['pros'] = df['pros'].astype(str)
df['cons'] = df['cons'].astype(str)

headline_tfidf = tfidf_vectorizer.fit_transform(df['headline'])
pros_tfidf = tfidf_vectorizer.fit_transform(df['pros'])
cons_tfidf = tfidf_vectorizer.fit_transform(df['cons'])

headline_features_df = pd.DataFrame(headline_tfidf.toarray(), columns=["headline" + feature for feature in tfidf_vectorizer.get_feature_names_out()])
pros_features_df = pd.DataFrame(pros_tfidf.toarray(), columns=["pros" + feature for feature in tfidf_vectorizer.get_feature_names_out()])
cons_features_df = pd.DataFrame(cons_tfidf.toarray(), columns=["cons" + feature for feature in tfidf_vectorizer.get_feature_names_out()])

#One hot encode and concat everything
one_hot_encoded = pd.get_dummies(df[['recommend', 'ceo_approv', 'outlook', 'month', 'duration']])
df.drop(columns=['date', 'date_review', 'recommend', 'ceo_approv', 'outlook', 'month', 'duration', 'headline', 'pros', 'cons'], inplace=True)
df = pd.concat([df, one_hot_encoded, headline_features_df, pros_features_df, cons_features_df], axis=1)
print(df.columns)
df.head()

Index(['current', 'overall_rating', 'work_life_balance', 'culture_values',
       'career_opp', 'comp_benefits', 'senior_mgmt', 'year', 'recommend_o',
       'recommend_v', 'recommend_x', 'ceo_approv_o', 'ceo_approv_r',
       'ceo_approv_v', 'ceo_approv_x', 'outlook_o', 'outlook_r', 'outlook_v',
       'outlook_x', 'month_1', 'month_10', 'month_11', 'month_12', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       'month_9', 'duration_less than 1 year', 'duration_more than 1 year',
       'duration_more than 10 years', 'duration_more than 3 years',
       'duration_more than 5 years', 'duration_more than 8 years',
       'duration_not mentioned', 'headlinecompany', 'headlinehours',
       'headlinelife', 'headlinelong', 'headlinemanagement', 'headlinepay',
       'headlinepeople', 'headlinetime', 'headlinework', 'headlineworking',
       'proscompany', 'proshours', 'proslife', 'proslong', 'prosmanagement',
       'prospay', 'prospeople', 'prostime'

Unnamed: 0,current,overall_rating,work_life_balance,culture_values,career_opp,comp_benefits,senior_mgmt,year,recommend_o,recommend_v,...,conscompany,conshours,conslife,conslong,consmanagement,conspay,conspeople,constime,conswork,consworking
0,1.0,1,0.5,0.0,0.25,0.0,0.75,0.538462,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0,0.0,0.0,0.0,0.0,0.0,0.615385,False,False,...,0.0,0.0,0.0,0.0,0.636924,0.0,0.0,0.0,0.0,0.770927
2,1.0,0,0.25,0.0,0.25,0.0,0.0,0.615385,False,False,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,2,0.75,0.25,0.25,0.5,0.25,0.615385,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0,0.0,0.0,0.0,0.0,0.0,0.615385,False,False,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#Spliting the data
X = df.drop(columns=['overall_rating'])
y = df['overall_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (681651, 67) (681651,)
Testing set shape: (75740, 67) (75740,)


In [6]:
# Training
# svm = SVC(kernel = 'linear')
# svm.fit(X_train,y_train)
# yhat_test = svm.predict(X_test)

# Normal linear SVM takes too long as the runtime is O(n_features * n_samples^2)
# So, we use the optimized version with a max_iter
from sklearn.svm import LinearSVC
svm = LinearSVC(dual=False, max_iter=10000000)
svm.fit(X_train, y_train)
yhat_test = svm.predict(X_test)
yhat_train = svm.predict(X_train)

In [7]:
def my_eval(name, y, yhat, norm=1):
    y_norm = np.array(y) / norm
    yhat_norm = yhat / norm

    MSE = sum((yhat_norm - y_norm)**2)/y_norm.size
    print(f"{name} MSE: {MSE}")

    FVU = MSE / np.var(y_norm)
    print(f"{name} FVU: {FVU}")

    acc = np.sum(y == yhat) / y.size
    print(f"{name} Accuracy: {acc}")

    con_mat = confusion_matrix(y, yhat)
    print(con_mat)

    TP = np.diag(con_mat)
    FP, FN = con_mat.sum(axis=0) - TP, con_mat.sum(axis=1) - TP
    TN = np.array([len(yhat)] * con_mat.shape[1]) - (TP + FP + FN)
    prec_class = TP / (TP + FP)
    recall_class = TP / (TP + FN)

    print("True Positives: %s" % (TP))
    print("True Negatives: %s" % (TN))
    print("False Positives: %s" % (FP))
    print("False Negatives: %s" % (FN))
    print("Precision Per Class: %s" % (prec_class))
    print("Recall Per Class: %s" % (recall_class))
    print("Total: %s\n" % (np.sum([TP, TN, FP, FN], axis=0)))

In [8]:
# Train set evaluation.
my_eval("Train set", y_train, yhat_train, norm=4)

Train set MSE: 0.043614694323048014
Train set FVU: 0.5025392906181483
Train set Accuracy: 0.5426442563716624
[[ 32071   3202   9193   3172    625]
 [ 16067   8913  25338   8965    858]
 [  5904   9160  48147  82751   9054]
 [   519   1310  12317 145251  66279]
 [   303    233   2823  53684 135512]]
True Positives: [ 32071   8913  48147 145251 135512]
True Negatives: [610595 607605 476964 307403 412280]
False Positives: [ 22793  13905  49671 148572  76816]
False Negatives: [ 16192  51228 106869  80425  57043]
Precision Per Class: [0.58455453 0.39061267 0.49221002 0.49434864 0.63822011]
Recall Per Class: [0.6645049  0.14820173 0.31059375 0.64362626 0.70375737]
Total: [681651 681651 681651 681651 681651]



In [9]:
# Test set evaluation.
my_eval("Test set", y_test, yhat_test, norm=4)

Test set MSE: 0.0439414444151043
Test set FVU: 0.508569652894807
Test set Accuracy: 0.5408502772643253
[[ 3527   345  1001   360    72]
 [ 1791   974  2816   960    79]
 [  640  1055  5396  9287  1032]
 [   65   154  1442 16039  7286]
 [   50    27   345  5969 15028]]
True Positives: [ 3527   974  5396 16039 15028]
True Negatives: [67889 67539 52726 34178 45852]
False Positives: [ 2546  1581  5604 16576  8469]
False Negatives: [ 1778  5646 12014  8947  6391]
Precision Per Class: [0.58076733 0.38121331 0.49054545 0.49176759 0.63957101]
Recall Per Class: [0.66484449 0.14712991 0.30993682 0.64191947 0.70162006]
Total: [75740 75740 75740 75740 75740]



In [40]:
# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
def my_tuner(X, y, test):
    hypers = {'C': [0.1, 1, 10, 100, 1000],  
                'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
                'kernel': ['rbf']}

    hyper_search = GridSearchCV(SVC(), hypers, refit=True, verbose=0)
    hyper_search.fit(X, y)
    return hyper_search.predict(test)

In [38]:
hyper_yhat_test = my_tuner(X_train, y_train, X_test)
my_eval("Best Model Test set", y_test, hyper_yhat_test, norm=4)

Best Model Test set MSE: 0.03418108001056245
Best Model Test set FVU: 0.3956051110273934
Best Model Test set Accuracy: 0.6421837866385002
[[ 3933   277   754   285    56]
 [ 1396  2221  2199   741    63]
 [  492   811  8010  7290   807]
 [   47   122  1144 18029  5644]
 [   40    23   263  4647 16446]]
True Positives: [ 3933  2221  8010 18029 16446]
True Negatives: [68460 67887 53970 37791 47751]
False Positives: [ 1975  1233  4360 12963  6570]
False Negatives: [1372 4399 9400 6957 4973]
Precision Per Class: [0.66570752 0.64302258 0.64753436 0.58173077 0.7145464 ]
Recall Per Class: [0.74137606 0.33549849 0.46008041 0.72156408 0.76782296]
Total: [75740 75740 75740 75740 75740]

