## Preprocessing

In [1]:
# Imports here
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from scikeras.wrappers import KerasClassifier, KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Dataset here
# Read the processed reviews inside a context manager so the file handle is
# closed as soon as parsing finishes, instead of staying open for the whole
# kernel session.
with open("../dataset/processed_reviews.json", 'r', encoding='utf8') as file:
    dataset_dict = json.load(file)
df_raw = pd.DataFrame(dataset_dict)

In [24]:
# Start from a deep copy of the loaded frame so this cell can be re-run on its
# own without reloading the dataset. 'firm' and 'job_title' are discarded
# because one-hot encoding them would create too many features.
df = df_raw.copy(deep=True).drop(columns=['firm', 'job_title'])

# Derive a categorical month (kept as a string so it gets one-hot encoded below)
# and a numeric year from the review date.
df['date'] = pd.to_datetime(df['date_review'])
df['month'] = df['date'].dt.month.astype(str)
df['year'] = df['date'].dt.year

# Keep only the length of each free-text field, then discard the text itself.
for text_col in ('pros', 'cons'):
    df[text_col + '_length'] = df[text_col].apply(len)
df = df.drop(columns=['headline', 'pros', 'cons'])

# Binary flag: 1 when the reviewer is a 'Current Employee', 0 otherwise.
df['current'] = df['current'].eq('Current Employee').astype(int)

# Min-max normalisation of every int/float column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split performed later in the notebook, so the min/max statistics
# include the held-out rows.
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# One-hot encode the categorical columns, append the indicator columns, and
# remove the source columns together with the raw date fields.
categorical_cols = ['recommend', 'ceo_approv', 'outlook', 'month', 'duration']
one_hot_encoded = pd.get_dummies(df[categorical_cols])
df = pd.concat([df, one_hot_encoded], axis=1).drop(
    columns=['date', 'date_review'] + categorical_cols
)

In [25]:
print(df.columns)
df.head()

Index(['current', 'overall_rating', 'work_life_balance', 'culture_values',
       'career_opp', 'comp_benefits', 'senior_mgmt', 'year', 'pros_length',
       'cons_length', 'recommend_o', 'recommend_v', 'recommend_x',
       'ceo_approv_o', 'ceo_approv_r', 'ceo_approv_v', 'ceo_approv_x',
       'outlook_o', 'outlook_r', 'outlook_v', 'outlook_x', 'month_1',
       'month_10', 'month_11', 'month_12', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'duration_less than 1 year', 'duration_more than 1 year',
       'duration_more than 10 years', 'duration_more than 3 years',
       'duration_more than 5 years', 'duration_more than 8 years',
       'duration_not mentioned'],
      dtype='object')


Unnamed: 0,current,overall_rating,work_life_balance,culture_values,career_opp,comp_benefits,senior_mgmt,year,pros_length,cons_length,...,month_7,month_8,month_9,duration_less than 1 year,duration_more than 1 year,duration_more than 10 years,duration_more than 3 years,duration_more than 5 years,duration_more than 8 years,duration_not mentioned
0,1.0,0.25,0.5,0.0,0.25,0.0,0.75,0.538462,0.002257,0.011246,...,False,False,False,False,True,False,False,False,False,False
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615385,0.002753,0.020727,...,False,False,False,True,False,False,False,False,False,False
2,1.0,0.0,0.25,0.0,0.25,0.0,0.0,0.615385,0.001817,0.020541,...,False,False,False,False,True,False,False,False,False,False
3,1.0,0.5,0.75,0.25,0.25,0.5,0.25,0.615385,0.006607,0.009573,...,False,False,False,True,False,False,False,False,False,False
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615385,0.005726,0.022493,...,False,False,True,False,False,False,False,False,False,True


In [27]:
# Splitting the data: 'overall_rating' is the regression target, every other
# column is a predictor.
y = df['overall_rating']
X = df.drop(columns='overall_rating')

# Hold out 10% of the rows for testing; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (681651, 39) (681651,)
Testing set shape: (75740, 39) (75740,)


## Model Training

In [29]:
def buildReluNN():
    """Build and compile the feed-forward regression network.

    The network has four ReLU hidden layers (32, 16, 8 and 4 units) followed
    by a single sigmoid output unit. The input width is read from the
    notebook-level feature frame ``X``.

    Returns:
        A compiled Keras ``Sequential`` model using SGD (learning rate 0.3)
        with mean squared error as both the loss and the reported metric.
    """
    # First hidden layer also declares the input width.
    layers = [Dense(32, activation='relu', input_dim=X.shape[1])]
    # Remaining hidden layers halve in width at each step.
    layers += [Dense(units, activation='relu') for units in (16, 8, 4)]
    # Sigmoid keeps the single output inside [0, 1].
    layers.append(Dense(1, activation='sigmoid'))

    model = Sequential(layers)
    model.compile(optimizer=SGD(learning_rate=0.3), loss='mse', metrics=['MSE'])
    return model

# Wrap the Keras model in a scikit-learn style regressor. random_state seeds
# the otherwise unseeded training run so that Restart & Run All yields
# repeatable metrics.
estimator = KerasRegressor(
    model=buildReluNN, epochs=10, batch_size=1000, verbose=0, random_state=0
)

# The held-out set is also passed as validation data; no callback uses it here.
# X_test is cast to float for validation_data, presumably because its boolean
# one-hot columns are not converted on that path the way the training inputs
# are -- TODO confirm.
# NOTE(review): scikeras follows the scikit-learn convention of returning the
# estimator from fit(), so `history` is presumably the estimator itself; the
# per-epoch metrics should be available as estimator.history_ -- confirm.
history = estimator.fit(X_train, y_train, validation_data=(X_test.astype('float'), y_test))

# Predictions for both partitions; the test predictions are displayed as a frame.
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
pd.DataFrame(y_test_pred)

Unnamed: 0,0
0,0.232726
1,0.498747
2,0.694679
3,0.948616
4,0.333123
...,...
75735,0.698247
75736,0.919499
75737,0.914638
75738,0.941296


## Model Evaluation

In [30]:
# Mean squared error on the held-out set.
# Both operands are flattened to plain 1-D arrays so the subtraction is purely
# positional, and the mean is computed vectorised by NumPy instead of looping
# over every element with Python's builtin sum().
squared_errors = (np.ravel(y_test_pred) - np.asarray(y_test)) ** 2
MSE = squared_errors.mean()
print(MSE)

0.0300858729102788


In [31]:
# Fraction of Variance Unexplained: the test-set MSE divided by the variance
# of the test targets (lower is better).
target_variance = np.var(y_test)
FVU = MSE / target_variance
print(FVU)

0.34820798785021984
