## Preprocessing

In [5]:
# Imports here
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from scikeras.wrappers import KerasClassifier, KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Dataset here
file = open("../dataset/processed_reviews.json", 'r', encoding='utf8')
dataset_dict = json.load(file)
df_raw = pd.DataFrame(dataset_dict)

In [7]:
# Unused features
df = df_raw.copy(deep=True) #Do this so that I dont have to rerun the previous cell every time I make a change
df.drop(columns=['firm','job_title'], inplace=True) #one hotting these would create too many features

# Split up Date
df['date'] = pd.to_datetime(df['date_review'])
df['month'] = df['date'].dt.month.astype(str)
df['year'] = df['date'].dt.year

# Consider the length text inputs
df['pros_length'] = df['pros'].apply(lambda x: len(x))
df['cons_length'] = df['cons'].apply(lambda x: len(x))
df.drop(columns=['headline', 'pros', 'cons'], inplace=True)

# Encode 'current' as int
df['current'] = (df['current'] == 'Current Employee').astype(int)

# Min-max normalization
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

#One hot encode
one_hot_encoded = pd.get_dummies(df[['recommend', 'ceo_approv', 'outlook', 'month', 'duration']])
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop(columns=['date', 'date_review', 'recommend', 'ceo_approv', 'outlook', 'month', 'duration'], inplace=True)

In [8]:
print(df.columns)
df.head()

Index(['current', 'overall_rating', 'work_life_balance', 'culture_values',
       'career_opp', 'comp_benefits', 'senior_mgmt', 'year', 'pros_length',
       'cons_length', 'recommend_o', 'recommend_v', 'recommend_x',
       'ceo_approv_o', 'ceo_approv_r', 'ceo_approv_v', 'ceo_approv_x',
       'outlook_o', 'outlook_r', 'outlook_v', 'outlook_x', 'month_1',
       'month_10', 'month_11', 'month_12', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'duration_less than 1 year', 'duration_more than 1 year',
       'duration_more than 10 years', 'duration_more than 3 years',
       'duration_more than 5 years', 'duration_more than 8 years',
       'duration_not mentioned'],
      dtype='object')


Unnamed: 0,current,overall_rating,work_life_balance,culture_values,career_opp,comp_benefits,senior_mgmt,year,pros_length,cons_length,...,month_7,month_8,month_9,duration_less than 1 year,duration_more than 1 year,duration_more than 10 years,duration_more than 3 years,duration_more than 5 years,duration_more than 8 years,duration_not mentioned
0,1.0,0.25,0.5,0.0,0.25,0.0,0.75,0.538462,0.002257,0.011246,...,False,False,False,False,True,False,False,False,False,False
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615385,0.002753,0.020727,...,False,False,False,True,False,False,False,False,False,False
2,1.0,0.0,0.25,0.0,0.25,0.0,0.0,0.615385,0.001817,0.020541,...,False,False,False,False,True,False,False,False,False,False
3,1.0,0.5,0.75,0.25,0.25,0.5,0.25,0.615385,0.006607,0.009573,...,False,False,False,True,False,False,False,False,False,False
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615385,0.005726,0.022493,...,False,False,True,False,False,False,False,False,False,True


In [9]:
#Spliting the data
X = df.drop(columns=['overall_rating'])
y = df['overall_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (681651, 39) (681651,)
Testing set shape: (75740, 39) (75740,)


## Model Training

In [43]:
import tensorflow as tf

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

In [50]:
def buildReluNN():
    model = Sequential([
        Dense(32, activation = 'relu', input_dim = X.shape[1]),
        Dense(16, activation = 'relu'),
        Dense(8, activation = 'relu'),
        Dense(4, activation = 'relu'),
        Dense(1, activation = 'sigmoid'),
    ])
    optimizer = SGD(learning_rate=0.1)
    model.compile(optimizer=optimizer, loss='mse', metrics=['MSE'])
    return(model)

estimator = KerasRegressor(model=buildReluNN, epochs=200, batch_size=1000, verbose=1)
history = estimator.fit(X_train, y_train, validation_data=(X_test.astype('float'), y_test))
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
pd.DataFrame(y_test_pred)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Unnamed: 0,0
0,0.241834
1,0.488427
2,0.653242
3,0.942512
4,0.297351
...,...
75735,0.690084
75736,0.928053
75737,0.914464
75738,0.943409


## Model Eval

In [52]:
MSE = sum((y_test_pred - y_test)**2)/y_test.size
print(MSE)

0.030162438788755852


In [53]:
# Fraction of Variance Unexplained
FVU = MSE/np.var(y_test)
print(FVU)

0.34909416375872704


## Model Tuning

In [None]:
import keras_tuner

In [27]:
def buildHPmodel(hp):
  model= Sequential([
      Dense(32, activation = 'relu', input_dim = X.shape[1]),
      Dense(
              units=hp.Int("units", min_value=8, max_value=24, step=4),
              activation= 'relu',
      ),
      Dense(
              units=hp.Int("units", min_value=4, max_value=16, step=4),
              activation= 'relu',
      ),
      Dense(
              units=hp.Int("units", min_value=1, max_value=8, step=4),
              activation= 'relu',
      ),
      Dense(1, activation = 'sigmoid')
  ])

  optimizer = SGD(learning_rate=0.3)
  model.compile(optimizer=optimizer, loss='mse', metrics=['MSE'])
  return model

In [32]:
hp = keras_tuner.HyperParameters()

tuner = keras_tuner.GridSearch(
    hypermodel=buildHPmodel,
    objective="val_loss",
    max_trials=10,
    seed=0,
    executions_per_trial=1,
    hyperparameters=hp,
    tune_new_entries=True,
    allow_new_entries=True,
    max_consecutive_failed_trials=3,
    overwrite=True
)

X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)
tuner.search(X_train, y_train, epochs=5, validation_data=(X_test, y_test), verbose = 1)

Trial 9 Complete [00h 01m 21s]
val_loss: 0.02869807370007038

Best val_loss So Far: 0.02869807370007038
Total elapsed time: 00h 12m 06s


In [33]:
tuner.results_summary()

Results summary
Results in .\untitled_project
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 08 summary
Hyperparameters:
units: 24
Score: 0.02869807370007038

Trial 03 summary
Hyperparameters:
units: 14
Score: 0.028774499893188477

Trial 07 summary
Hyperparameters:
units: 22
Score: 0.028802407905459404

Trial 00 summary
Hyperparameters:
units: 8
Score: 0.02884363941848278

Trial 01 summary
Hyperparameters:
units: 10
Score: 0.02884863130748272

Trial 04 summary
Hyperparameters:
units: 16
Score: 0.02886430360376835

Trial 05 summary
Hyperparameters:
units: 18
Score: 0.02886887826025486

Trial 02 summary
Hyperparameters:
units: 12
Score: 0.028917666524648666

Trial 06 summary
Hyperparameters:
units: 20
Score: 0.028966402634978294


## Hyperparameter Tuning - Random Search

In [None]:
def HPmodel(hp):
    model = Sequential([
        Dense(hp.Int("units", min_value=32, max_value=64, step=2), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"]), input_dim = X.shape[1]),
        Dense(hp.Int("units", min_value=32, max_value=256, step=8), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"])),
        Dense(hp.Int("units", min_value=32, max_value=128, step=4), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"])),
        Dense(hp.Int("units", min_value=32, max_value=64, step=4), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"])),
        Dense(1, activation = 'sigmoid'),
    ])

    learning_rate = hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='log')
    optimizer = SGD(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['MSE'])
    return(model)

In [None]:
tuner = keras_tuner.RandomSearch(
    HPmodel,
    overwrite = True,
    objective='val_loss',
    max_trials = 20,
    max_consecutive_failed_trials=3
)
tuner.search(np.array(X_train).astype('float32'), y_train, epochs=5, validation_data=(np.array(X_test).astype('float32'), y_test))

### Model Eval

In [None]:
model = tuner.get_best_models()[0]
print(model.summary())
y_pred = model.predict(np.array(X_test).astype('float32'))

In [None]:
y_test_pred = y_pred.reshape(75740,)
MSE = sum((y_test_pred - y_test)**2)/y_test.size
print(MSE)

In [None]:
FVU = MSE/np.var(y_test)
print(FVU)