## Preprocessing

In [18]:
# Imports here
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from scikeras.wrappers import KerasClassifier, KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import keras_tuner
import tensorflow as tf

In [2]:
# Dataset here
file = open("../dataset/processed_reviews.json", 'r', encoding='utf8')
dataset_dict = json.load(file)
df_raw = pd.DataFrame(dataset_dict)

In [3]:
# Unused features
df = df_raw.copy(deep=True) #Do this so that I dont have to rerun the previous cell every time I make a change
df.drop(columns=['firm','job_title'], inplace=True) #one hotting these would create too many features

# Split up Date
df['date'] = pd.to_datetime(df['date_review'])
df['month'] = df['date'].dt.month.astype(str)
df['year'] = df['date'].dt.year

# Consider the length text inputs
df['pros_length'] = df['pros'].apply(lambda x: len(x))
df['cons_length'] = df['cons'].apply(lambda x: len(x))
df.drop(columns=['headline', 'pros', 'cons'], inplace=True)

# Encode 'current' as int
df['current'] = (df['current'] == 'Current Employee').astype(int)

# Min-max normalization
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

#One hot encode
one_hot_encoded = pd.get_dummies(df[['recommend', 'ceo_approv', 'outlook', 'month', 'duration']])
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop(columns=['date', 'date_review', 'recommend', 'ceo_approv', 'outlook', 'month', 'duration'], inplace=True)

In [4]:
#Spliting the data
X = df.drop(columns=['overall_rating'])
y = df['overall_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (681651, 39) (681651,)
Testing set shape: (75740, 39) (75740,)


## Hyperparameter Tuning - Random Search

In [5]:
def HPmodel(hp):
    model = Sequential([
        Dense(hp.Int("units", min_value=32, max_value=64, step=2), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"]), input_dim = X.shape[1]),
        Dense(hp.Int("units", min_value=32, max_value=256, step=8), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"])),
        Dense(hp.Int("units", min_value=32, max_value=128, step=4), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"])),
        Dense(hp.Int("units", min_value=32, max_value=64, step=4), activation = hp.Choice("activation", ["relu", "sigmoid", "softmax", "tanh"])),
        Dense(1, activation = 'sigmoid'),
    ])

    learning_rate = hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='log')
    optimizer = SGD(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['MSE'])
    return(model)

In [7]:
tuner = keras_tuner.RandomSearch(
    HPmodel,
    overwrite = True,
    objective='val_loss',
    max_trials = 20,
    max_consecutive_failed_trials=3
)
tuner.search(np.array(X_train).astype('float32'), y_train, epochs=5, validation_data=(np.array(X_test).astype('float32'), y_test))

Trial 20 Complete [00h 04m 38s]
val_loss: 0.0335109680891037

Best val_loss So Far: 0.030189480632543564
Total elapsed time: 01h 47m 27s


### Model Eval

In [8]:
model = tuner.get_best_models()[0]
print(model.summary())
tuner.results_summary()
best_hp = tuner.get_best_hyperparameters()[0]
best_hp.values

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                2000      
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 50)                2550      
                                                                 
 dense_4 (Dense)             (None, 1)                 51        
                                                                 
Total params: 9701 (37.89 KB)
Trainable params: 9701 (37.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Results summary
R

In [9]:
y_pred = model.predict(np.array(X_test).astype('float32'))
y_test_pred = y_pred.reshape(75740,)
MSE = sum((y_test_pred - y_test)**2)/y_test.size
print(MSE)

0.030189490467440894


In [10]:
FVU = MSE/np.var(y_test)
print(FVU)

0.3494072371189498


In [19]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=4,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

In [27]:
best_model = HPmodel(best_hp)
history = best_model.fit(np.array(X_train).astype('float32'), y_train, validation_split = .15, batch_size = 1000, epochs = 200, callbacks = [early_stopping], verbose = 0)

In [28]:
y_pred = best_model.predict(np.array(X_test).astype('float32'))
y_test_pred = y_pred.reshape(75740,)
MSE = sum((y_test_pred - y_test)**2)/y_test.size
print(MSE)

0.030023938220445422


In [29]:
FVU = MSE/np.var(y_test)
print(FVU)

0.34749116790658907
