In [13]:
pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.15.0-cp39-cp39-macosx_10_15_x86_64.whl (239.1 MB)
[K     |████████████████████████████████| 239.1 MB 10 kB/s  eta 0:00:013▌                             | 19.1 MB 5.4 MB/s eta 0:00:42     |█████▋                          | 41.5 MB 7.8 MB/s eta 0:00:26     |█████████▏                      | 68.4 MB 56.6 MB/s eta 0:00:04     |█████████████                   | 96.4 MB 1.6 MB/s eta 0:01:27     |█████████████▌                  | 101.1 MB 10.1 MB/s eta 0:00:14     |███████████████                 | 112.4 MB 13.0 MB/s eta 0:00:10     |████████████████                | 119.4 MB 9.1 MB/s eta 0:00:14     |████████████████▋               | 124.1 MB 8.9 MB/s eta 0:00:13eta 0:00:21     |█████████████████████████▎      | 188.6 MB 10.1 MB/s eta 0:00:05     |█████████████████████████▌      | 190.2 MB 3.2 MB/s eta 0:00:16 MB 738 kB/s eta 0:00:38
Collecting opt-einsum>=2.3.2
  U

In [14]:
# Imports here
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from scikeras.wrappers import KerasClassifier, KerasRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

2024-02-19 13:34:24.624325: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [21]:
# Dataset here
file = open("../dataset/processed_reviews.json", 'r', encoding='utf8')
dataset_dict = json.load(file)
df = pd.DataFrame(dataset_dict)
df.drop(columns=['firm','job_title', 'current', 'headline', 'pros', 'cons', 'duration'], inplace=True)

In [22]:
# Min-max normalization
scaler = MinMaxScaler()
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

#One hot encode
df['date'] = pd.to_datetime(df['date_review'])
df['month'] = df['date'].dt.month.apply(lambda x: str(x))
df['year'] = df['date'].dt.year
one_hot_encoded = pd.get_dummies(df[['recommend', 'ceo_approv', 'outlook', 'month']])
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop(columns=['date', 'date_review', 'recommend', 'ceo_approv', 'outlook', 'month'], inplace=True)


df.head()

Unnamed: 0,overall_rating,work_life_balance,culture_values,career_opp,comp_benefits,senior_mgmt,year,recommend_o,recommend_v,recommend_x,...,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9
0,0.25,0.5,0.0,0.25,0.0,0.75,2015,False,False,True,...,False,True,False,False,False,False,False,False,False,False
1,0.0,0.0,0.0,0.0,0.0,0.0,2016,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,0.0,0.25,0.0,0.25,0.0,0.0,2016,False,False,True,...,False,False,False,False,True,False,False,False,False,False
3,0.5,0.75,0.25,0.25,0.5,0.25,2016,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,0.0,0.0,0.0,0.0,0.0,0.0,2016,False,False,True,...,False,False,False,False,False,False,False,False,False,True


In [8]:
#Spliting the data
X = df.drop(columns=['overall_rating'])
y = df['overall_rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [18]:
def buildSigmoidPerceptron():
    model = Sequential([
        Dense(1, activation = 'sigmoid', input_dim = X.shape[1]),
    ])
    optimizer = SGD(learning_rate=0.3)
    model.compile(optimizer=optimizer, loss='mse', metrics=['MSE'])
    return(model)

estimator = KerasRegressor(model=buildSigmoidPerceptron, epochs=10, batch_size=1000, verbose=0)
estimator.fit(X_train, y_train)
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
pd.DataFrame(y_test_pred)

Unnamed: 0,0
0,0.205751
1,0.504229
2,0.711091
3,0.937396
4,0.253917
...,...
75735,0.675459
75736,0.909880
75737,0.908631
75738,0.928982


In [19]:
# Surprisingly low MSE with just perceptron! Did I overlook something?
MSE = sum((y_test_pred - y_test)**2)/y_test.size
print(MSE)

0.03130137527791183


In [20]:
# Fraction of Variance Unexplained
FVU = MSE/np.var(y_test)
print(FVU)

0.3622759736761359


In [22]:
# Training vs Test Error
train_error = sum((y_train_pred - y_train)**2)/y_train.size
test_error = sum((y_test_pred - y_test)**2)/y_test.size

print('Train Error:', train_error)
print('Test Error:', test_error)

train_fvu = train_error/np.var(y_train)
test_fvu = test_error/np.var(y_test)

print('Train FVU:', train_fvu)
print('Test FVU:', test_fvu)

Train Error: 0.031080815487849862
Test Error: 0.03130137527791183
Train FVU: 0.3581208399950576
Test FVU: 0.3622759736761359
