## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import ast
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from scipy.stats import skew, shapiro

from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_error

import pickle

In [2]:
from keras.preprocessing.text import Tokenizer # clean and tokenize data
from keras.preprocessing.sequence import pad_sequences #
from keras.initializers import he_normal
import tensorflow as tf
import keras.backend as K # facilitate the computation of performance metrics

In [3]:
# display purposes
%config InlineBackend.figure_format = 'retina'
# Parameter mappings to override the values in the preset seaborn style dictionaries
color = 'black'
sns.set_theme(rc={'grid.color': 'white', 
                  'xtick.color': color,
                  'ytick.color': color,
                  'axes.labelcolor': color,
                  'text.color': color,
                  'axes.facecolor':(0,0,0,0),
                  'figure.facecolor':(0,0,0,0)})

## Load Datasets

In [38]:
train_df = pd.read_csv('../datasets/train_df.csv')

#### Check each dataset

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31327 entries, 0 to 31326
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   post_comment         31327 non-null  object 
 1   id                   31327 non-null  object 
 2   sentence             31327 non-null  object 
 3   entity               31327 non-null  object 
 4   pos_sentiment_words  31327 non-null  object 
 5   neg_sentiment_words  31327 non-null  object 
 6   textblob             31327 non-null  float64
 7   sentiment            31327 non-null  float64
dtypes: float64(2), object(6)
memory usage: 1.9+ MB


In [40]:
train_df.rename(index={7:'sentiment'})

Unnamed: 0,post_comment,id,sentence,entity,pos_sentiment_words,neg_sentiment_words,textblob,sentiment
0,post,0_post,how to spend the perfect 48 hours in singapore,"['48 hour', 'singapore']",['perfect'],[],1.000000,10.000000
1,post,0_post,first head to your waterfall at singapore airp...,"['first', 'singapore airport']","['absolutely', 'incredible']",[],0.900000,9.642857
2,post,0_post,then head to the center and drop by the future...,"['future world exhibition', 'art science museum']",[],[],0.000000,6.428571
3,post,0_post,next you have to head to gardens by the bay it...,['garden by the bay'],"['completely', 'free', 'amazing']",[],0.333333,7.619048
4,post,0_post,dont forget to buy tickets for the skyway for ...,"['skyway', 'floral fantasy']","['most', 'magical']",['insane'],0.000000,6.428571
...,...,...,...,...,...,...,...,...
31322,post,435_cmt,locals,[],[],[],0.000000,6.428571
31323,post,435_cmt,you complained singaporeans are rude bec of yo...,['1'],[],"['complained', 'rude']",-0.300000,5.357143
31324,post,435_cmt,i felt sorry for your friend if he is a singap...,['singaporean'],[],['sorry'],-0.500000,4.642857
31325,post,435_cmt,next time do a bit of googling before going to...,[],['new'],[],0.068182,6.672078


In [41]:
train_df.to_csv('../datasets/train_df.csv',index=False)

In [6]:
train_df['sentiment'] = (train_df['sentiment'])**(1 / 1.25)

In [7]:
# scale using minmax
# convert to array
y = train_df['sentiment'].array

mm = MinMaxScaler()
mm.fit(y.reshape(-1,1))
y = mm.transform(y.reshape(-1,1))

# convert back to dataframe
train_df['sentiment'] = pd.DataFrame(y)

## Train-Test-Split

#### Using train_test_split, the data will be split for training and testing 

In [8]:
X = train_df['sentence']
y = train_df['sentiment']

In [9]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.78,random_state=42)

print(f'The X train set has {X_train.shape[0]} rows.')
print(f'The y train set has {y_train.shape[0]} rows.')
print(f'The X test set has {X_test.shape[0]} rows.')
print(f'The y test set has {y_test.shape[0]} rows.')

The X train set has 24435 rows.
The y train set has 24435 rows.
The X test set has 6892 rows.
The y test set has 6892 rows.


## Modeling

### Keras

#### Kickstart the modeling process by fixing the random seed

In [10]:
tf.keras.utils.set_random_seed(1) 

#### Instantiate variables that will be used in the model. Some of these variables are tuned to optimize for R^2 and MSE scores.

In [11]:
vocab_size = 9800
oov_token = "<OOV>"
padding_type = 'post'
trunc_type = 'post'
embedding_dim = 16 # 16
max_length = 200 # 100, 110

#### Define custom function to calculate R^2

In [12]:
# define custom functions to calculate R2
def R2(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) # sum of squares of residuals
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) # total sum of squares
    # K.epsilon: Epsilon is small value that makes very little difference to the value of 
    # the denominator, but ensures that it isn't equal to exactly zero.
    r2_score = (1 - (SS_res / (SS_tot + K.epsilon())))
    return r2_score

#### Initialize and fit the tokenizer

In [13]:
# initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_token)

# create tokens for every word in the corpus
tokenizer.fit_on_texts(X_train)

# save them in word_index
word_index = tokenizer.word_index

#### Turn sentences in both X_train and X_test into sequences of tokens and pad them

In [14]:
# turn sentences into sequences of tokens
X_train_seq = tokenizer.texts_to_sequences(X_train)
# pad sentences so they are all the same length
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=max_length,
                                   padding=padding_type,truncating=trunc_type)

# do the same for X_test
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=max_length,
                                   padding=padding_type,truncating=trunc_type)

#### Build Model

In [15]:
def get_model():
    # Create a simple model.
    inputs = keras.Input(shape=(32,))
    outputs = keras.layers.Dense(1)(inputs)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(), loss="mean_squared_error")
    return model

In [16]:
model = tf.keras.Sequential([
    # top layer is an embedding, where direction of each word will be learned epoch by epoch
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # we pool with a global average pooling, ie, adding up the vectors to get meaning
    tf.keras.layers.GlobalAveragePooling1D(),
    # this is then fed into deep neural network
    tf.keras.layers.Dense(96, 
                          activation='relu',
                          kernel_initializer = tf.keras.initializers.RandomNormal(mean=0., 
                                                                                  stddev=2.)),
    tf.keras.layers.Dense(64, 
                          activation='relu'),
    tf.keras.layers.Dense(32, 
                          activation='relu'),
    tf.keras.layers.Dense(1)
])

#### Get model summary

In [17]:
# get model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 16)           156800    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 96)                1632      
                                                                 
 dense_1 (Dense)             (None, 64)                6208      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                        

#### Compile the model

In [18]:
# compile model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0009, 
                                     weight_decay=0.004,
                                     clipvalue=2.)

model.compile(loss='mean_squared_error',
              optimizer=optimizer,
              metrics=['mse',
                       tf.keras.metrics.RootMeanSquaredError(name='RMSE'),
                      R2]) # r2 defined above



#### Fit the model

In [19]:
# fit RNN model
num_epochs = 92 # 550
batch_size = 32

history = model.fit(X_train_seq_padded,y_train,epochs=num_epochs,batch_size=batch_size,
                   validation_data=(X_test_seq_padded, y_test),
                   verbose=2)

Epoch 1/92
764/764 - 2s - loss: 0.0125 - mse: 0.0125 - RMSE: 0.1117 - R2: 0.0276 - val_loss: 0.0084 - val_mse: 0.0084 - val_RMSE: 0.0916 - val_R2: 0.3870 - 2s/epoch - 2ms/step
Epoch 2/92
764/764 - 1s - loss: 0.0045 - mse: 0.0045 - RMSE: 0.0670 - R2: 0.6359 - val_loss: 0.0038 - val_mse: 0.0038 - val_RMSE: 0.0614 - val_R2: 0.7153 - 1s/epoch - 1ms/step
Epoch 3/92
764/764 - 1s - loss: 0.0027 - mse: 0.0027 - RMSE: 0.0516 - R2: 0.7679 - val_loss: 0.0030 - val_mse: 0.0030 - val_RMSE: 0.0546 - val_R2: 0.7699 - 1s/epoch - 1ms/step
Epoch 4/92
764/764 - 1s - loss: 0.0020 - mse: 0.0020 - RMSE: 0.0444 - R2: 0.8321 - val_loss: 0.0029 - val_mse: 0.0029 - val_RMSE: 0.0536 - val_R2: 0.7739 - 1s/epoch - 1ms/step
Epoch 5/92
764/764 - 1s - loss: 0.0016 - mse: 0.0016 - RMSE: 0.0395 - R2: 0.8621 - val_loss: 0.0025 - val_mse: 0.0025 - val_RMSE: 0.0500 - val_R2: 0.8022 - 1s/epoch - 1ms/step
Epoch 6/92
764/764 - 1s - loss: 0.0013 - mse: 0.0013 - RMSE: 0.0361 - R2: 0.8839 - val_loss: 0.0024 - val_mse: 0.0024 - 

Epoch 46/92
764/764 - 1s - loss: 1.7127e-04 - mse: 1.7127e-04 - RMSE: 0.0131 - R2: -8.6666e-01 - val_loss: 0.0012 - val_mse: 0.0012 - val_RMSE: 0.0348 - val_R2: 0.9034 - 1s/epoch - 2ms/step
Epoch 47/92
764/764 - 1s - loss: 2.0505e-04 - mse: 2.0505e-04 - RMSE: 0.0143 - R2: 0.9811 - val_loss: 0.0013 - val_mse: 0.0013 - val_RMSE: 0.0366 - val_R2: 0.8942 - 1s/epoch - 2ms/step
Epoch 48/92
764/764 - 1s - loss: 2.0225e-04 - mse: 2.0225e-04 - RMSE: 0.0142 - R2: 0.9829 - val_loss: 0.0020 - val_mse: 0.0020 - val_RMSE: 0.0447 - val_R2: 0.8486 - 1s/epoch - 2ms/step
Epoch 49/92
764/764 - 1s - loss: 2.0790e-04 - mse: 2.0790e-04 - RMSE: 0.0144 - R2: 0.9818 - val_loss: 0.0012 - val_mse: 0.0012 - val_RMSE: 0.0350 - val_R2: 0.9054 - 1s/epoch - 2ms/step
Epoch 50/92
764/764 - 2s - loss: 1.7635e-04 - mse: 1.7635e-04 - RMSE: 0.0133 - R2: 0.9832 - val_loss: 0.0012 - val_mse: 0.0012 - val_RMSE: 0.0347 - val_R2: 0.9018 - 2s/epoch - 2ms/step
Epoch 51/92
764/764 - 1s - loss: 1.7717e-04 - mse: 1.7717e-04 - RMSE: 

Epoch 91/92
764/764 - 1s - loss: 1.2306e-04 - mse: 1.2306e-04 - RMSE: 0.0111 - R2: 0.9871 - val_loss: 0.0012 - val_mse: 0.0012 - val_RMSE: 0.0347 - val_R2: 0.9002 - 1s/epoch - 2ms/step
Epoch 92/92
764/764 - 1s - loss: 1.1840e-04 - mse: 1.1840e-04 - RMSE: 0.0109 - R2: 0.9897 - val_loss: 0.0012 - val_mse: 0.0012 - val_RMSE: 0.0339 - val_R2: 0.9110 - 1s/epoch - 2ms/step


#### Save the training history into a dataframe

In [20]:
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist = hist.rename(columns={'mse': 'MSE','val_mse': 'val_MSE'})
hist.tail()

Unnamed: 0,loss,MSE,RMSE,R2,val_loss,val_MSE,val_RMSE,val_R2,epoch
87,0.000159,0.000159,0.012601,0.987198,0.00119,0.00119,0.0345,0.902072,87
88,0.000109,0.000109,0.010459,0.989663,0.001186,0.001186,0.034441,0.905564,88
89,0.000104,0.000104,0.010217,0.990842,0.001173,0.001173,0.034251,0.90626,89
90,0.000123,0.000123,0.011093,0.987078,0.001202,0.001202,0.034665,0.900228,90
91,0.000118,0.000118,0.010881,0.989671,0.001152,0.001152,0.033944,0.91097,91


#### Save the final trained model

In [21]:
# saving the trained mode
# pickle.dump(model, open('keras_model.pkl', 'wb'))

https://www.tensorflow.org/guide/keras/serialization_and_saving

In [36]:
model.save("keras_model_k.h5")

In [37]:
# It can be used to reconstruct the model identically.
reconstructed_model = tf.keras.models.load_model("keras_model_k.h5", custom_objects={"R2": R2 })

In [29]:
# Let's check:
np.testing.assert_allclose(
    model.predict(X_test_seq_padded), reconstructed_model.predict(X_test_seq_padded)
)



In [32]:
pred = model.predict(X_test_seq_padded)



In [33]:
pred

array([[0.53233874],
       [0.53300905],
       [0.53140783],
       ...,
       [0.5321098 ],
       [0.6403018 ],
       [0.53173107]], dtype=float32)

In [30]:
recon_pred = reconstructed_model.predict(X_test_seq_padded)



In [31]:
recon_pred

array([[0.53233874],
       [0.53300905],
       [0.53140783],
       ...,
       [0.5321098 ],
       [0.6403018 ],
       [0.53173107]], dtype=float32)

https://www.tensorflow.org/guide/saved_model

In [None]:
import os
import tempfile

In [None]:
tmpdir = tempfile.mkdtemp()

In [None]:
tf_Saved_Model_save_path = os.path.join(tmpdir, "tf_Saved_Model")
tf.saved_model.save(model, tf_Saved_Model_save_path)

In [None]:
loaded = tf.saved_model.load(tf_Saved_Model_save_path)

In [None]:
print(list(loaded.signatures.keys()))

In [None]:
infer = loaded.signatures["serving_default"]
print(infer.structured_outputs)

In [45]:
pred_load = loaded.predict(X_test_seq_padded)

NameError: name 'loaded' is not defined

#### Reverse both the transformation and scaling

In [42]:
y

0        1.000000
1        0.954695
2        0.529570
3        0.691104
4        0.529570
           ...   
31322    0.529570
31323    0.378988
31324    0.275259
31325    0.563067
31326    0.722783
Name: sentiment, Length: 31327, dtype: float64

In [44]:
reverse_y = ss.inverse_transform(y)

NameError: name 'ss' is not defined

In [43]:
train_df['sentiment']

0        10.000000
1         9.642857
2         6.428571
3         7.619048
4         6.428571
           ...    
31322     6.428571
31323     5.357143
31324     4.642857
31325     6.672078
31326     7.857143
Name: sentiment, Length: 31327, dtype: float64