In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from math import sqrt

Using TensorFlow backend.


In [2]:
train = pd.read_csv('csv/train.csv')
valid = pd.read_csv("csv/test.csv")

In [3]:
train.loc[1, "description"]

'Fractured fairy tale has plenty of twists for fantasy fans.'

## Create the splits

In [4]:
x_tr, y_tr = train['description'].values, train['csm_rating'].values
x_val, y_val = valid["description"].values, valid["csm_rating"].values

In [5]:
print(x_tr.shape, y_tr.shape)

(4651,) (4651,)


In [6]:
print(x_val.shape, y_val.shape)

(1164,) (1164,)


## Prepare the Data

In [7]:
#Tokenize the sentences
tokenizer = Tokenizer()

#preparing vocabulary
tokenizer.fit_on_texts(list(x_tr))

#converting text into integer sequences
x_tr_seq  = tokenizer.texts_to_sequences(x_tr) 
x_val_seq = tokenizer.texts_to_sequences(x_val)

print(len(max(x_tr_seq, key=len)))
print(len(min(x_tr_seq, key=len)))

14
3


In [8]:
#padding to prepare sequences of same length
x_tr_seq  = pad_sequences(x_tr_seq, maxlen=14)
x_val_seq = pad_sequences(x_val_seq, maxlen=14)
print(len(max(x_tr_seq, key=len)))
print(len(min(x_tr_seq, key=len)))

14
14


In [9]:
x_tr_seq[1]

array([   0,    0,    0,    0, 1028,   99,    3,   25,  618,    1,  334,
          8,   12,   72], dtype=int32)

In [10]:
size_of_vocabulary=len(tokenizer.word_index) + 1 #+1 for padding
print(size_of_vocabulary)

5815


## Create embeddings

In [11]:
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

Found 5814 unique tokens.


## Create the Model

In [12]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Activation, Dense
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping

model=Sequential()

#embedding layer
model.add(Embedding(size_of_vocabulary,300,input_length=14,trainable=True)) 

#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(64,activation='relu')) 
model.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer

model.compile(optimizer='adam', loss='mean_squared_error') 

#Adding callbacks
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=10)  
#mc=ModelCheckpoint('best_model.h5', monitor='val_loss', mode='max', save_best_only=True,verbose=1)  

#Print summary of model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 300)           1744500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 14, 128)           219648    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,972,469
Trainable params: 1,972,469
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [13]:
history = model.fit(np.array(x_tr_seq),
                    np.array(y_tr),
                    batch_size=128,
                    epochs=10,
                    validation_data=(np.array(x_val_seq),np.array(y_val)),
                    verbose=1)
                    #callbacks=[mc])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4651 samples, validate on 1164 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
#evaluation 
mse = model.evaluate(x_val_seq, y_val)

print("The mse is %.3f." % mse)
print("The rmse is %.3f." % sqrt(mse))

The mse is 81.627.
The rmse is 9.035.


## [Use Transfer Learning](https://www.analyticsvidhya.com/blog/2020/03/pretrained-word-embeddings-nlp/)

In [15]:
# load the whole embedding into memory
embeddings_index = dict()

with open("glove/glove.6B.300d.txt") as f:

    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [16]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((size_of_vocabulary, 300))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
model=Sequential()

#embedding layer
#model.add(Embedding(size_of_vocabulary,300,input_length=14,trainable=True)) 
model.add(Embedding(size_of_vocabulary,300,weights=[embedding_matrix],input_length=14,trainable=False)) 


#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(64,activation='relu')) 
model.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer
model.compile(optimizer='adam', loss='mean_squared_error') 

#Print summary of model
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 14, 300)           1744500   
_________________________________________________________________
lstm_2 (LSTM)                (None, 14, 128)           219648    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 1,972,469
Trainable params: 227,969
Non-trainable params: 1,744,500
_________________________________________________________________
None


In [18]:
history = model.fit(np.array(x_tr_seq),
                    np.array(y_tr),
                    batch_size=128,
                    epochs=10,
                    validation_data=(np.array(x_val_seq),np.array(y_val)),
                    verbose=1)

Train on 4651 samples, validate on 1164 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
#evaluation 
mse = model.evaluate(x_val_seq, y_val)



In [20]:
print("The mse is  %.3f." % mse)
print("The rmse is %.3f." % sqrt(mse))

The mse is  81.627.
The rmse is 9.035.
