In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
train = pd.read_csv('csv/train.csv')
valid = pd.read_csv("csv/test.csv")

In [3]:
train.loc[1, "description"]

'Fractured fairy tale has plenty of twists for fantasy fans.'

## Create the splits

In [4]:
x_tr, y_tr = train['description'].values, train['csm_rating'].values
x_val, y_val = valid["description"].values, valid["csm_rating"].values

In [5]:
print(x_tr.shape, y_tr.shape)

(4651,) (4651,)


In [6]:
print(x_val.shape, y_val.shape)

(1164,) (1164,)


## Prepare the Data

In [7]:
#Tokenize the sentences
tokenizer = Tokenizer()

#preparing vocabulary
tokenizer.fit_on_texts(list(x_tr))

#converting text into integer sequences
x_tr_seq  = tokenizer.texts_to_sequences(x_tr) 
x_val_seq = tokenizer.texts_to_sequences(x_val)

print(len(max(x_tr_seq, key=len)))
print(len(min(x_tr_seq, key=len)))

14
3


In [8]:
#padding to prepare sequences of same length
x_tr_seq  = pad_sequences(x_tr_seq, maxlen=14)
x_val_seq = pad_sequences(x_val_seq, maxlen=14)
print(len(max(x_tr_seq, key=len)))
print(len(min(x_tr_seq, key=len)))

14
14


In [9]:
x_tr_seq[1]

array([   0,    0,    0,    0, 1028,   99,    3,   25,  618,    1,  334,
          8,   12,   72], dtype=int32)

In [10]:
size_of_vocabulary=len(tokenizer.word_index) + 1 #+1 for padding
print(size_of_vocabulary)

5815


## Create embeddings

In [11]:
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))

Found 5814 unique tokens.


## Create the Model

In [12]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Activation, Dense
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping

model=Sequential()

#embedding layer
model.add(Embedding(size_of_vocabulary,300,input_length=14,trainable=True)) 

#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(64,activation='relu')) 
model.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer

model.compile(optimizer='RMSprop', loss='mse',metrics=["mae", "mse"]) 

#Adding callbacks
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=10)  
mc=ModelCheckpoint('best_model.h5', monitor='val_loss', mode='max', save_best_only=True,verbose=1)  

#Print summary of model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 300)           1744500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 14, 128)           219648    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,972,469
Trainable params: 1,972,469
Non-trainable params: 0
_________________________________________________________________
None


## Fit the Model

In [13]:
history = model.fit(np.array(x_tr_seq),
                    np.array(y_tr),
                    batch_size=64,
                    epochs=20,
                    validation_data=(np.array(x_val_seq),np.array(y_val)),
                    verbose=1,
                    callbacks=[mc])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4651 samples, validate on 1164 samples
Epoch 1/20

Epoch 00001: val_loss improved from -inf to 81.62736, saving model to best_model.h5
Epoch 2/20

Epoch 00002: val_loss did not improve from 81.62736
Epoch 3/20

Epoch 00003: val_loss did not improve from 81.62736
Epoch 4/20

Epoch 00004: val_loss did not improve from 81.62736
Epoch 5/20

Epoch 00005: val_loss did not improve from 81.62736
Epoch 6/20

Epoch 00006: val_loss did not improve from 81.62736
Epoch 7/20

Epoch 00007: val_loss did not improve from 81.62736
Epoch 8/20

Epoch 00008: val_loss did not improve from 81.62736
Epoch 9/20

Epoch 00009: val_loss did not improve from 81.62736
Epoch 10/20

Epoch 00010: val_loss did not improve from 81.62736
Epoch 11/20

Epoch 00011: val_loss did not improve from 81.62736
Epoch 12/20

Epoch 00012: val_loss did not improve from 81.62736
Epoch 13/20

Epoch 00013: val_loss did not improve from 81.62736
Epoch 14/20

Epoch 00014: val_loss did not improve from 81.62736
Epoch 15/20

Epoch 

In [14]:
#loading best model
from keras.models import load_model
model = load_model('best_model.h5')

#evaluation 
loss, mae, mse = model.evaluate(x_val_seq, y_val, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} ".format(mae))

test_predictions = model.predict(y_val).flatten()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Testing set Mean Abs Error:  8.16 


ValueError: Error when checking input: expected embedding_1_input to have shape (14,) but got array with shape (1,)

## [Use Transfer Learning](https://www.analyticsvidhya.com/blog/2020/03/pretrained-word-embeddings-nlp/)