In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Read the pre-split training and test CSV files, then summarise the
# training frame (column dtypes and non-null counts).
train, test = pd.read_csv("csv/train.csv"), pd.read_csv("csv/test.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4651 entries, 0 to 4650
Data columns (total 24 columns):
title                             4651 non-null object
description                       4651 non-null object
plot                              4651 non-null object
csm_review                        4651 non-null object
need_to_know                      4651 non-null object
par_rating                        1988 non-null float64
kids_rating                       2413 non-null float64
csm_rating                        4651 non-null int64
Author                            4376 non-null object
Genre                             4651 non-null object
Topics                            3112 non-null object
Book type                         4651 non-null object
Publisher                         4545 non-null object
Publication date                  4651 non-null object
Publisher's recommended age(s)    3725 non-null object
Number of pages                   4615 non-null float64
Available o

## Create the splits

In [8]:
# Model input is the description column; the target is the csm_rating column.
x_tr = train["description"].values
y_tr = train["csm_rating"].values
x_test = test["description"].values
y_test = test["csm_rating"].values

In [9]:
# Sanity check: training inputs and labels should have the same length.
print(*(arr.shape for arr in (x_tr, y_tr)))

(4651,) (4651,)


In [10]:
# Sanity check: test inputs and labels should have the same length.
print(*(arr.shape for arr in (x_test, y_test)))

(1164,) (1164,)


## Create embeddings

In [11]:
# Fit the vocabulary on the training descriptions only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_tr))

# Encode both splits as lists of integer word ids using that vocabulary.
x_tr_seq, x_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (x_tr, x_test)
)

In [32]:
# word_index is the tokenizer's token -> integer id mapping built during fitting.
word_index = tokenizer.word_index
print("Found %s unique tokens." % (len(word_index),))

Found 5814 unique tokens.


## Padding

Use pre-padding up to the maximum sequence length, as described [here](https://arxiv.org/pdf/1903.07288.pdf).

In [23]:
# Longest and shortest tokenised training description, measured in tokens.
print(max(map(len, x_tr_seq)))
print(min(map(len, x_tr_seq)))

14
3


In [25]:
# Pad every sequence to the length of the longest training sequence so all
# inputs share one length. The length is derived from the data rather than
# hard-coded, and pre-padding (the pad_sequences default) is spelled out to
# match the approach stated for this section. Test sequences longer than
# max_len are truncated by pad_sequences.
max_len = max(len(seq) for seq in x_tr_seq)
x_tr_seq = pad_sequences(x_tr_seq, maxlen=max_len, padding="pre")
x_test_seq = pad_sequences(x_test_seq, maxlen=max_len, padding="pre")

In [26]:
# After padding, the longest and shortest rows should report the same length.
print(max(map(len, x_tr_seq)))
print(min(map(len, x_tr_seq)))

14
14


## Number of Unique words

In [27]:
# Number of rows needed in the embedding table: every known token plus one
# extra slot for the padding id.
size_of_vocabulary = 1 + len(tokenizer.word_index)
print(size_of_vocabulary)

5815


## Create the Model

In [28]:
#deep learning library
from keras.models import *
from keras.layers import *
from keras.callbacks import *

In [29]:
model = Sequential()

# Embedding layer: learns a 300-dimensional vector per token id.
# input_length must equal the padded sequence length, so it is read from the
# padded training array; the previous hard-coded 100 did not match the
# 14-token padding applied to x_tr_seq / x_test_seq.
model.add(
    Embedding(
        size_of_vocabulary,
        300,
        input_length=x_tr_seq.shape[1],
        trainable=True,
    )
)

# LSTM layer: return_sequences=True keeps one output per timestep so the
# pooling layer below has a time axis to reduce over.
model.add(LSTM(128, return_sequences=True, dropout=0.2))

# Global max pooling collapses the time axis to a single feature vector.
model.add(GlobalMaxPooling1D())

# Dense head.
# NOTE(review): a single sigmoid unit with binary_crossentropy assumes
# csm_rating is a binary 0/1 label -- TODO confirm; a multi-valued rating
# would need a different output layer and loss.
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Loss function, metrics and optimizer.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

# Callbacks: stop once validation loss has not improved for 3 epochs, and
# keep only the weights with the best validation accuracy on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)

