## Import of librairies

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import *
from keras.layers import Embedding, Dense, Dropout, Bidirectional


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


## Loading of data

link : https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/download

In [0]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Distribution of positive and negative sentiments

In [4]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Preprocessing

Replacement of values of target column by numrical values.

In [0]:
df.sentiment.replace(['positive', 'negative'], [1, 0], inplace=True)

Building of a list for reviews and another for sentiments.

In [0]:
reviews = [review for review in df['review']]
sentiments = [sentiment for sentiment in df['sentiment'] ]

Function to remove useless information

In [0]:
def text_preprocessing (text, min_word_length) :
  """

  text : must be a string
  min_word_length : must be a integer 

  function return a string

  """
  # put in lowercase all words
  text = text.lower()
  # remove html tags
  text = text.replace('{html}',"")
  # remove URL
  text = re.sub(r'http\S+', '', text)
  # remove numbers
  text = re.sub('[0-9]+', '', text)
  # tokenize the text and convert string to list
  tokenizer = RegexpTokenizer(r'\w+')
  text = tokenizer.tokenize(text)
  # removing of stopwords
  text = [word for word in text if len(word) > min_word_length if word not in stopwords.words('english')]
  # Lemmatization
  lemmatizer = WordNetLemmatizer()
  text=[lemmatizer.lemmatize(word) for word in text]
  # convert list to string
  text = " ".join(text)
  return text

Applying of function

In [0]:
reviews = [text_preprocessing (review, 1) for review in reviews]

Tokenization

In [0]:
tokenizer = Tokenizer(
    num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True,
    split=' ', char_level=False, oov_token=None, document_count=0)

# The filters remove useless character for a text analysis
#lower = True puts each word in lowercase

Get the vocabulary size

In [0]:
#Applying of tokenizer function on all texts

tokenizer.fit_on_texts(reviews)

# Get dictionnary with a word as key and the index as value
word_index = tokenizer.word_index

In [11]:
print ('Vocabulary size: ' + str(len(word_index)))

Vocabulary size: 90198


Convert to sequence

In [0]:
sequences = tokenizer.texts_to_sequences(reviews)

In [0]:
padded_sequences = pad_sequences(sequences, padding='post', maxlen= 150)

In [14]:
# Get the maximum length of all sequences

maxlen = 0

for sequence in sequences :
  if len(sequence)> maxlen :
    maxlen = len(sequence)

print('maximum length: ' + str(maxlen))

maximum length: 1427


## Bi-LSTM

In [30]:
embedding_dim = 100
vocab_size = len(word_index)+1

model=Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=150))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 150, 100)          9019900   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               84480     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 9,104,509
Trainable params: 9,104,509
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
epochs = 3

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5,
    decay_steps=100000,
    decay_rate=0.9, name='Adam')


model.compile(optimizer='Adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    padded_sequences, sentiments,
    epochs=epochs, validation_split = 0.2, batch_size = 200)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 40000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
