In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv
/kaggle/input/glove6b100dtxt/glove.6B.100d.txt


In [2]:
df_total=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train_set,test_set=train_test_split(df_total[['review','sentiment']].values,stratify=df_total['sentiment'].values)

In [5]:
print(np.unique(train_set[:,-1],return_counts=True))
print(np.unique(test_set[:,-1],return_counts=True))

(array(['negative', 'positive'], dtype=object), array([18750, 18750]))
(array(['negative', 'positive'], dtype=object), array([6250, 6250]))


In [6]:
import re
import string
from nltk.corpus import stopwords

In [7]:
def clean_doc(doc):
    label=(doc[:,-1]=='positive')*1
    stop_words=set(stopwords.words('english'))
    review_list=[]
    for review in doc[:,0]:
        tokens=review.split()
        re_puch=re.compile('[%s]'%re.escape(string.punctuation))
        tokens=[re_puch.sub('',w) for w in tokens ]
        tokens=[word for word in tokens if word.isalpha()]
        tokens=[w for w in tokens if not w in stop_words]
        tokens=[w.lower() for w in tokens]
        tokens=' '.join(tokens)
        review_list.append(tokens)
    return review_list,label
    

In [8]:
train_doc,train_label=clean_doc(train_set)
test_doc,test_label=clean_doc(test_set)

In [9]:
train_doc[1]

'this one worst movies ive ever seen life they said mans answer waiting exhaleall im going say really didnt respond i couldnt believe actually made the director choose another profession cant make movie the script wasnt good it made sense messy bet movies much better i horribly disappointed see talented actor terrence bad excuse movie if i could turn back hands time i wouldnt go back media play never buy movie i would keep wrapped sitting shelf instead wasting time watching'

In [10]:
y_train=np.reshape(train_label,(train_label.shape[0],1))


In [11]:
y_train.shape

(37500, 1)

In [12]:
y_test=np.reshape(test_label,(test_label.shape[0],1))

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [14]:
def create_tokenizer(lines):
    t=Tokenizer()
    t.fit_on_texts(lines)
    return t

In [15]:
tokenizer=create_tokenizer(train_doc)

In [16]:
vocab_size=len(tokenizer.word_index)+1

In [17]:
np.median([len(s.split()) for s in train_doc])

98.0

In [18]:
def encode_doc(tokenizer,doc,length):
    encoded=tokenizer.texts_to_sequences(doc)
    padded=pad_sequences(encoded,maxlen=length,padding='post')
    return padded

In [19]:
length=150

In [20]:
X_train=encode_doc(tokenizer,train_doc,length)
X_test=encode_doc(tokenizer,test_doc,length)

In [21]:
X_train

array([[    25,     19,    286, ...,      0,      0,      0],
       [     8,      6,    159, ...,      0,      0,      0],
       [    85,     47,    895, ...,    679,      2,      2],
       ...,
       [ 16283,     61,     99, ..., 149077,    826,    487],
       [   436,    128,    152, ...,   4225,    296,   1440],
       [    63,     37,     26, ...,      0,      0,      0]], dtype=int32)

In [22]:
embedding_index=dict()
f=open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt')
for line in f:
    values=line.split()
    word=values[0]
    enbd=np.asarray(values[1:],dtype='float32')
    embedding_index[word]=enbd
    
f.close()

In [23]:
embedding_matrix=np.zeros((vocab_size,100))
for word ,i in tokenizer.word_index.items():
    embedding_vector=embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector

In [24]:
from keras.models import Model
from keras.layers import Input, Dense,Flatten,Dropout,Embedding
from keras.layers.convolutional import Conv1D,MaxPooling1D
from keras.layers.merge import concatenate
from keras.utils.vis_utils import plot_model

In [None]:
def define_model(length,vocab_size):
    input1=Input(shape=(length,))
    embedding1=Embedding(vocab_size,100,weights=[embedding_matrix],trainable=False)(input1)
    conv1=Conv1D(filters=32,kernel_size=1,activation='relu')(embedding1)
    drop1=Dropout(0.5)(conv1)
    pool1=MaxPooling1D(pool_size=2)(drop1)
    flat1=Flatten()(pool1)
    
    input2=Input(shape=(length,))
    embedding2=Embedding(vocab_size,100,weights=[embedding_matrix],trainable=False)(input1)
    conv2=Conv1D(filters=32,kernel_size=4,activation='relu')(embedding2)
    drop2=Dropout(0.5)(conv2)
    pool2=MaxPooling1D(pool_size=2)(drop2)
    flat2=Flatten()(pool2)
    
    input3=Input(shape=(length,))
    embedding3=Embedding(vocab_size,100,weights=[embedding_matrix],trainable=False)(input1)
    conv3=Conv1D(filters=32,kernel_size=8,activation='relu')(embedding3)
    drop3=Dropout(0.5)(conv3)
    pool3=MaxPooling1D(pool_size=2)(drop3)
    flat3=Flatten()(pool3)
    
    merged=concatenate([flat1,flat2,flat3])
    
    dense1=Dense(10,activation='relu')(merged)
    outputs=Dense(1,activation='sigmoid')(dense1)
    model=Model(inputs=[input1,input2,input3],outputs=outputs)
    
    model.compile(loss='binary_crossentropy',optimizer='nadam',metrics=['accuracy'])
    
    model.summary()
    plot_model(model,show_shapes=True)
    
    return model

In [26]:
model=define_model(length,vocab_size)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 100)     14908100    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 100)     14908100    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 100)     14908100    input_1[0][0]                    
_______________________________________________________________________________________

In [27]:
X_train.shape

(37500, 150)

In [28]:
model.fit(X_train,y_train,epochs=10,batch_size=16,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f03a7373350>

In [29]:
model.evaluate([X_test,X_test,X_test],y_test)



[0.39076292514801025, 0.8447999954223633]

In [30]:
review=["""Good solid start of a new promising new horror/mystery show.

The story reminds me of classic adventure stories like Indiana Jones and gives constant suspension and couriosity. You want to know what happens next and what the mystery is about. It does not invent something completly new, but delievers an exciting premise.

The main protagonist is the heart of the show and carries the main attention of the spectator. In comparison the other side characters are boring and lack of an impact and can't compete with the actor of Refaat Ismail.

Where The Haunting of Bly Manor fails and tries to copies his own formula, this show creates a satisfying setting."""]
review=encode_doc(tokenizer,review,length)

z=model.predict(review)

In [31]:
print(z)

[[0.9635902]]


In [32]:
print(review)

[[    9  1072   285   269    51    82  2360    82   110   736    50     3
     16  1613  1651   269   273  1136   502     7  6484  1248    35   311
   1715  5516    35   138    93   289    46   131   470   281    35   131
      3   736   506  2103    11  2097   161 10383    62 38643    82    34
    678  1077   766     3   185  1990   506     3   449   269     3    50
     35  2531     3   185   599   269     3 10740    49  1880     3  1215
    412    32  1679   270    35   483   269   678  1443    35  5630   422
      3   215   269 39338  1195     3  2278   269 78994 11767   935    35
    411   289  4202   343 10216  2159     8    50  2086    51  2359   877
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]]
