In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Activation, LSTM, Dropout, Dense, Flatten, Embedding, Conv1D, Input
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
imdb_data = pd.read_csv('../data/IMDBDataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
imdb_data.sentiment.value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [9]:
X = imdb_data['review']
y = pd.get_dummies(imdb_data.sentiment, prefix='sent', drop_first=True).values
y.shape

(50000, 1)

In [10]:
def clean_text(doc):
    document = remove_tags(doc)
    
    document = re.sub('[^a-zA-Z]', ' ', document)
    
    document = re.sub(r'\s+[^a-zA-Z]\s+', ' ', document)
    
    document = re.sub(r'\s+', ' ', document)
    
    return document

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(document):
    return TAG_RE.sub('', document)

In [11]:
X_sentences = []
reviews = list()

for rev in X:
    X_sentences.append(clean_text(rev))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_sentences, y, test_size=0.2, random_state=42)

In [13]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [14]:
vocabulary_size = len(tokenizer.word_index) + 1
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [15]:
embedded_dict = dict()
glove_embeddings = open('../data/glove.6B.100d.txt', encoding='utf-8')

In [16]:
for embeddings in glove_embeddings:
    embeddings_tokens = embeddings.split()
    emb_word = embeddings_tokens[0]
    emb_vector = np.asarray(embeddings_tokens[1:], dtype='float32')
    embedded_dict[emb_word] = emb_vector
glove_embeddings.close()

In [17]:
embedd_mat = np.zeros((vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embedded_dict.get(word)
    if embedding_vector is not None:
        embedd_mat[index] = embedding_vector
embedd_mat.shape

(92547, 100)

In [18]:
def enbedded_model():
    embedding_inputs = Input(shape=(maxlen))
    embedding_layer = Embedding(vocabulary_size, 100, weights=[embedd_mat], trainable=False)(embedding_inputs)
    
    conv1 = Conv1D(128, 3, strides=2, activation='relu')(embedding_layer)
    conv2 = Conv1D(64, 3, strides=2, activation='relu')(conv1)
    
    flatten_layer = Flatten()(conv2)
    
    drop1 = Dropout(0.2)(flatten_layer)

    dense1 = Dense(512, activation='relu')(drop1)
    drop2 = Dropout(0.2)(dense1)
    

    output_layer = Dense(1, activation='sigmoid')(drop2)
    return Model(inputs=embedding_inputs, outputs=output_layer)

model = enbedded_model()

In [19]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 100)          9254700   
_________________________________________________________________
conv1d (Conv1D)              (None, 49, 128)           38528     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 24, 64)            24640     
_________________________________________________________________
flatten (Flatten)            (None, 1536)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1536)              0         
_________________________________________________________________
dense (Dense)                (None, 512)               786944

In [20]:
history = model.fit(X_train, y_train, batch_size=64, epochs=10, verbose=1, validation_split=0.2)
score = model.evaluate(X_test, y_test, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
print(score[0])
print(score[1])

0.6133042573928833
0.8138999938964844
