In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

In [2]:
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, ZeroPadding1D, Embedding

from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.initializers import Constant
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

##### Configuration variables

In [3]:
# num_words = 15000
epochs = 5
embedding_dim = 300            # If you use pre trained embedding matrix, Make sure this variable is equal with it
batch_size = 64
num_class = 3

##### Read dataset and getting ready

In [4]:
df = pd.read_csv('data/clean_dataset.csv')    # Read movie comments dataset             
df = df.dropna()                         # Remove none type samples
df = df.sample(frac=1) 
df.head(5)

Unnamed: 0,comment,point
2918,woody allen alem adam yine ortaya izlemesi key...,2
16306,kafaya koyduğunu yapabilmek i̇nsanoğlunun sahi...,2
14998,harika farklı içinde kullanılan animasyonlar h...,2
1988,ben fılmı cıktıgı ılk zamanlarda ızlemıstım he...,2
35178,saçma filmdi kesinlikle zaman kaybı denzel bil...,0


##### Split dataset and make label categorical

In [5]:
target = df['point'].values.tolist()                 # Get all points as a list
data = df['comment'].values.tolist()                 # Get all comments as a list

target = to_categorical(target, num_classes=num_class)                      # Convert label into one-hot vector

cutoff = int(len(target) * 0.8)                      # Split dataset as train and test
x_train, x_test = data[:cutoff], data[cutoff:]
y_train, y_test = target[:cutoff], target[cutoff:]

print(x_train[2340])
print(y_train[2340])

oyuncular cekim cok cok amatördü hikayeyinin anlami yok yarim yamalak sacma bitis oldu begenemedim
[1. 0. 0.]


##### Tokenize data

In [6]:
# tokenizer = Tokenizer(num_words=num_words)    # Create tokenizer (Take only first 14000 words)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)              # Create tokens from our dataset
tokenizer.word_index                      # Display tokens and their index

{'iyi': 1,
 'güzel': 2,
 'olarak': 3,
 'bence': 4,
 'filmde': 5,
 'var': 6,
 'değil': 7,
 'ben': 8,
 'olan': 9,
 'kötü': 10,
 'yok': 11,
 'sonra': 12,
 'son': 13,
 'gerçekten': 14,
 'filme': 15,
 'ilk': 16,
 'bile': 17,
 'biraz': 18,
 'olmuş': 19,
 'zaman': 20,
 'başarılı': 21,
 'kesinlikle': 22,
 'fazla': 23,
 'büyük': 24,
 'zaten': 25,
 'sadece': 26,
 'tek': 27,
 'tavsiye': 28,
 'yine': 29,
 'ancak': 30,
 'harika': 31,
 'özellikle': 32,
 'böyle': 33,
 'olduğunu': 34,
 'olduğu': 35,
 'tam': 36,
 'fakat': 37,
 'izledim': 38,
 'senaryo': 39,
 'konu': 40,
 'farklı': 41,
 'komedi': 42,
 'cok': 43,
 'aksiyon': 44,
 'aynı': 45,
 'pek': 46,
 'oldukça': 47,
 'bana': 48,
 'şekilde': 49,
 'filmdi': 50,
 'sinema': 51,
 'benim': 52,
 'yer': 53,
 'izleyin': 54,
 'göre': 55,
 'önce': 56,
 'oyuncu': 57,
 'korku': 58,
 'yönetmen': 59,
 'iki': 60,
 'rağmen': 61,
 'ayrıca': 62,
 'mükemmel': 63,
 'filmleri': 64,
 'ederim': 65,
 'olsa': 66,
 'konusu': 67,
 'filmden': 68,
 'eğlenceli': 69,
 'olması': 70,


In [7]:
x_train_tokens = tokenizer.texts_to_sequences(x_train)    # Convert comments using tokens
x_test_tokens = tokenizer.texts_to_sequences(x_test)

print(x_train[2340])
print(x_train_tokens[2340])

oyuncular cekim cok cok amatördü hikayeyinin anlami yok yarim yamalak sacma bitis oldu begenemedim
[120, 9947, 43, 43, 27521, 98844, 27522, 11, 6703, 9524, 1366, 67403, 90, 33541]


##### Calculate some statistic about data and decide the size of the comment

In [8]:
# Calculate length of the comments and choose common length
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

max_tokens = int(np.mean(num_tokens) + 1.5 * np.std(num_tokens))
print('mean: ', np.mean(num_tokens))
print('std: ', np.std(num_tokens))
print('max_token_length: ', max_tokens)
print('cover range: ', np.sum(num_tokens < max_tokens) / len(num_tokens))

mean:  33.50765392627041
std:  59.82182649168789
max_token_length:  123
cover range:  0.960313423436516


#####  Make all comments the same size (Padding)

In [9]:
# Make all comments in same length with padding
x_train_tokens = pad_sequences(x_train_tokens, maxlen=max_tokens)
x_test_tokens = pad_sequences(x_test_tokens, maxlen=max_tokens)
print(x_train_tokens.shape)
print(x_test_tokens.shape)

(66057, 123)
(16515, 123)


##### Get pre-created embedding vectors and make an embedding matrix

In [10]:
embeddings_index = {}
f = open('embedding_vectors.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [11]:
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
num_words

220641

#### Create and Compile Neural Network Model

In [12]:
model = Sequential()
                    # input_dim: # of words, output_dim: # of embedding vector size, input_len: max token size
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=max_tokens,
                    trainable=False,
                    name='embedding_layer'))

model.add(Conv1D(64, 5, strides=1, activation='relu', padding='same'))
model.add(MaxPooling1D(3, strides=3))
model.add(ZeroPadding1D(1))
model.add(Conv1D(128, 5, strides=1, activation='relu', padding='same'))
model.add(MaxPooling1D(3, strides=3))
model.add(ZeroPadding1D(1))
model.add(Conv1D(256, 5, strides=1, activation='relu'))
model.add(MaxPooling1D(3, strides=3))

model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_class, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-4), metrics=['accuracy'])
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 123, 300)          66192300  
_________________________________________________________________
conv1d (Conv1D)              (None, 123, 64)           96064     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 41, 64)            0         
_________________________________________________________________
zero_padding1d (ZeroPadding1 (None, 43, 64)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 43, 128)           41088     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 14, 128)           0         
_________________________________________

##### Train, Test and Save model

In [13]:
model.fit(x_train_tokens, y_train, validation_split=0.1, epochs=epochs, batch_size=batch_size)

Train on 59451 samples, validate on 6606 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1c32300b088>

In [14]:
model.evaluate(x_test_tokens, y_test)



[0.7327419369176702, 0.6691493]

In [15]:
def predict_review(review):
    classes = ['negative', 'normal', 'positive']
    review_tokens = tokenizer.texts_to_sequences([review])
    review_pad = pad_sequences(review_tokens, maxlen=max_tokens)
    pred = model.predict(review_pad)
    print(pred)
    class_idx = np.argmax(pred)
    return classes[class_idx]

In [17]:
print(predict_review('ilk başlar ken kötü aksiyon hissi veriyo gittikce merak uyandırıyor bence izlenmezze bişey kaybetilmez'))
print(predict_review('tekrar filmerinin ilklerini aratığı tezini destekliyor yinede kötü değil'))
print(predict_review('internetten yorumlara baktım gittim hayatimda izlediğim berbat filmlerden konu saçma oyunculuk kötü ben pişman oldum'))
print(predict_review('i̇zlediğim kötü filmler arasında olsunda nolursa olsun gidicek arkadaşlara başka filme gitmelerini öneririm'))
print(predict_review('dylan minnette olduğu izlediğim sadece evde geçmesi biraz sıkıcı olabilir konu sürüklemesi falan harika'))


[[0.14561127 0.54823905 0.30614966]]
normal
[[0.07386654 0.6931705  0.23296297]]
normal
[[0.9301333  0.06310222 0.00676446]]
negative
[[0.41087717 0.2922061  0.2969167 ]]
negative
[[0.02168684 0.2733658  0.7049473 ]]
positive
[[0.39184412 0.54884225 0.05931357]]
normal
