In [146]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import csv
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.layers import SimpleRNN, Input, Embedding, Dropout, Dense, Activation, LSTM
from keras.models import Model

In [157]:
train = pd.read_csv('data/reviews_train.csv').sample(frac=1)
test = pd.read_csv('data/reviews_test.csv')
train.head()

Unnamed: 0,text,label
21520,"Tho 35 years old, Groove Tube looks a lot like...",1
20883,Having heard quite positive reviews and having...,1
15487,***SPOILERS*** When undercover Brooklyn North ...,1
15872,This is not a profound movie; most of the plot...,1
15619,I loved this episode. It is so great that all ...,1


In [8]:
print(train.shape)
print(train.label.sum())

print(test.shape)
print(test.label.sum())

(24987, 2)
12490
(24989, 2)
12495


In [77]:
def tokenize_lemmatize_data(df, lemmatize=False):
    df2 = df.copy(deep=True)
    df2["text"] = df2['text'].apply(lambda x:  re.sub(r"\'.+?", '', x))
    pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
    tokenizer = RegexpTokenizer(pattern)
    df2["text"] = df2["text"].apply(lambda x: tokenizer.tokenize(x.lower()))
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        df2['text'] = df2['text'].apply(lambda x: [lemmatizer.lemmatize(item) for item in x])
    return df2

In [158]:
train_word_list = tokenize_lemmatize_data(train)
test_word_list = tokenize_lemmatize_data(test)
train_word_list.head()

Unnamed: 0,text,label
21520,"[tho, 35, years, old, ,, groove, tube, looks, ...",1
20883,"[having, heard, quite, positive, reviews, and,...",1
15487,"[*, *, *, spoilers, *, *, *, when, undercover,...",1
15872,"[this, is, not, a, profound, movie, ;, most, o...",1
15619,"[i, loved, this, episode, ., it, is, so, great...",1


In [97]:
def load_fb_embeddings(dict_dir, max_dict_lenght):
    dict_df = pd.read_csv(dict_dir, sep=';', nrows=max_dict_lenght, skiprows=1, encoding='utf8', quoting=csv.QUOTE_NONE,
                          header=None)
    embeddings_word_index = dict_df.iloc[:, 0]
    embeddings_word_index = pd.Series(embeddings_word_index.index, index=embeddings_word_index)
    embeddings_word_index = embeddings_word_index.to_dict()
    # throw out the first column (with words) and last one (with nans)
    embeddings_matrix = np.array(dict_df.iloc[:, 1:-1])
    return embeddings_word_index, embeddings_matrix

def get_text_indeces(word_list, emb_index):
    return [emb_index.get(word, 0) for word in word_list]

In [94]:
embeddings_word_index, embedding_matrix = load_fb_embeddings('data/wiki10k.csv', None)

In [178]:
train_tokens = train_word_list.text.apply(get_text_indeces, args=(embeddings_word_index,))
test_tokens = test_word_list.text.apply(get_text_indeces, args=(embeddings_word_index,))

In [179]:
pad_train_tokens = pad_sequences(train_tokens, maxlen=100, dtype='int32', padding='post', value=0.0)
pad_test_tokens = pad_sequences(test_tokens, maxlen=100, dtype='int32', padding='post', value=0.0)

In [198]:
def create_rnn_model(input_shape, emb_matrix):
    sentence_indices = Input(shape=input_shape, dtype='int32')
    
    # make an embedding layer
    embedding_layer = Embedding(10000, 300, trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    
    embs = embedding_layer(sentence_indices)
    X = SimpleRNN(100, activation='tanh', return_sequences=True)(embs)
    X = SimpleRNN(100, activation='tanh', return_sequences=False)(X)
    X = Dense(1, activation='sigmoid')(X)
    
    model = Model(inputs=sentence_indices, outputs=X)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [199]:
model = create_rnn_model((100,), embedding_matrix)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_25 (InputLayer)        (None, 100)               0         
_________________________________________________________________
embedding_25 (Embedding)     (None, 100, 300)          3000000   
_________________________________________________________________
simple_rnn_30 (SimpleRNN)    (None, 100, 100)          40100     
_________________________________________________________________
simple_rnn_31 (SimpleRNN)    (None, 100)               20100     
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 101       
Total params: 3,060,301
Trainable params: 60,301
Non-trainable params: 3,000,000
_________________________________________________________________


In [200]:
model.fit(pad_train_tokens, train.label, batch_size=64, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f97abcbc9b0>

In [201]:
model.evaluate(pad_train_tokens, train.label)
model.evaluate(pad_test_tokens, test.label)



[0.6587067715541298, 0.5992636760269886]