In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.python.keras import models, layers, optimizers

# from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
# from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import re

%matplotlib inline


In [3]:
import tensorflow

In [4]:
from tensorflow.python.keras import models, layers, optimizers

In [6]:
from tensorflow.python.keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [7]:
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [8]:
import bz2

In [9]:
# Load the labelled reviews used for training and validation.
# The path is relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [10]:
# Preview the first rows to sanity-check the review text and sentiment columns.
df.head()

Unnamed: 0,review_clean,sentiment
0,this is a cute little book that is fairly shor...,1
1,too shooortt really when i started enjoying th...,1
2,i received a complimentary copy of this book i...,1
3,a loving sometimes heartbreaking story full of...,1
4,read all books havent been disappointed yet,1


In [11]:
# Count missing values per column before cleaning.
df.isna().sum()

review_clean    21
sentiment        0
dtype: int64

In [12]:
# Drop rows with any missing value; normalize_texts() calls .lower() on every
# review and would fail on NaN. This mutates `df` in place, so re-running the
# inspection cells afterwards shows the cleaned frame, not the raw one.
df.dropna(inplace=True)

In [14]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of train.csv for validation; the fixed random_state keeps the
# split repeatable across runs.
# NOTE(review): the split is not stratified on `sentiment` - consider
# `stratify=df.sentiment` if the classes are imbalanced.
train_data, test_data = train_test_split(df, test_size=0.2,random_state = 42)

In [17]:
# Separate each partition into model inputs (review text) and targets (sentiment).
train_texts, train_labels = train_data['review_clean'], train_data['sentiment']
test_texts, test_labels = test_data['review_clean'], test_data['sentiment']

In [18]:
# Pass 1: every non-word character (punctuation, whitespace, symbols) becomes a space.
NON_ALPHANUM = re.compile(r'[\W]')
# Pass 2: anything left that is not a lowercase ASCII letter, a digit or
# whitespace is deleted. The digit class used to be `0-1`, a typo that silently
# stripped the digits 2-9 from every review.
NON_ASCII = re.compile(r'[^a-z0-9\s]')


def normalize_texts(texts):
    """Lowercase each text and reduce it to ``[a-z0-9]`` runs separated by whitespace.

    Each text is lowercased, then every non-word character is replaced by a
    single space, then every remaining character outside ``a-z``, ``0-9`` and
    whitespace is removed. Underscores and non-ASCII letters count as word
    characters for the first pass, so they are deleted by the second pass
    without leaving a space behind (``"foo_bar"`` becomes ``"foobar"``).

    Args:
        texts: Iterable of ``str``. Non-string items (e.g. NaN) raise
            ``AttributeError`` on ``.lower()``, so drop missing values first.

    Returns:
        A new ``list`` of normalized strings, in the same order as ``texts``.
    """
    return [
        NON_ASCII.sub(r'', NON_ALPHANUM.sub(r' ', text.lower()))
        for text in texts
    ]
        
# Clean both partitions the same way; each pandas Series is replaced by a plain
# list of normalized strings (training partition first, then validation).
train_texts, test_texts = (
    normalize_texts(split) for split in (train_texts, test_texts)
)

In [19]:
# Vocabulary cap passed to the Tokenizer and reused as the Embedding input size
# in the model builders below.
MAX_FEATURES = 12000

# Fit the vocabulary on the training texts only, so the validation texts do not
# influence which words are kept.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)
# NOTE: these two lines rebind the names from lists of strings to lists of
# integer sequences, so this cell is not idempotent - re-run from the
# normalization cell rather than executing it twice.
train_texts = tokenizer.texts_to_sequences(train_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

In [20]:
# Pad every sequence to the length of the longest *training* sequence.
# NOTE(review): a single very long review sets the width of the whole input
# matrix; a percentile-based cap may be cheaper - confirm against the length
# distribution before changing.
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
# Validation sequences longer than MAX_LENGTH are cut down to MAX_LENGTH here.
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

In [21]:
def build_model():
    """Build and compile the 1-D convolutional sentiment classifier.

    Reads the notebook-level ``MAX_LENGTH`` (input width) and ``MAX_FEATURES``
    (embedding vocabulary size). The network is: embedding -> two
    conv/batch-norm/max-pool stages -> a third convolution -> global max
    pooling -> dense(100) -> a single sigmoid unit.

    Returns:
        A Keras ``Model`` compiled with RMSprop, binary cross-entropy loss and
        the ``binary_accuracy`` metric.
    """
    sequences = layers.Input(shape=(MAX_LENGTH,))
    features = layers.Embedding(MAX_FEATURES, 64)(sequences)

    # Two conv -> batch-norm -> max-pool stages; each pair below is
    # (convolution kernel width, pooling window) for one stage.
    for kernel_width, pool_width in ((3, 3), (5, 5)):
        features = layers.Conv1D(64, kernel_width, activation='relu')(features)
        features = layers.BatchNormalization()(features)
        features = layers.MaxPool1D(pool_width)(features)

    features = layers.Conv1D(64, 5, activation='relu')(features)
    features = layers.GlobalMaxPool1D()(features)
    # NOTE(review): Flatten right after global pooling looks redundant - confirm
    # before removing it.
    features = layers.Flatten()(features)
    features = layers.Dense(100, activation='relu')(features)
    predictions = layers.Dense(1, activation='sigmoid')(features)

    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['binary_accuracy'])
    return model
    
# Instantiate the CNN; it needs MAX_LENGTH and MAX_FEATURES from the cells above.
model = build_model()

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead


In [22]:
# Train the CNN for two epochs. validation_data is the 20% hold-out split from
# train.csv, not the separate test.csv set that is scored further down.
model.fit(
    train_texts, 
    train_labels, 
    batch_size=128,
    epochs=2,
    validation_data=(test_texts, test_labels) )

Train on 63983 samples, validate on 15996 samples
Epoch 1/2

Epoch 2/2



<tensorflow.python.keras._impl.keras.callbacks.History at 0x12ac76710>

In [23]:
# Load the separate evaluation set; the path is relative to the notebook's
# working directory.
df_test = pd.read_csv('test.csv')

In [24]:
# Count missing values per column in the evaluation set.
df_test.isna().sum()

review_clean    5
sentiment       0
dtype: int64

In [26]:
# (rows, columns) of the evaluation set before the missing-value rows are dropped.
df_test.shape

(20000, 2)

In [28]:
# Drop rows with missing values, as was done for train.csv, so that
# normalize_texts() only ever sees strings. Mutates `df_test` in place.
df_test.dropna(inplace=True)

In [34]:
# Inputs (review text) and targets (sentiment) for the test.csv evaluation set.
test1_texts, test1_labels = df_test['review_clean'], df_test['sentiment']

In [35]:
# Apply the same text cleaning that the training data received.
test1_texts = normalize_texts(test1_texts)

In [36]:
# Reuse the tokenizer fitted on the training texts; it is not refitted here, so
# the word indices match what the models were trained on.
test1_texts = tokenizer.texts_to_sequences(test1_texts)

In [37]:
# Pad to the training-derived MAX_LENGTH so the input width matches the model;
# longer evaluation reviews are cut down to MAX_LENGTH.
test1_texts = pad_sequences(test1_texts, maxlen=MAX_LENGTH)

In [38]:
# Score the CNN on the test.csv evaluation set.
preds = model.predict(test1_texts)
# Accuracy and F1 need hard 0/1 labels, so the sigmoid scores are thresholded
# at 0.5 once and reused; ROC AUC is computed from the raw scores.
# NOTE(review): an F1 far above the ROC AUC usually points to class imbalance -
# check the label distribution before trusting accuracy or F1.
hard_preds = 1 * (preds > 0.5)
print('Accuracy score: {:0.4}'.format(accuracy_score(test1_labels, hard_preds)))
print('F1 score: {:0.4}'.format(f1_score(test1_labels, hard_preds)))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test1_labels, preds)))

Accuracy score: 0.9178
F1 score: 0.957
ROC AUC score: 0.7753


In [44]:
def build_rnn_model():
    """Build and compile the stacked-GRU sentiment classifier.

    Reads the notebook-level ``MAX_LENGTH`` (input width) and ``MAX_FEATURES``
    (embedding vocabulary size). The network is: embedding -> GRU(128)
    returning the full sequence -> GRU(128) -> dense(32) -> dense(100) -> a
    single sigmoid unit.

    Returns:
        A Keras ``Model`` compiled with RMSprop, binary cross-entropy loss and
        the ``binary_accuracy`` metric.
    """
    sequences = layers.Input(shape=(MAX_LENGTH,))
    hidden = layers.Embedding(MAX_FEATURES, 64)(sequences)

    # The first GRU returns its output at every timestep so the second GRU can
    # consume a sequence; the second returns only its final output.
    hidden = layers.GRU(128, return_sequences=True)(hidden)
    hidden = layers.GRU(128)(hidden)

    # Fully connected head, narrow layer first.
    for units in (32, 100):
        hidden = layers.Dense(units, activation='relu')(hidden)
    predictions = layers.Dense(1, activation='sigmoid')(hidden)

    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['binary_accuracy'])
    return model
    
# Instantiate the GRU model; it needs MAX_LENGTH and MAX_FEATURES from the cells above.
rnn_model = build_rnn_model()

In [45]:
# Train the GRU model for a single epoch on the same padded sequences and
# hold-out validation split that the CNN used.
rnn_model.fit(
    train_texts, 
    train_labels, 
    batch_size=128,
    epochs=1,
    validation_data=(test_texts, test_labels) )

Train on 63983 samples, validate on 15996 samples
Epoch 1/1



<tensorflow.python.keras._impl.keras.callbacks.History at 0x130e73cf8>