In [1]:
import random
import re
import json
import yaml
import pandas as pd
import numpy as np
import seaborn as sns
import gensim
import pickle

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.utils import to_categorical

from CustomGenerator import CustomGenerator
from tools import configuration_fn
from dl_tools import build_w2v_model
from tools import read_glove_vecs,sentences_to_indices,pretrained_embedding_layer,convert_to_one_hot

Using TensorFlow backend.


ModuleNotFoundError: No module named 'CustomGenerator'

Read configuration from yml file

In [None]:
# Load the run configuration. The context manager closes the YAML file handle
# as soon as parsing is done instead of leaving it open until garbage collection.
with open(configuration_fn) as cfg_file:
    cfg = yaml.safe_load(cfg_file)

fn = cfg['input']               # path of the tab-separated data set read below
MODEL_FILENAME = cfg['output']  # path the trained model is written to at the end

In [None]:
# Seed both global RNGs from the configured seed so that anything drawing from
# NumPy's or Python's global generator is repeatable on a fresh run.
# NOTE(review): the Keras/TensorFlow backend keeps its own RNG, which is not
# seeded here -- model training may still vary between runs.
np.random.seed(cfg['RANDOM_SEED'])
random.seed(cfg['RANDOM_SEED'])

More configuration

In [None]:
# Tunable constants, listed alphabetically.
# NOTE(review): none of these names is referenced in the visible cells of this
# notebook -- confirm they are still needed before relying on them.
CLOSEST_COUNT = 10
NUMBER_OF_CLUSTERS = 15
NUMBER_WORDS_TO_REVIEW = 15
WORD_TO_TEST = 'washington'
num_words = 2_000

In [None]:
# Echo the configured input path so the run output records which file is read.
print(fn)

In [None]:
# Read the tab-separated input file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the text are kept as ordinary characters.
data = pd.read_csv(fn, sep='\t', quoting=3)

Review data

In [None]:
print(f'we have {len(data)} lines of data')
# Shuffle the rows. An explicit random_state makes the shuffle -- and therefore
# the "first N positives" picked for the balanced subset further down -- the
# same every time this cell runs, not only on a fresh Restart & Run All.
data = data.sample(frac=1, random_state=cfg['RANDOM_SEED'])

In [None]:
# Preview the first rows of the shuffled frame.
data.head()

# Cleanup and features: 
    lowercase
    remove digits and punctuation (keep only a-z and whitespace)
    parse annotation
    remove redundancies
    remove empty column extras

In [None]:
# Normalised text: lower-case, then drop every character that is not a-z or
# whitespace. The pattern is a raw string (no invalid "\s" escape) and
# regex=True is explicit: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently skip the whole cleanup.
data['content'] = data['verified_reviews'].str.lower()
data['content'] = data['content'].str.replace(r'[^a-z\s]', '', regex=True)

# Binary label: int(rating / 5) is 0 for ratings below 5 and 1 for a rating of 5.
data['annotation'] = data['rating'].apply(lambda x: int(x/5))

# Keep only the first row for each distinct cleaned text. Reassigning instead
# of using inplace=True keeps the cell's effect explicit.
data = data.drop_duplicates(subset="content")

In [None]:
# Class balance of the binary label before subsampling.
sns.countplot(data=data, x='annotation')

Take a small balanced subset of the data, include positives and negatives

In [None]:
# Number of rows labelled 0; the next cell uses it to cap the label-1 rows.
number_of_offsentive_annotations = len(data.loc[data['annotation'] == 0])
print("bad reviews:", number_of_offsentive_annotations)

In [None]:
# Balanced subset: every label-0 row plus at most the same number of label-1
# rows, taken from the front of the current row order.
negatives = data[data['annotation'] == 0]
positives = data[data['annotation'] == 1]
data = pd.concat([negatives, positives[:number_of_offsentive_annotations]])

In [None]:
# Class balance of the binary label after subsampling.
sns.countplot(data=data, x='annotation')

In [None]:
# Size and first rows of the balanced subset.
print(f'We now have {data.shape[0]} data points')
data.head()

In [None]:
# Character length of every cleaned text, and the mean of those lengths.
lens = data['content'].str.len()
avg = lens.mean()
print(f'Mean tweet length: {avg:10.1f}')

In [None]:
# Histogram of the per-text character lengths computed above.
lens.hist()

Build train / test set

# TF-IDF + SVM

In [None]:
# Stratified train/test split on the integer labels.
# The test fraction is read from the same config key the LSTM section uses.
Y = data['annotation'].values
X_train, X_test, y_train, y_test = train_test_split(
    data['content'].values,
    Y,
    test_size=cfg['TEST_FRACTION2_LSTM_W2V'],
    random_state=cfg['RANDOM_SEED'],
    stratify=Y,
)

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> linear SVM classifier.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
    ]
)

In [None]:
# Fit all three pipeline stages on the training texts and labels.
pipeline.fit(X_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out split: per-class report
# (printed) followed by the confusion matrix (shown as the cell's value).
categories = ['negative' , 'positive']
predicted = pipeline.predict(X_test)

report = metrics.classification_report(
    y_test,
    predicted,
    target_names=categories,
)
print(report)
metrics.confusion_matrix(y_test, predicted)

In [None]:
# Pull the fitted classifier and vectorizer out of the pipeline by step name
# (same objects as positions 2 and 0 of the pipeline).
count_vect = pipeline.named_steps['vect']
model = pipeline.named_steps['clf']

In [None]:
# Print the vocabulary terms with the largest SVM coefficients.
# The number of terms is capped at the vocabulary size, so a small corpus no
# longer raises IndexError, and argsort selects exactly the top N terms (the
# previous strict ">" comparison dropped the N-th one).
TOP_N = 100
coefs = model.coef_[0]
top_n = min(TOP_N, len(coefs))

# Indices of the top_n largest coefficients, re-sorted so the terms print in
# vocabulary-index order as before.
top_idx = np.sort(np.argsort(coefs)[len(coefs) - top_n:])

# Reverse lookup: feature index -> term.
revdict = {idx: term for term, idx in count_vect.vocabulary_.items()}
print([revdict[i] for i in top_idx])

In [None]:
# Decision-function scores for the test texts; they feed roc_curve below.
y_score = pipeline.decision_function(X_test)

In [None]:
# Compute the ROC curve and the area under it from the decision scores.
# (The earlier dict() initialisations were overwritten immediately and the
# per-class loop was commented out, so both have been removed.)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

In [None]:
# ROC plot below adapted from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

In [None]:
# ROC curve of the SVM pipeline against the chance diagonal.
lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange',
        lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

# LSTM + W2V

In [None]:
# Re-split for the LSTM, this time with labels passed through
# convert_to_one_hot(..., 2). Same test fraction and seed as the SVM split.
Y = convert_to_one_hot(data['annotation'].values, 2)
X_train, X_test, y_train, y_test = train_test_split(
    data['content'].values,
    Y,
    test_size=cfg['TEST_FRACTION2_LSTM_W2V'],
    random_state=cfg['RANDOM_SEED'],
    stratify=Y,
)

In [None]:
%%time
# Load the GloVe vectors from a hard-coded relative path (the file name suggests
# 50-dimensional vectors -- TODO confirm). The three return values are presumably
# word->index, index->word and word->vector mappings; verify against tools.py.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [None]:
# Report the largest word count among the training texts. The previous
# expression picked the longest text by *characters* and then counted its
# words, which is not necessarily the maximum word count.
max_words_in_train = max((len(text.split()) for text in X_train), default=0)
print(max_words_in_train)

# Fixed sequence length handed to sentences_to_indices and build_w2v_model;
# deliberately not derived from the statistic printed above.
maxLen = 25

In [None]:
# Convert both text splits with sentences_to_indices using the GloVe word index
# and maxLen (presumably yields fixed-length index sequences -- verify in tools.py).
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
X_test_indices =  sentences_to_indices(X_test,  word_to_index, maxLen)

In [None]:
# Word count of one training text. Index 380 is hard-coded, so this raises
# IndexError when the training split has fewer than 381 rows.
print(len(X_train[380].split()))

In [None]:
# Character length of the longest training text.
len(max(X_train, key=len))

In [None]:
# Number of texts in each split.
print(f'Size of train: {len(X_train)} size of test: {len(X_test)}')

In [None]:
# Build the network from the GloVe mappings and maxLen via the project helper
# build_w2v_model (architecture is defined in dl_tools, not shown here), then
# print its layer summary.
model_w2v = build_w2v_model(word_to_vec_map, word_to_index, maxLen)
model_w2v.summary()

In [None]:
%%time
generator = CustomGenerator(X_train_indices, y_train)
history = model_w2v.fit_generator(generator=generator, steps_per_epoch=(len(X_train_indices) // generator.batch_size),
                    epochs = cfg['LSTM_W2V_EPOCHS'], verbose = 1, shuffle=True, validation_data=(X_test_indices, y_test))


Calculate accuracy on the test set

In [None]:
%%time
# Evaluate on both splits; only the test result is printed.
# NOTE(review): the "_mse" names look misleading -- evaluate() returns whatever
# loss/metrics build_w2v_model compiled the model with; confirm in dl_tools.
test_mse  = model_w2v.evaluate(X_test_indices,  y_test)
train_mse = model_w2v.evaluate(X_train_indices, y_train)
print(test_mse)

In [None]:
# Training vs. validation loss per epoch, read from the fit history.
fig, ax = plt.subplots()
ax.set_title('Loss')
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch, read from the fit history.
fig, ax = plt.subplots()
ax.set_title('Accuracy')
for key, label in (('accuracy', 'train'), ('val_accuracy', 'test')):
    ax.plot(history.history[key], label=label)
ax.legend()
plt.show()

Classification report and confusion matrix on the test set

In [None]:
%%time
pred = model_w2v.predict(X_test_indices)
pred_indices = np.argmax(pred, axis=1)
classes = np.array(range(0, 2))
predicted = classes[pred_indices]
yt = [int(x[1]) for x in y_test]
print(metrics.classification_report(yt, predicted))
print(metrics.confusion_matrix(yt, predicted))

Example

In [None]:
# Change the sentence below to see your prediction. Make sure all the words are in the GloVe embeddings.
x_test = np.array(['please go and the arrive'])

# Convert the example sentence itself. The previous version printed and
# predicted on the whole test set (X_test_indices) and never used x_test.
example_indices = sentences_to_indices(x_test, word_to_index, maxLen)
print(example_indices)
print(x_test[0] + ' ' + str(model_w2v.predict(example_indices)))

See the mislabelled examples

In [None]:
# Print every test text whose arg-max prediction differs from its true class,
# followed by the raw prediction row and the predicted class index.
for i, text in enumerate(X_test):
    probs = pred[i]
    num = np.argmax(probs)
    if num == yt[i]:
        continue
    print( str(text) + ' prediction: '+ str(probs) + str(num))

# Output

In [None]:
# Write the trained model to the configured output path. The context manager
# flushes and closes the file deterministically; the bare open() was never closed.
# NOTE(review): Keras models are not guaranteed to pickle cleanly; the
# framework-native model_w2v.save(...) is the usual alternative, but it would
# change the output format that downstream loaders expect -- confirm before switching.
with open(MODEL_FILENAME, 'wb') as model_file:
    pickle.dump(model_w2v, model_file)