In [1]:
import re
import os
import time
import gc
import random
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  app.launch_new_instance()


In [2]:
EMBEDDING_DIM = 300
MAX_LEN = 300
FASTTEXT_EMBEDDING_PATH = "/content/crawl-300d-2M.vec"

In [3]:
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
!unzip /content/Dataset.zip

Archive:  /content/Dataset.zip
replace twitter-test.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: twitter-test.tsv        
replace twitter-dev.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: twitter-dev.tsv         
replace twitter-train.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: twitter-train.tsv       


In [4]:
def loadDataframe(path):
  df = pd.read_csv(path, sep = '\t', names=["X", "sentiment", "text"])
  df = df[['text','sentiment']]
  return df

train_path = '/content/twitter-train.tsv'
test_path = '/content/twitter-test.tsv'
validate_path = '/content/twitter-dev.tsv'
train_df = loadDataframe(train_path)
test_df = loadDataframe(test_path)
val_df = loadDataframe(validate_path)

In [5]:
def preprocess(data):
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
    return data

x_train = preprocess(train_df['text'])
x_test = preprocess(test_df['text'])
x_val = preprocess(val_df['text'])

sentmap = {'positive' : 1, 'negative' : -1, 'neutral' : 0}
y_train = train_df['sentiment'].map(sentmap)
y_test = test_df['sentiment'].map(sentmap)
y_val = val_df['sentiment'].map(sentmap)

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test) + list(x_val))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_val = tokenizer.texts_to_sequences(x_val)
x_train = pad_sequences(x_train, maxlen=MAX_LEN)
x_test = pad_sequences(x_test, maxlen=MAX_LEN)
x_val = pad_sequences(x_val, maxlen=MAX_LEN)

max_features = len(tokenizer.word_index) + 1

In [7]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

embedding_matrix, unknown_words = build_matrix(tokenizer.word_index, FASTTEXT_EMBEDDING_PATH)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [8]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [9]:
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_LEN, trainable = False, mask_zero = True))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(EMBEDDING_DIM, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy', f1_m, precision_m, recall_m])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          9708000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 300, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dense (Dense)                (None, 3)                 903       
Total params: 10,430,103
Trainable params: 722,103
Non-trainable params: 9,708,000
_________________________________________________________________
None


In [10]:
from keras.utils import to_categorical
def one_hot(y):
    y += 1
    y = to_categorical(y)
    return y
y_train = one_hot(y_train)
y_test = one_hot(y_test)
y_val = one_hot(y_val)

In [11]:
batch_size = 1024
model.fit(x_train, y_train, epochs = 20, batch_size=batch_size, verbose = 2)

Epoch 1/20
10/10 - 18s - loss: 0.9853 - accuracy: 0.4967 - f1_m: 0.3268 - precision_m: 0.5635 - recall_m: 0.2479
Epoch 2/20
10/10 - 12s - loss: 0.8990 - accuracy: 0.5815 - f1_m: 0.5150 - precision_m: 0.6487 - recall_m: 0.4286
Epoch 3/20
10/10 - 12s - loss: 0.8547 - accuracy: 0.5996 - f1_m: 0.5509 - precision_m: 0.6635 - recall_m: 0.4714
Epoch 4/20
10/10 - 13s - loss: 0.8141 - accuracy: 0.6238 - f1_m: 0.5890 - precision_m: 0.6804 - recall_m: 0.5193
Epoch 5/20
10/10 - 12s - loss: 0.7993 - accuracy: 0.6333 - f1_m: 0.6005 - precision_m: 0.6886 - recall_m: 0.5325
Epoch 6/20
10/10 - 12s - loss: 0.7843 - accuracy: 0.6424 - f1_m: 0.6083 - precision_m: 0.6939 - recall_m: 0.5418
Epoch 7/20
10/10 - 13s - loss: 0.7854 - accuracy: 0.6404 - f1_m: 0.6133 - precision_m: 0.6946 - recall_m: 0.5492
Epoch 8/20
10/10 - 13s - loss: 0.7689 - accuracy: 0.6467 - f1_m: 0.6252 - precision_m: 0.6959 - recall_m: 0.5676
Epoch 9/20
10/10 - 13s - loss: 0.7644 - accuracy: 0.6541 - f1_m: 0.6289 - precision_m: 0.6987 - 

<tensorflow.python.keras.callbacks.History at 0x7f710e4f3f10>

In [12]:
score, acc, f1, precision, recall = model.evaluate(x_test, y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
print("Precision: %.2f" % (precision))
print("Recall: %.2f" % (recall))

4/4 - 2s - loss: 0.6864 - accuracy: 0.6938 - f1_m: 0.6855 - precision_m: 0.7284 - recall_m: 0.6473
score: 0.69
acc: 0.69
F1-score: 0.69
Precision: 0.73
Recall: 0.65


In [13]:
score, acc, f1, precision, recall = model.evaluate(x_val, y_val, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))
print("F1-score: %.2f" % (f1))
print("Precision: %.2f" % (precision))
print("Recall: %.2f" % (recall))

2/2 - 1s - loss: 0.7481 - accuracy: 0.6844 - f1_m: 0.6709 - precision_m: 0.7250 - recall_m: 0.6243
score: 0.75
acc: 0.68
F1-score: 0.67
Precision: 0.73
Recall: 0.62
