In [1]:
import gzip
import gensim 
import logging

import numpy as np
import pandas as pd
import pickle
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from joblib import dump, load

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import scipy as sp

  from numpy.core.umath_tests import inner1d
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# load labels and text
def _read_until_blank(path):
    """Return the lines of the UTF-8 text file at `path`, up to the first empty line.

    The whole file is read and split on newline characters. The empty string that
    follows a trailing newline ends the scan; an empty line in the middle of the
    file ends it as well, so anything after such a line is ignored.
    """
    # The context manager closes the handle as soon as the content is read;
    # a bare open(...).read() leaves the file open until garbage collection.
    with open(path, encoding="utf8") as handle:
        raw = handle.read()
    lines = []
    for line in raw.split('\n'):
        if line == '':
            break
        lines.append(line)
    return lines


neg_texts = _read_until_blank('data/train_neg_full.txt')
pos_texts = _read_until_blank('data/train_pos_full.txt')

# Negative examples come first (label -1), then positive examples (label 1).
texts = neg_texts + pos_texts
labels = [-1] * len(neg_texts) + [1] * len(pos_texts)

In [3]:
# One row per example: the raw text and its -1 / 1 label, in loading order.
trainDF = pd.DataFrame({'text': texts, 'label': labels})

In [4]:
# Hold out 20% of the rows for validation. The seed is fixed so that the split
# (and every number computed from it) is the same after a kernel restart.
SPLIT_SEED = 42
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'],
    trainDF['label'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [5]:
# label encode the target variable
# The mapping is learned from the training labels only and then re-used for the
# validation labels. Re-fitting on the validation labels could produce a
# different mapping if the two sets did not contain the same classes.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)

In [6]:
# Convert the GloVe vectors file into a word2vec-format text file on disk.
glove2word2vec(
    glove_input_file='data/glove.twitter.27B.25d.txt',
    word2vec_output_file='data/glove.twitter.27B.25d_word2vect.txt',
)

(1193514, 25)

In [7]:
# Read the converted (text, non-binary) word2vec file into keyed vectors.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/glove.twitter.27B.25d_word2vect.txt',
    binary=False,
)

In [8]:
# Fit a tokenizer on every text in trainDF and keep its word -> index mapping.
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# Turn each split into token-index sequences, padded to a common length of 70.
train_seq_x, valid_seq_x = (
    sequence.pad_sequences(token.texts_to_sequences(split), maxlen=70)
    for split in (train_x, valid_x)
)

In [9]:
# create token-embedding mapping
# Row i holds the pre-trained vector of the word whose index in word_index is i.
# Words without a pre-trained vector keep an all-zero row. The extra row
# (index 0) is never written here -- presumably reserved for padding; confirm.
embedding_dim = model.vector_size  # width of the loaded vectors, instead of a hard-coded 25
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    try:
        # get_vector signals a missing word by raising KeyError, so no None check is needed.
        embedding_matrix[i] = model.get_vector(word)
    except KeyError:
        # No pre-trained vector for this word: leave its row as zeros.
        continue

In [10]:
def create_rnn_lstm():
    """Build and compile the LSTM classifier.

    Reads the module-level `word_index` and `embedding_matrix`. The network takes
    sequences of length 70, embeds them with a trainable Embedding layer
    initialised from `embedding_matrix`, applies spatial dropout, an LSTM with
    100 units, a 50-unit ReLU layer with dropout, and a single sigmoid output.

    Returns:
        The model, compiled with the Adam optimizer and binary cross-entropy loss.
    """
    vocab_rows = len(word_index) + 1
    embedding_dim = embedding_matrix.shape[1]

    # Input: one sequence of 70 token indices per example.
    tokens_in = layers.Input((70, ))

    # Embedding initialised from the pre-built matrix, followed by spatial dropout.
    embedded = layers.Embedding(
        vocab_rows,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    )(tokens_in)
    embedded = layers.SpatialDropout1D(0.3)(embedded)

    # Recurrent layer.
    recurrent = layers.LSTM(100)(embedded)

    # Dense head ending in a single sigmoid unit.
    hidden = layers.Dense(50, activation="relu")(recurrent)
    hidden = layers.Dropout(0.25)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    lstm_model = models.Model(inputs=tokens_in, outputs=probability)
    lstm_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return lstm_model

In [11]:
# Build a fresh model and train it for a single pass over the training sequences.
lstm_classifier = create_rnn_lstm()

lstm_classifier.fit(x=train_seq_x, y=train_y, epochs=1)

Epoch 1/1
 262336/2000000 [==>...........................] - ETA: 2:22:59 - loss: 0.4458

KeyboardInterrupt: 

In [None]:
lstm_predictions = lstm_classifier.predict(valid_seq_x)

# Binarise the predictions in place: values >= 0.5 become 1, values < 0.5 become 0.
positive_mask = lstm_predictions >= 0.5
negative_mask = lstm_predictions < 0.5
lstm_predictions[positive_mask] = 1
lstm_predictions[negative_mask] = 0

# Compare against the validation labels arranged as a column, and count mismatches.
expected_column = np.asarray(valid_y).reshape(-1, 1)
mismatch_count = np.count_nonzero(expected_column - lstm_predictions.astype(int))
valid_percent = 100 * (1 - mismatch_count / len(lstm_predictions))
print('local results: ' + str(valid_percent) + '%')

In [None]:
# Read the test texts, stopping at the first empty line (this also covers the
# empty string that follows a trailing newline).
test_texts = []
# The context manager closes the file once its content has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_raw = test_file.read()
for line in test_raw.split('\n'):
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame()
testDF['text'] = test_texts
# Same tokenizer and padding length as used for the training sequences.
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

lstm_test_predictions = lstm_classifier.predict(test_seq_x)
# Threshold with the same ">= 0.5 is positive" rule used when scoring the
# validation set. np.round rounds half to even, so it would send exactly 0.5
# to 0 (and then -1), disagreeing with that rule.
test_predictions = np.where(lstm_test_predictions >= 0.5, 1, -1)
# Flatten before assigning so the column receives a 1-D array.
testDF['Prediction'] = test_predictions.ravel()
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('LSTMSubmission.csv')