In [1]:
import gzip
import gensim 
import logging

import numpy as np
import pandas as pd
import pickle
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from joblib import dump, load

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import scipy as sp

  from numpy.core.umath_tests import inner1d
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Loading the training data

In [2]:
# Load the labels and text.
# Each file holds one text per line: lines from the negative file are
# labelled -1 and lines from the positive file are labelled 1.
labels, texts = [], []
for path, label in (('data/train_neg_full.txt', -1), ('data/train_pos_full.txt', 1)):
    # The context manager closes the file handle as soon as the content is read.
    with open(path, encoding="utf8") as data_file:
        file_lines = data_file.read().split('\n')
    for line in file_lines:
        # Reading stops at the first empty line (normally the one produced by
        # the trailing newline at the end of the file).
        if line == '':
            break
        texts.append(line)
        labels.append(label)

In [3]:
# Gather the raw texts and their labels in one two-column frame
# (column order: 'text', then 'label').
trainDF = pd.DataFrame({'text': texts, 'label': labels})

## Generate sets, load embeddings and tokenize text

In [4]:
# Generate training and validation sets (80% / 20% of the rows).
# A fixed random_state makes the shuffle, and therefore the split, identical
# on every Restart & Run All.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'],
    trainDF['label'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Encode the labels as consecutive integers (LabelEncoder sorts the distinct
# labels, so -1 becomes 0 and 1 stays 1).
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# Reuse the mapping learned on the training labels instead of refitting:
# refitting on the validation labels could yield a different mapping if the
# two splits did not contain the same set of classes.
valid_y = encoder.transform(valid_y)

In [6]:
# Rewrite the GloVe vector file in the word2vec text format so that gensim's
# word2vec loader can read it.
glove2word2vec(
    word2vec_output_file='data/glove.twitter.27B.25d_word2vect.txt',
    glove_input_file='data/glove.twitter.27B.25d.txt',
)

(1193514, 25)

In [7]:
# Load the converted embedding vectors (plain-text word2vec format).
# NOTE: `model` here is the word-vector lookup, not a neural network.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/glove.twitter.27B.25d_word2vect.txt',
    binary=False,
)

KeyboardInterrupt: 

In [None]:
# Build the vocabulary from every text in trainDF (training and validation rows alike).
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

# Turn each text into a list of word ids, then pad it to 70 ids
# so that every row has the same length.
train_seq_x, valid_seq_x = (
    sequence.pad_sequences(token.texts_to_sequences(split_texts), maxlen=70)
    for split_texts in (train_x, valid_x)
)

In [None]:
# Create the token-embedding mapping.
# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# The extra row 0 and the rows of words without a pre-trained vector stay zero.
embedding_matrix = np.zeros((len(word_index) + 1, 25))
for word, i in word_index.items():
    try:
        embedding_matrix[i] = model.get_vector(word)
    except KeyError:
        # The word is not in the pre-trained vocabulary: keep its zero row.
        # (The original body was a lone `;`, which is a SyntaxError in Python.)
        continue

## Neural net implementations

### CNN

In [None]:
def create_cnn():
    """Build and compile the convolutional text classifier.

    Reads the notebook-level `word_index` and `embedding_matrix` to size and
    initialise the (trainable) embedding layer.

    Returns:
        A compiled Keras model taking sequences of 70 token ids and producing
        one sigmoid output, trained with binary cross-entropy and Adam.
    """
    vocab_size = len(word_index) + 1
    embedding_dim = embedding_matrix.shape[1]

    # Token ids -> pre-initialised, trainable embeddings with spatial dropout
    tokens_in = layers.Input((70, ))
    embedded = layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True)(tokens_in)
    embedded = layers.SpatialDropout1D(0.3)(embedded)

    # 100 filters of width 3, then max over the sequence axis
    convolved = layers.Convolution1D(100, 3, activation="relu")(embedded)
    pooled = layers.GlobalMaxPool1D()(convolved)

    # Dense head ending in a single sigmoid unit
    hidden = layers.Dense(50, activation="relu")(pooled)
    hidden = layers.Dropout(0.25)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    cnn_model = models.Model(inputs=tokens_in, outputs=probability)
    cnn_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return cnn_model

In [None]:
# Build a fresh CNN and train it for one epoch (the Keras default, made
# explicit here) on the padded training sequences.
cnn_classifier = create_cnn()
cnn_classifier.fit(x=train_seq_x, y=train_y, epochs=1)

In [None]:
# Score the validation split; the outputs are then binarised IN PLACE at 0.5,
# so `cnn_predictions` holds 0/1 values from here on.
cnn_predictions = cnn_classifier.predict(valid_seq_x)
cnn_predictions[cnn_predictions >= 0.5] = 1
cnn_predictions[cnn_predictions < 0.5] = 0

# Accuracy = share of rows whose predicted class equals the encoded label.
expected_column = np.asarray(valid_y).reshape(-1, 1)
mismatches = np.count_nonzero(expected_column - cnn_predictions.astype(int))
valid_percent = 100 * (1 - mismatches / len(cnn_predictions))
print('local results: ' + str(valid_percent) + '%')

In [None]:
# Submission code: predict the test texts with the CNN and write the CSV.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

cnn_test_predictions = cnn_classifier.predict(test_seq_x)
# Flatten the (n, 1) network output into a plain 1-D column and apply the same
# `>= 0.5` rule as the validation cell (np.round would send exactly 0.5 to 0).
# Class 0 is written as -1 in the submission.
test_predictions = np.where(cnn_test_predictions.ravel() >= 0.5, 1, -1)
testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('CNNSubmission.csv')

### LSTM

In [None]:
def create_rnn_lstm():
    """Build and compile the LSTM text classifier.

    Reads the notebook-level `word_index` and `embedding_matrix` to size and
    initialise the (trainable) embedding layer.

    Returns:
        A compiled Keras model taking sequences of 70 token ids and producing
        one sigmoid output, trained with binary cross-entropy and Adam.
    """
    vocab_size = len(word_index) + 1
    embedding_dim = embedding_matrix.shape[1]

    # Token ids -> pre-initialised, trainable embeddings with spatial dropout
    tokens_in = layers.Input((70, ))
    embedded = layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True)(tokens_in)
    embedded = layers.SpatialDropout1D(0.3)(embedded)

    # Single LSTM with 100 units
    recurrent = layers.LSTM(100)(embedded)

    # Dense head ending in a single sigmoid unit
    hidden = layers.Dense(50, activation="relu")(recurrent)
    hidden = layers.Dropout(0.25)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    lstm_model = models.Model(inputs=tokens_in, outputs=probability)
    lstm_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return lstm_model

In [None]:
# Build a fresh LSTM network and train it for one epoch on the padded training sequences.
lstm_classifier = create_rnn_lstm()
lstm_classifier.fit(x=train_seq_x, y=train_y, epochs=1)

In [None]:
# Score the validation split; the outputs are then binarised IN PLACE at 0.5,
# so `lstm_predictions` holds 0/1 values from here on.
lstm_predictions = lstm_classifier.predict(valid_seq_x)
lstm_predictions[lstm_predictions >= 0.5] = 1
lstm_predictions[lstm_predictions < 0.5] = 0

# Accuracy = share of rows whose predicted class equals the encoded label.
expected_column = np.asarray(valid_y).reshape(-1, 1)
mismatches = np.count_nonzero(expected_column - lstm_predictions.astype(int))
valid_percent = 100 * (1 - mismatches / len(lstm_predictions))
print('local results: ' + str(valid_percent) + '%')

In [None]:
# Submission code: predict the test texts with the LSTM and write the CSV.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

lstm_test_predictions = lstm_classifier.predict(test_seq_x)
# Flatten the (n, 1) network output into a plain 1-D column and apply the same
# `>= 0.5` rule as the validation cell (np.round would send exactly 0.5 to 0).
# Class 0 is written as -1 in the submission.
test_predictions = np.where(lstm_test_predictions.ravel() >= 0.5, 1, -1)
testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('LSTMSubmission.csv')

### GRU

In [None]:
def create_rnn_gru():
    """Build and compile the GRU text classifier.

    Reads the notebook-level `word_index` and `embedding_matrix` to size and
    initialise the (trainable) embedding layer.

    Returns:
        A compiled Keras model taking sequences of 70 token ids and producing
        one sigmoid output, trained with binary cross-entropy and Adam.
    """
    vocab_size = len(word_index) + 1
    embedding_dim = embedding_matrix.shape[1]

    # Token ids -> pre-initialised, trainable embeddings with spatial dropout
    tokens_in = layers.Input((70, ))
    embedded = layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True)(tokens_in)
    embedded = layers.SpatialDropout1D(0.3)(embedded)

    # Single GRU with 100 units
    recurrent = layers.GRU(100)(embedded)

    # Dense head ending in a single sigmoid unit
    hidden = layers.Dense(50, activation="relu")(recurrent)
    hidden = layers.Dropout(0.25)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    gru_model = models.Model(inputs=tokens_in, outputs=probability)
    gru_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return gru_model

In [None]:
# Build a fresh GRU network and train it for one epoch on the padded training sequences.
gru_classifier = create_rnn_gru()
gru_classifier.fit(x=train_seq_x, y=train_y, epochs=1)

In [None]:
# Score the validation split; the outputs are then binarised IN PLACE at 0.5,
# so `gru_predictions` holds 0/1 values from here on.
gru_predictions = gru_classifier.predict(valid_seq_x)
gru_predictions[gru_predictions >= 0.5] = 1
gru_predictions[gru_predictions < 0.5] = 0

# Accuracy = share of rows whose predicted class equals the encoded label.
expected_column = np.asarray(valid_y).reshape(-1, 1)
mismatches = np.count_nonzero(expected_column - gru_predictions.astype(int))
valid_percent = 100 * (1 - mismatches / len(gru_predictions))
print('local results: ' + str(valid_percent) + '%')

In [None]:
# Generate csv: predict the test texts with the GRU and write the submission.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

gru_test_predictions = gru_classifier.predict(test_seq_x)
# Flatten the (n, 1) network output into a plain 1-D column and apply the same
# `>= 0.5` rule as the validation cell (np.round would send exactly 0.5 to 0).
# Class 0 is written as -1 in the submission.
test_predictions = np.where(gru_test_predictions.ravel() >= 0.5, 1, -1)
testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('GRUSubmission.csv')

### Bidirectional RNN (GRU)

In [None]:
def create_bidirectional_rnn():
    """Build and compile the bidirectional GRU text classifier.

    Reads the notebook-level `word_index` and `embedding_matrix` to size and
    initialise the (trainable) embedding layer.

    Returns:
        A compiled Keras model taking sequences of 70 token ids and producing
        one sigmoid output, trained with binary cross-entropy and Adam.
    """
    vocab_size = len(word_index) + 1
    embedding_dim = embedding_matrix.shape[1]

    # Token ids -> pre-initialised, trainable embeddings with spatial dropout
    tokens_in = layers.Input((70, ))
    embedded = layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True)(tokens_in)
    embedded = layers.SpatialDropout1D(0.3)(embedded)

    # GRU with 100 units wrapped in Bidirectional (the recurrent cell is a GRU, not an LSTM)
    recurrent = layers.Bidirectional(layers.GRU(100))(embedded)

    # Dense head ending in a single sigmoid unit
    hidden = layers.Dense(50, activation="relu")(recurrent)
    hidden = layers.Dropout(0.25)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    bidirectional_model = models.Model(inputs=tokens_in, outputs=probability)
    bidirectional_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return bidirectional_model

In [None]:
# Build a fresh bidirectional GRU network and train it for one epoch on the padded training sequences.
bidirectional_classifier = create_bidirectional_rnn()
bidirectional_classifier.fit(x=train_seq_x, y=train_y, epochs=1)

In [None]:
# Score the validation split; the outputs are then binarised IN PLACE at 0.5,
# so `bidirectional_predictions` holds 0/1 values from here on.
bidirectional_predictions = bidirectional_classifier.predict(valid_seq_x)
bidirectional_predictions[bidirectional_predictions >= 0.5] = 1
bidirectional_predictions[bidirectional_predictions < 0.5] = 0

# Accuracy = share of rows whose predicted class equals the encoded label.
expected_column = np.asarray(valid_y).reshape(-1, 1)
mismatches = np.count_nonzero(expected_column - bidirectional_predictions.astype(int))
valid_percent = 100 * (1 - mismatches / len(bidirectional_predictions))
print('local results: ' + str(valid_percent) + '%')

In [None]:
# Generate csv: predict the test texts with the bidirectional GRU and write the submission.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

bidirectional_test_predictions = bidirectional_classifier.predict(test_seq_x)
# Flatten the (n, 1) network output into a plain 1-D column and apply the same
# `>= 0.5` rule as the validation cell (np.round would send exactly 0.5 to 0).
# Class 0 is written as -1 in the submission.
test_predictions = np.where(bidirectional_test_predictions.ravel() >= 0.5, 1, -1)
testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('BIDIRECTIONALSubmission.csv')

### RCNN

In [None]:
def create_rcnn():
    """Build and compile the recurrent-convolutional text classifier.

    The embedded sequence goes through a bidirectional GRU that returns the
    full output sequence, and the convolution is applied to that sequence.

    Reads the notebook-level `word_index` and `embedding_matrix` to size and
    initialise the (trainable) embedding layer.

    Returns:
        A compiled Keras model taking sequences of 70 token ids and producing
        one sigmoid output, trained with binary cross-entropy and Adam.
    """
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=True)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the recurrent layer (return_sequences=True keeps one output per
    # time step so the convolution below has a sequence to slide over)
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)

    # Add the convolutional Layer on top of the recurrent output.
    # It previously read `embedding_layer`, which left the GRU disconnected
    # from the output and made this network identical to the plain CNN.
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(rnn_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')

    return model

In [None]:
# Build a fresh RCNN and train it for one epoch on the padded training sequences.
rcnn_classifier = create_rcnn()
rcnn_classifier.fit(x=train_seq_x, y=train_y, epochs=1)

In [None]:
# Score the validation split; the outputs are then binarised IN PLACE at 0.5,
# so `rcnn_predictions` holds 0/1 values from here on.
rcnn_predictions = rcnn_classifier.predict(valid_seq_x)
rcnn_predictions[rcnn_predictions >= 0.5] = 1
rcnn_predictions[rcnn_predictions < 0.5] = 0

# Accuracy = share of rows whose predicted class equals the encoded label.
expected_column = np.asarray(valid_y).reshape(-1, 1)
mismatches = np.count_nonzero(expected_column - rcnn_predictions.astype(int))
valid_percent = 100 * (1 - mismatches / len(rcnn_predictions))
print('local results: ' + str(valid_percent) + '%')

In [None]:
# Generate csv: predict the test texts with the RCNN and write the submission.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

rcnn_test_predictions = rcnn_classifier.predict(test_seq_x)
# Flatten the (n, 1) network output into a plain 1-D column and apply the same
# `>= 0.5` rule as the validation cell (np.round would send exactly 0.5 to 0).
# Class 0 is written as -1 in the submission.
test_predictions = np.where(rcnn_test_predictions.ravel() >= 0.5, 1, -1)
testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('RCNNSubmission.csv')

### Stacking

In [None]:
# Meta-classifier training features: one column of validation-set outputs per
# base network, in the order CNN, LSTM, GRU, bidirectional, RCNN (the same
# order used for `test_nn_out`).
# The `*_predictions` arrays are binarised in place to 0/1 by the validation
# cells, whereas the test-time features are raw sigmoid outputs. Recomputing
# the outputs here keeps training and test features on the same scale.
nn_out = np.hstack([
    base_classifier.predict(valid_seq_x)
    for base_classifier in (
        cnn_classifier,
        lstm_classifier,
        gru_classifier,
        bidirectional_classifier,
        rcnn_classifier,
    )
])

In [None]:
# Stacking meta-classifier: cross-validated logistic regression fitted on the
# base-network outputs for the validation split and the encoded validation labels.
clf = LogisticRegressionCV(
    Cs=5,
    cv=5,
    n_jobs=1,
    multi_class='ovr',
).fit(nn_out, valid_y)

In [None]:
# Stacking submission: feed the base-network outputs for the test texts to the
# logistic-regression meta-classifier and write the CSV.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

lstm_test_predictions = lstm_classifier.predict(test_seq_x)
cnn_test_predictions = cnn_classifier.predict(test_seq_x)
gru_test_predictions = gru_classifier.predict(test_seq_x)
rnn_test_predictions = bidirectional_classifier.predict(test_seq_x)
rcnn_test_predictions = rcnn_classifier.predict(test_seq_x)

# The column order must match the one used to build `nn_out`:
# CNN, LSTM, GRU, bidirectional, RCNN.
test_nn_out = np.hstack((cnn_test_predictions, lstm_test_predictions, gru_test_predictions, rnn_test_predictions, rcnn_test_predictions))
test_predictions = clf.predict(test_nn_out)

# Class 0 is written as -1 in the submission.
test_predictions[test_predictions == 0] = -1
testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('Logistic_CNN_LSTM_GRU_RNN_RCNN_Submission.csv')

### SNN

In [None]:
def create_snn(input_size):
    """Build and compile a one-hidden-layer dense classifier.

    Args:
        input_size: number of input features per sample.

    Returns:
        A compiled Keras model: `input_size` inputs -> Dense(100, relu) ->
        Dense(1, sigmoid), trained with binary cross-entropy and Adam.
    """
    features_in = layers.Input((input_size, ))
    hidden = layers.Dense(100, activation="relu")(features_in)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    snn_model = models.Model(inputs=features_in, outputs=probability)
    snn_model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return snn_model

In [None]:
# One input per stacked base-network column.
snn_classifier = create_snn(input_size=nn_out.shape[1])

In [None]:
# Train the dense meta-network for one epoch (the Keras default, made explicit)
# on the stacked base-network outputs for the validation split.
snn_classifier.fit(x=nn_out, y=valid_y, epochs=1)

In [None]:
# Generate the SNN stacking csv: feed the base-network outputs for the test
# texts to the dense meta-network and write the submission.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

lstm_test_predictions = lstm_classifier.predict(test_seq_x)
cnn_test_predictions = cnn_classifier.predict(test_seq_x)
gru_test_predictions = gru_classifier.predict(test_seq_x)
rnn_test_predictions = bidirectional_classifier.predict(test_seq_x)
rcnn_test_predictions = rcnn_classifier.predict(test_seq_x)

# The column order must match the one used to build `nn_out`:
# CNN, LSTM, GRU, bidirectional, RCNN.
test_nn_out = np.hstack((cnn_test_predictions, lstm_test_predictions, gru_test_predictions, rnn_test_predictions, rcnn_test_predictions))
snn_test_predictions = snn_classifier.predict(test_nn_out)

# Flatten the (n, 1) network output into a plain 1-D column and apply the same
# `>= 0.5` rule as the validation cells (np.round would send exactly 0.5 to 0).
# Class 0 is written as -1 in the submission.
test_predictions = np.where(snn_test_predictions.ravel() >= 0.5, 1, -1)
testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('SNN_CNN_LSTM_GRU_RNN_RCNN_Submission.csv')

### Majority voting

In [None]:
# Majority-voting submission: every base network casts one 0/1 vote per test
# text and the class chosen by more than half of the networks wins.
# The context manager closes the test file as soon as it has been read.
with open('data/test_data.txt', encoding="utf8") as test_file:
    test_lines = test_file.read().split('\n')

test_texts = []
for line in test_lines:
    # Reading stops at the first empty line (normally the trailing newline).
    if line == '':
        break
    test_texts.append(line)

testDF = pd.DataFrame({'text': test_texts})
test_seq_x = sequence.pad_sequences(token.texts_to_sequences(testDF['text']), maxlen=70)

lstm_test_predictions = lstm_classifier.predict(test_seq_x)
cnn_test_predictions = cnn_classifier.predict(test_seq_x)
gru_test_predictions = gru_classifier.predict(test_seq_x)
rnn_test_predictions = bidirectional_classifier.predict(test_seq_x)
rcnn_test_predictions = rcnn_classifier.predict(test_seq_x)

# One 0/1 vote per network and row, using the same `>= 0.5` rule as the
# validation cells (np.round would send exactly 0.5 to 0).
test_pred_matrix = (np.hstack((cnn_test_predictions, lstm_test_predictions, gru_test_predictions, rnn_test_predictions, rcnn_test_predictions)) >= 0.5).astype(int)
positive_votes = test_pred_matrix.sum(axis=1)
# Strict majority of the voters (3 of 5 here, matching the former 2.5 cut-off);
# the losing class 0 is written as -1 in the submission.
test_predictions = np.where(2 * positive_votes > test_pred_matrix.shape[1], 1, -1)

testDF['Prediction'] = test_predictions
testDF = testDF.drop(['text'], axis=1)
# Ids are 1-based row numbers.
testDF['Id'] = testDF.index + 1
testDF = testDF.set_index('Id')
testDF.to_csv('Majority_CNN_LSTM_GRU_RNN_RCNN_Submission.csv')