In [35]:
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk import FreqDist

from sklearn.model_selection import train_test_split

import numpy as np

import gensim.models as genmod

from keras.preprocessing import sequence

import keras
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D
from keras.layers import Dense
from keras.layers import Input
from keras.layers import Dropout
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Flatten
from keras import optimizers

In [2]:
def polishData(df):
    """Tokenise the text held in the first column of ``df``.

    Every entry is cleaned in four steps: URLs are stripped, every
    non-letter character is replaced by a space, the text is lower-cased
    and split on whitespace, and English stop words (NLTK list) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (by position) contains the raw strings.

    Returns
    -------
    list of list of str
        One token list per row, in row order.
    """
    # Load the stop-word list once and keep it as a set: the lookup below runs
    # for every word of every row, so re-reading the list each time is wasteful.
    english_stopwords = set(stopwords.words("english"))

    data = []

    # Iterate over the column values positionally so the function also works
    # when the frame's index is not the default 0..n-1 range.
    for text in df.iloc[:, 0]:
        # `.*` is greedy: everything from the URL to the end of that line goes.
        no_links = re.sub(r"https?://.*[\r\n]*", " ", text, flags=re.MULTILINE)

        letters_only = re.sub("[^a-zA-Z]", " ", no_links)

        words = letters_only.lower().split()

        data.append([w for w in words if w not in english_stopwords])

    return data

In [3]:
def labelsOneHot(df):
    """One-hot encode the values in the second column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose second column (by position) holds the class labels.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(rows, distinct labels)``; columns follow
        the order produced by ``pandas.get_dummies``.
    """
    sentiOneHot = pd.get_dummies(df.iloc[:,1])

    # Convert the whole indicator table in one step instead of copying it
    # column by column; the result is a fresh C-ordered integer array.
    return np.array(sentiOneHot.values, dtype = int, order = 'C')

In [4]:
def polishDataSet(df):
    """Return the ``(data, labels)`` pair for ``df``.

    ``data`` is the output of ``polishData(df)`` and ``labels`` the output of
    ``labelsOneHot(df)``; the two are computed in that order.
    """
    return polishData(df), labelsOneHot(df)

In [5]:
def vocabBuilder (data, unknown = True, min_no_of_words = 1):
    """Build the vocabulary list from tokenised sentences.

    Parameters
    ----------
    data : iterable of iterable of str
        Tokenised sentences.
    unknown : bool
        When true, the placeholder token ``'UNKNOWN'`` is appended as the
        last vocabulary entry.
    min_no_of_words : int
        Minimum number of occurrences a token needs to be kept.

    Returns
    -------
    list of str
        Tokens whose frequency is at least ``min_no_of_words``, followed by
        ``'UNKNOWN'`` if requested.
    """
    # Flatten the sentences into a single token stream for counting.
    tokens = [token for sentence in data for token in sentence]

    freqdist = FreqDist(tokens)

    vocab = [key for key in freqdist if freqdist[key] >= min_no_of_words]

    if unknown:
        vocab.append('UNKNOWN')

    return vocab

In [6]:
def fit_unknown_token(data, vocab):
    """Replace every out-of-vocabulary token with ``'UNKNOWN'``.

    The replacement is done in place: the inner lists of ``data`` are
    modified and the very same ``data`` object is returned.

    Parameters
    ----------
    data : list of list of str
        Tokenised sentences; modified in place.
    vocab : iterable of str
        Known tokens.

    Returns
    -------
    list of list of str
        ``data`` itself, after replacement.
    """
    # A set gives constant-time membership tests; scanning the vocabulary
    # list for every single token is needlessly slow.
    known = set(vocab)

    for sentence in data:
        for position, token in enumerate(sentence):
            if token not in known:
                sentence[position] = 'UNKNOWN'

    return data

In [7]:
def word2vec(data, window, min_count, size, iterations):
    """Train a gensim Word2Vec model on ``data`` and return its word vectors.

    Parameters
    ----------
    data : iterable of list of str
        Tokenised sentences to train on.
    window, min_count, size : int
        Forwarded to ``Word2Vec`` under the same keyword names.
    iterations : int
        Forwarded to ``Word2Vec`` as ``iter``.

    Returns
    -------
    The ``wv`` attribute of the trained model.
    """
    # Train on the sentences that were passed in; the previous version read
    # the notebook-level `train_data` and silently ignored `data`.
    # NOTE(review): `size`/`iter` are the keyword names this notebook already
    # uses; newer gensim releases reportedly renamed them - confirm against
    # the installed version.
    model = genmod.Word2Vec(data, window = window, min_count = min_count, 
                            size = size, iter = iterations)
    return model.wv

In [8]:
def word_embedding_matrix_builder(word_vectors_model, size, vocab):
    """Assemble an embedding matrix whose rows line up with ``vocab``.

    Parameters
    ----------
    word_vectors_model
        Object exposing a ``vocab`` container (membership test) and
        ``model[word]`` vector lookup.
    size : int
        Number of columns of the matrix, i.e. the length of each vector.
    vocab : sequence of str
        Vocabulary; row ``i`` of the result belongs to ``vocab[i]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab), size)``. Rows for words missing from
        ``word_vectors_model.vocab`` stay all-zero.
    """
    embeddingsMatrix = np.zeros((len(vocab), size))

    for row, word in enumerate(vocab):
        if word in word_vectors_model.vocab:
            embeddingsMatrix[row] = word_vectors_model[word]

    return embeddingsMatrix

In [9]:
def word_to_index(data, vocab):
    """Translate tokenised sentences into lists of vocabulary indices.

    Parameters
    ----------
    data : iterable of iterable of str
        Tokenised sentences.
    vocab : sequence of str
        Vocabulary; a token maps to the position of its first occurrence.

    Returns
    -------
    list of list of int
        One index list per sentence, in the same order as ``data``.

    Raises
    ------
    ValueError
        If a token does not occur in ``vocab``.
    """
    # Build the lookup table once instead of calling vocab.index() for every
    # token. setdefault keeps the first position of a duplicated entry, which
    # is what list.index() returns.
    index_of = {}
    for position, word in enumerate(vocab):
        index_of.setdefault(word, position)

    try:
        return [[index_of[token] for token in sentence] for sentence in data]
    except KeyError as missing:
        # Keep the exception type list.index() raised for unknown tokens.
        raise ValueError("%r is not in vocab" % (missing.args[0],))

In [33]:
def makeModel(train_data, train_labels, vocab_length, pretrained = False, wordEmbeddingsLocal = None, 
              wordEmbeddingsGlove = None, wordEmbeddingsGoogle = None, trainable = False, size = 300,
              hidden_layer = 128, activation = 'relu', optimizer = 'adam', loss = 'categorical_crossentropy'):
    """Build and compile a multi-branch 1-D convolutional classifier.

    One shared input feeds several embedding layers: a randomly initialised
    one, plus one per supplied embedding matrix. Each embedding goes through
    the same Conv1D -> Dropout -> MaxPooling1D -> Flatten stack. The flattened
    features are concatenated and passed through two dense layers with
    dropout and a softmax output.

    Parameters
    ----------
    train_data : array with a ``shape`` attribute
        Only ``train_data.shape[1]`` is read; it sets the input length.
    train_labels : array with a ``shape`` attribute
        Only ``train_labels.shape[1]`` is read; it sets the output width.
    vocab_length : int
        Number of rows of every embedding layer.
    pretrained : bool
        Accepted for interface compatibility; not read by this function.
    wordEmbeddingsLocal, wordEmbeddingsGlove, wordEmbeddingsGoogle : array or None
        Initial weights for the corresponding embedding branch. A branch
        whose matrix is ``None`` is left out of the model.
    trainable : bool
        Whether the branches initialised from a matrix are trainable.
    size : int
        Output dimension of the randomly initialised embedding.
    hidden_layer : int
        Units in each of the two dense layers.
    activation : str
        Activation of the two dense layers.
    optimizer, loss
        Forwarded to ``model.compile``.

    Returns
    -------
    The compiled Keras ``Model``.
    """
    sequence_length = train_data.shape[1]

    def _conv_features(embedded):
        # Shared feature extractor applied to every embedding branch.
        convolved = Conv1D(256, 32, padding = 'same', activation = 'elu')(embedded)
        dropped = Dropout(0.2)(convolved)
        pooled = MaxPooling1D(pool_size = 8)(dropped)
        return Flatten()(pooled)

    main_input = Input(shape = (sequence_length,), dtype = 'float64', name = 'main_input')

    # Use the vocab_length argument rather than a notebook-level `vocab`, so
    # the function does not depend on hidden global state.
    embedded_branches = [Embedding(vocab_length, size, input_length = sequence_length)(main_input)]

    # Same order as before (local, GloVe, Google); matrices that were not
    # supplied are skipped instead of failing on `None.shape`.
    for weights_matrix in (wordEmbeddingsLocal, wordEmbeddingsGlove, wordEmbeddingsGoogle):
        if weights_matrix is None:
            continue
        embedded_branches.append(Embedding(vocab_length, weights_matrix.shape[1],
                                           input_length = sequence_length,
                                           weights = [weights_matrix],
                                           trainable = trainable)(main_input))

    branch_features = [_conv_features(embedded) for embedded in embedded_branches]

    # With a single branch there is nothing to concatenate.
    if len(branch_features) > 1:
        conv_features = keras.layers.concatenate(branch_features)
    else:
        conv_features = branch_features[0]

    dense_output_1 = Dense(hidden_layer, activation = activation)(conv_features)
    dropout_dense_1 = Dropout(0.5)(dense_output_1)
    dense_output_2 = Dense(hidden_layer, activation = activation)(dropout_dense_1)
    dropout_dense_2 = Dropout(0.5)(dense_output_2)
    main_output = Dense(train_labels.shape[1], activation = 'softmax')(dropout_dense_2)

    model = Model(inputs = [main_input], outputs = [main_output])

    model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])

    return model

In [11]:
def evaluateModel(test_data, test_labels, model = None):
    """Evaluate a model on the given data and print its second metric.

    Parameters
    ----------
    test_data, test_labels
        Passed unchanged to ``model.evaluate``.
    model : optional
        Model to evaluate. When omitted, the notebook-level variable named
        ``model`` is used, which keeps existing two-argument calls working.

    Prints
    ------
    A blank line followed by ``<metrics_names[1]>: <scores[1] * 100>%`` with
    two decimals. Nothing is returned.
    """
    if model is None:
        # Fall back to the global so the function still works as before, but
        # callers can now pass the model explicitly and avoid hidden state.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("name 'model' is not defined")

    scores = model.evaluate(test_data, test_labels)
    print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [12]:
# Load the labelled data set; the path is relative to the notebook's
# working directory.
path = "./train_data.csv"
df = pd.read_csv(path)
# Preview only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,content,sentiment
0,@tiffanylue i know i was listenin to bad habi...,empty
1,Layin n bed with a headache ughhhh...waitin o...,sadness
2,Funeral ceremony...gloomy friday...,sadness
3,wants to hang out with friends SOON!,enthusiasm
4,@dannycastillo We want to trade with someone w...,neutral
5,Re-pinging @ghostridah14: why didn't you go to...,worry
6,"I should be sleep, but im not! thinking about ...",sadness
7,Hmmm. http://www.djhero.com/ is down,worry
8,@charviray Charlene my love. I miss you,sadness
9,@kelcouch I'm sorry at least it's Friday?,sadness


In [13]:
# Tokenise the first column and one-hot encode the second column of df.
data, labels = polishDataSet(df)

In [15]:
# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
test_split = 0.20
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, 
                                                                    test_size = test_split, 
                                                                    random_state = 42)

In [16]:
# Vocabulary is built from the training split only: tokens seen at least
# twice, plus the 'UNKNOWN' placeholder.
vocab = vocabBuilder(train_data, unknown = True, min_no_of_words = 2)

In [17]:
# Replace out-of-vocabulary tokens with 'UNKNOWN'.
# fit_unknown_token modifies its argument in place and returns the same
# object, so train_data / test_data are changed by these calls as well.
train_data_fit = fit_unknown_token(train_data, vocab)
test_data_fit = fit_unknown_token(test_data, vocab)

In [19]:
# Disabled loaders for externally trained vectors.
# NOTE(review): they reference `models`, but gensim.models is imported as
# `genmod` in this notebook - adjust the name before re-enabling.
#wordModelGoogleWord2Vec = models.KeyedVectors.load_word2vec_format('../../GoogleNews-vectors-negative300.bin', binary=True)
#wordModelGlove = models.KeyedVectors.load_word2vec_format('',binary = True)
# Train local vectors: window 5, min_count 2, 128 dimensions, 20 iterations.
# train_data already contains the 'UNKNOWN' replacements made by the
# in-place fit_unknown_token call on it.
wordModelLocal = word2vec(train_data, 5, 2, 128, 20)

In [20]:
# Embedding matrix with one row per vocab entry; 128 matches the vector size
# used when training wordModelLocal.
wordEmbeddingsMatrixLocal = word_embedding_matrix_builder(wordModelLocal, 128, vocab)

In [21]:
# Convert every token to its position in vocab.
train_data_sequence = word_to_index(train_data_fit, vocab)
test_data_sequence = word_to_index(test_data_fit, vocab)

In [23]:
# train_size is used as the sequence length (maxlen) for both splits.
# NOTE(review): pad_sequences presumably pads with 0 by default, which is also
# the index of the first vocab entry - confirm whether a dedicated padding
# index is wanted.
train_size = 20
train_data_padded = sequence.pad_sequences(train_data_sequence, maxlen = train_size)
test_data_padded = sequence.pad_sequences(test_data_sequence, maxlen = train_size)

In [36]:
# Build the model. The locally trained matrix is passed for all three
# pretrained slots (local, GloVe, Google); size = 300 applies to the randomly
# initialised embedding.
model = makeModel(train_data_padded, train_labels, len(vocab), pretrained = False,
                  wordEmbeddingsGlove = wordEmbeddingsMatrixLocal, wordEmbeddingsGoogle = wordEmbeddingsMatrixLocal,
                  wordEmbeddingsLocal = wordEmbeddingsMatrixLocal, trainable = False, size = 300)

In [37]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 20)            0                                            
____________________________________________________________________________________________________
embedding_7 (Embedding)          (None, 20, 300)       9199800     main_input[0][0]                 
____________________________________________________________________________________________________
embedding_8 (Embedding)          (None, 20, 128)       3925248     main_input[0][0]                 
____________________________________________________________________________________________________
embedding_9 (Embedding)          (None, 20, 128)       3925248     main_input[0][0]                 
___________________________________________________________________________________________

In [None]:
# Train on the padded training sequences for 3 epochs in batches of 384.
epochs = 3
batch_size = 384
model.fit(train_data_padded, train_labels, epochs = epochs, batch_size = batch_size)

In [None]:
# evaluateModel is the helper function defined in this notebook, not a method
# of the model object, so it is called directly.
evaluateModel(test_data_padded, test_labels)