# Importing Libraries

In [19]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
import pickle

In [3]:
import os
import sys
import re
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
# NOTE(review): the Keras backend may keep its own RNG that this call does not
# seed — confirm if fully reproducible training runs are required.
np.random.seed(7)

In [59]:
# Directory that gloveVec() joins with the embeddings filename.
Dir='Glove/'
# Length used for pad_sequences(maxlen=...) and for the Embedding layer's input_length.
MAX_SEQUENCE_LENGTH = 11
# Vocabulary cap: passed to Tokenizer(num_words=...) and used to bound the embedding matrix rows.
MAX_NB_WORDS = 10000
# Number of columns in the embedding matrix built by createEmbeddingMatrix().
EMBEDDING_DIM = 300
# Fraction of the data that train_test_split holds out as the test set.
TEST_SPLIT = 0.1
# Fraction of the training data passed to model.fit(validation_split=...).
VALIDATION_SPLIT = 0.1

In [5]:
# Module-level resources for text cleaning (punctuation set, stopwords, lemmatizer).
# NOTE(review): stopwords.words() and WordNetLemmatizer read NLTK corpora from disk;
# presumably 'stopwords' and 'wordnet' have been downloaded beforehand — confirm.
import string
punctuations = string.punctuation
from nltk.corpus import stopwords
stopword_list = stopwords.words("english")  # List that contains stopwords to reduce noise
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

In [6]:
def clean_str(text):
    """Normalize a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase; replace every non-letter with a space; drop
    words that are one to three characters long; drop English stopwords;
    squeeze any character repeated three or more times in a row down to a
    single occurrence ("soooo" -> "so"); lemmatize each token as a verb,
    then as a noun, then as an adverb.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned tokens joined by single spaces; empty string if nothing
        survives the filters.
    """
    cleaned_text = text.lower()
    # Keep letters only. Digits, punctuation and symbols all become spaces, so
    # no separate punctuation-stripping pass is needed afterwards.
    cleaned_text = re.sub("[^a-zA-Z]", " ", cleaned_text)
    # Remove words that are 1-3 characters long.
    cleaned_text = re.sub(r'\b\w{1,3}\b', '', cleaned_text)

    words = [w for w in cleaned_text.split() if w not in stopword_list]

    # `words` is a plain list, not a pandas Series, so it has no `.str`
    # accessor; apply the substitution token by token instead.
    repeated_char = re.compile(r"(.)\1{2,}", re.DOTALL)
    words = [repeated_char.sub(r"\1", w) for w in words]

    # Lemmatization: one full pass per part of speech (verb, noun, adverb).
    for pos in ("v", "n", "r"):
        words = [lem.lemmatize(word, pos) for word in words]

    return " ".join(words)

In [7]:
def gloveVec(filename):
    """Load word vectors from a whitespace-separated text file under ``Dir``.

    Each usable line has the form ``<word> <float> <float> ...``.

    Args:
        filename: Name of the embeddings file, joined onto the module-level
            ``Dir`` directory.

    Returns:
        dict mapping each word to a 1-D float32 numpy array of its coefficients.
    """
    embeddings = {}
    # `with` guarantees the handle is closed even if reading raises midway.
    with open(os.path.join(Dir, filename), encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no word to index.
                continue
            try:
                embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Deliberate best-effort: skip lines whose coefficients are
                # not all numeric rather than aborting the whole load.
                continue
    return embeddings

In [16]:
def loadData(filename):
    """Read the tweet CSV and return cleaned texts with one-hot encoded labels.

    Only the 'label' and 'tweet' columns are used; rows with a missing value
    in either column are dropped. Distinct labels are sorted and numbered
    0..k-1 in that order before one-hot encoding.

    Args:
        filename: Path of a CSV file containing 'label' and 'tweet' columns.

    Returns:
        (x_train, y_train): list of strings produced by clean_str(), and the
        array returned by to_categorical() for the integer label ids.
    """
    selected = ['label', 'tweet']
    df = pd.read_csv(filename)
    # Keep just the two columns of interest, then discard incomplete rows.
    df = df[selected].dropna(axis=0, how='any', subset=selected)

    # Sorted so the label -> id assignment is stable from run to run.
    labels = sorted(set(df['label'].tolist()))
    label_dict = {label: idx for idx, label in enumerate(labels)}

    x_train = df['tweet'].apply(clean_str).tolist()
    y_train = df['label'].map(label_dict).tolist()
    y_train = to_categorical(np.asarray(y_train))
    return x_train,y_train

In [31]:
def createVocabAndData(sentences):
    """Tokenize ``sentences`` and return the vocabulary with the padded id matrix.

    A Tokenizer limited to MAX_NB_WORDS is fitted on the sentences; the
    resulting id sequences are padded at the end ('post') to
    MAX_SEQUENCE_LENGTH.

    Args:
        sentences: Iterable of text strings.

    Returns:
        (vocab, data): the tokenizer's word_index and the output of
        pad_sequences() for the tokenized sentences.
    """
    tok = Tokenizer(num_words=MAX_NB_WORDS)
    tok.fit_on_texts(sentences)
    padded = pad_sequences(
        tok.texts_to_sequences(sentences),
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
    )
    return tok.word_index, padded

In [10]:
def createEmbeddingMatrix(word_index,embeddings_index):
    """Assemble the initial weight matrix for the Embedding layer.

    Row ``i`` holds the pretrained vector of the word whose tokenizer index
    is ``i``. Words without a pretrained vector, and row 0, stay all-zero.
    Words whose index exceeds MAX_NB_WORDS are ignored.

    Args:
        word_index: dict mapping word -> integer index.
        embeddings_index: dict mapping word -> vector of length EMBEDDING_DIM.

    Returns:
        numpy array of shape (min(MAX_NB_WORDS, len(word_index)) + 1, EMBEDDING_DIM).
    """
    row_count = min(MAX_NB_WORDS, len(word_index)) + 1
    matrix = np.zeros((row_count, EMBEDDING_DIM))
    for token, idx in word_index.items():
        if idx <= MAX_NB_WORDS:
            vector = embeddings_index.get(token)
            if vector is not None:
                matrix[idx] = vector
    return matrix

In [26]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
K.clear_session()
callbacks = [
    # Stop training once val_loss has gone 5 epochs without improving.
    EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1),
    # Best model gets saved
    ModelCheckpoint(
        filepath='best_model.h5',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
]

In [46]:
def lstmModel(embedding_matrix,epoch):
    """Build, train and evaluate an LSTM classifier on top of pretrained embeddings.

    Relies on module-level state: ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` (created by the driver cell), plus ``callbacks``,
    ``MAX_SEQUENCE_LENGTH`` and ``VALIDATION_SPLIT``.

    Args:
        embedding_matrix: 2-D array used as the initial (trainable) weights
            of the Embedding layer; its shape sets the layer dimensions.
        epoch: Maximum number of training epochs; the callbacks may stop
            training earlier.

    Side effects:
        Prints the model summary and the test-set accuracy, and writes
        'text_lstm_weights.h5' in addition to whatever ``callbacks`` write.
    """
    n, embedding_dims = embedding_matrix.shape
    # Size the output layer from the one-hot labels rather than hard-coding 4,
    # so the model matches however many classes the data contains.
    num_classes = y_train.shape[1]

    model = Sequential()
    model.add(Embedding(n, embedding_dims, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True,mask_zero=True))
    model.add(LSTM(128,dropout=0.1,recurrent_dropout=0.2))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(num_classes,activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()

    model.fit(X_train, y_train, validation_split=VALIDATION_SPLIT, epochs=epoch, batch_size=128,callbacks=callbacks)
    # These are the weights after the last epoch that ran; the checkpoint
    # callback is what keeps the best-val_loss model.
    model.save_weights('text_lstm_weights.h5')

    scores= model.evaluate(X_test, y_test, verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

In [60]:
if __name__ == "__main__":

    # Build the cleaned texts, pretrained vectors, padded id matrix and
    # embedding weights that the model function reads.
    sentences, labels = loadData('tweets.csv')
    embeddings = gloveVec('glove.6B.300d.txt')
    vocab, data = createVocabAndData(sentences)
    embedding_mat = createEmbeddingMatrix(vocab,embeddings)
    # Context manager so the pickle file is flushed and closed right away
    # instead of leaving an open handle behind.
    with open('embedding_matrix.pkl', 'wb') as pkl_file:
        pickle.dump([data, labels, embedding_mat], pkl_file)
    print ("Data created")

    print("Train Test split")
    # lstmModel() reads X_train / X_test / y_train / y_test as globals.
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=TEST_SPLIT, random_state=42)

    lstmModel(embedding_mat,20)

Data created
Train Test split
Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 11, 300)           2340900   
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_21 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_22 (Dense)             (None, 4)                 260       
Total params: 2,569,064
Trainable params: 2,569,064
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 00010: early stopping
accuracy: 83.98%


# Using GRU

In [40]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU

In [41]:
callbacks_gru = [
    # Stop training once val_loss has gone 5 epochs without improving.
    EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1),
    # Keep only the checkpoint with the lowest val_loss.
    ModelCheckpoint(
        filepath='best_model_gru.h5',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    ),
]

In [42]:
def GRUModel(embedding_matrix,epoch):
    """Build, train and evaluate a GRU classifier on top of pretrained embeddings.

    Relies on module-level state: ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` (created by the driver cell), plus ``callbacks_gru``,
    ``MAX_SEQUENCE_LENGTH`` and ``VALIDATION_SPLIT``.

    Args:
        embedding_matrix: 2-D array used as the initial (trainable) weights
            of the Embedding layer; its shape sets the layer dimensions.
        epoch: Maximum number of training epochs; the callbacks may stop
            training earlier.

    Side effects:
        Prints the model summary and the test-set accuracy, and writes
        'text_gru_weights.h5' in addition to whatever ``callbacks_gru`` write.
    """
    n, embedding_dims = embedding_matrix.shape
    # Size the output layer from the one-hot labels rather than hard-coding 4,
    # so the model matches however many classes the data contains.
    num_classes = y_train.shape[1]

    model = Sequential()
    model.add(Embedding(n, embedding_dims, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True,mask_zero=True))
    model.add(GRU(128,dropout=0.1,recurrent_dropout=0.2))
    model.add(Dense(64,activation='relu'))
    model.add(Dense(num_classes,activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()

    model.fit(X_train, y_train, validation_split=VALIDATION_SPLIT, epochs=epoch, batch_size=128,callbacks=callbacks_gru)
    # These are the weights after the last epoch that ran; the checkpoint
    # callback is what keeps the best-val_loss model.
    model.save_weights('text_gru_weights.h5')

    scores= model.evaluate(X_test, y_test, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

In [62]:
if __name__ == "__main__":

    # Build the cleaned texts, pretrained vectors, padded id matrix and
    # embedding weights that the model function reads.
    sentences, labels = loadData('tweets.csv')
    embeddings = gloveVec('glove.6B.300d.txt')
    vocab, data = createVocabAndData(sentences)
    embedding_mat = createEmbeddingMatrix(vocab,embeddings)
    # Context manager so the pickle file is flushed and closed right away
    # instead of leaving an open handle behind.
    with open('embedding_matrix.pkl', 'wb') as pkl_file:
        pickle.dump([data, labels, embedding_mat], pkl_file)
    print ("Data created")

    print("Train Test split")
    # GRUModel() reads X_train / X_test / y_train / y_test as globals.
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=TEST_SPLIT, random_state=42)

    GRUModel(embedding_mat,20)

Data created
Train Test split
Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 11, 300)           2340900   
_________________________________________________________________
gru_4 (GRU)                  (None, 128)               165120    
_________________________________________________________________
dense_23 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_24 (Dense)             (None, 4)                 260       
Total params: 2,514,536
Trainable params: 2,514,536
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00012: early stopping
accuracy: 83.43%
