In [14]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, Concatenate, Conv1D, Bidirectional, LSTM, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [15]:
import os
import sys
import re

In [5]:
# Shared resources for the text-cleaning step (used by clean_str)
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

punctuations = string.punctuation  # punctuation characters from the stdlib
stopword_list = stopwords.words("english")  # List that contains stopwords to reduce noise
lem = WordNetLemmatizer()  # single lemmatizer instance reused for every text

In [22]:
def clean_str(text):
    """Normalise a raw text string into space-separated, lemmatized tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. replace every character that is not an ASCII letter with a space;
      3. delete words that are one to three characters long;
      4. drop words found in ``stopword_list``;
      5. lemmatize each remaining word as a verb, then a noun, then an adverb.

    Args:
        text: the raw string to clean.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    cleaned_text = text.lower()
    # Digits, punctuation, emoji, etc. all become spaces; only letters survive.
    cleaned_text = re.sub("[^a-zA-Z]", " ", cleaned_text)
    # Remove words of three characters or fewer.
    cleaned_text = re.sub(r'\b\w{1,3}\b', '', cleaned_text)
    # No separate punctuation-stripping pass is needed: after the two
    # substitutions above the text contains only letters and spaces.

    words = [w for w in cleaned_text.split() if w not in stopword_list]

    # Lemmatize the whole word list once per part of speech: verb, noun, adverb.
    for pos in ("v", "n", "r"):
        words = [lem.lemmatize(word, pos) for word in words]

    return " ".join(words)

In [16]:
def loadData(filename):
    """Load a labelled tweet CSV and return cleaned texts with one-hot labels.

    Only the 'label' and 'tweet' columns are kept, and rows missing either
    value are dropped. The distinct label values are sorted and mapped to the
    integer ids 0..k-1 in that order before being one-hot encoded.

    Args:
        filename: path of the CSV file; it must contain 'label' and 'tweet'
            columns.

    Returns:
        A tuple ``(x_train, y_train)``: ``x_train`` is a list with one
        ``clean_str``-processed string per kept row, ``y_train`` is the result
        of ``to_categorical`` applied to the integer label ids.

    Raises:
        ValueError: if no row has both a label and a tweet.
    """
    label_col, text_col = 'label', 'tweet'

    df = pd.read_csv(filename)
    df = df[[label_col, text_col]].dropna(axis=0, how='any')
    if df.empty:
        # Fail with a clear message instead of an opaque reduction error
        # raised while one-hot encoding an empty array.
        raise ValueError("No rows with both 'label' and 'tweet' found in %s" % filename)

    # Sorting makes the label -> id assignment deterministic across runs.
    labels = sorted(set(df[label_col].tolist()))
    label_dict = {label: idx for idx, label in enumerate(labels)}

    x_train = df[text_col].apply(clean_str).tolist()
    y_train = to_categorical(np.asarray(df[label_col].map(label_dict).tolist()))
    return x_train, y_train

In [26]:
def createVocabAndData(sentences, num_words=None, sequence_len=None):
    """Tokenize texts and turn them into fixed-length integer sequences.

    Args:
        sentences: iterable of text strings; the tokenizer is fitted on these
            and the same texts are then converted to sequences.
        num_words: value passed to ``Tokenizer(num_words=...)``. Defaults to
            the notebook-level ``max_features``.
        sequence_len: length every sequence is padded to (``padding='post'``).
            Defaults to the notebook-level ``maxlen``.

    Returns:
        A tuple ``(vocab, data)``: ``vocab`` is the fitted tokenizer's
        ``word_index`` and ``data`` is the output of ``pad_sequences``.
    """
    # Fall back to the notebook configuration only when the caller did not
    # pass explicit values, so existing single-argument calls behave as before.
    if num_words is None:
        num_words = max_features
    if sequence_len is None:
        sequence_len = maxlen

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    data = pad_sequences(sequences, maxlen=sequence_len, padding='post')
    return tokenizer.word_index, data

In [30]:
# --- Run configuration -------------------------------------------------------
# Text representation
max_features = 10000    # Tokenizer num_words / Embedding input size
maxlen = 11             # length every token sequence is padded to
embedding_dims = 300    # size of each embedding vector

# Training
batch_size = 32
epochs = 5
TEST_SPLIT = 0.2        # test_size handed to train_test_split

In [34]:
class RCNNVariant(Model):
    """Text classifier that feeds embeddings plus BiLSTM context into parallel convolutions.

    Forward pass: token ids are embedded, the embeddings go through a
    bidirectional LSTM that returns the full sequence, and the LSTM output is
    concatenated with the embeddings. One Conv1D per kernel size is applied to
    that concatenation; every convolution output is reduced by both global
    average pooling and global max pooling, all pooled vectors are
    concatenated, and a final Dense layer produces ``class_num`` outputs.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 kernel_sizes=(1, 2, 3, 4, 5),
                 class_num=4,
                 last_activation='sigmoid'):
        """Create the layers.

        Args:
            maxlen: expected length of each input sequence.
            max_features: input dimension of the embedding layer.
            embedding_dims: output dimension of the embedding layer.
            kernel_sizes: iterable of Conv1D kernel sizes; one convolution
                with 128 filters is created per entry.
            class_num: number of units in the output Dense layer.
            last_activation: activation of the output Dense layer.
        """
        super().__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        # Copy into a fresh list so neither the default nor a caller-supplied
        # sequence is shared with (or mutated through) this instance.
        self.kernel_sizes = list(kernel_sizes)
        self.class_num = class_num
        self.last_activation = last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.bi_rnn = Bidirectional(LSTM(128, return_sequences=True))
        self.concatenate = Concatenate()
        self.convs = [Conv1D(128, kernel_size, activation='relu')
                      for kernel_size in self.kernel_sizes]
        self.avg_pooling = GlobalAveragePooling1D()
        self.max_pooling = GlobalMaxPooling1D()
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: rank-2 tensor of token ids whose second dimension equals
                ``self.maxlen``.

        Returns:
            The output of the final Dense layer.

        Raises:
            ValueError: if ``inputs`` is not rank 2, or its statically known
                second dimension differs from ``self.maxlen``.
        """
        input_shape = inputs.get_shape()
        if len(input_shape) != 2:
            raise ValueError('The rank of inputs of RCNNVariant must be 2, but now is %d' % len(input_shape))
        seq_len = input_shape[1]
        # An unknown (None) static dimension cannot be validated here and would
        # also break the %d formatting below, so only known lengths are checked.
        if seq_len is not None and seq_len != self.maxlen:
            raise ValueError('The maxlen of inputs of RCNNVariant must be %d, but now is %d' % (self.maxlen, seq_len))

        embedding = self.embedding(inputs)
        x_context = self.bi_rnn(embedding)
        # The same Concatenate layer instance is used for both joins below.
        x = self.concatenate([embedding, x_context])
        conv_outputs = [conv(x) for conv in self.convs]
        # All average-pooled vectors first, then all max-pooled vectors.
        poolings = ([self.avg_pooling(c) for c in conv_outputs]
                    + [self.max_pooling(c) for c in conv_outputs])
        x = self.concatenate(poolings)
        return self.classifier(x)

In [32]:
# The targets produced by loadData are one-hot rows (exactly one class per
# sample), so the output layer uses softmax with categorical cross-entropy.
# With 'binary_crossentropy' the 'accuracy' metric is computed element-wise
# over the one-hot vector and overstates the real classification accuracy.
model = RCNNVariant(maxlen, max_features, embedding_dims, last_activation='softmax')
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

In [23]:
# Cleaned tweet texts and their one-hot labels
sentences, labels = loadData(filename='tweetdata.csv')

In [27]:
# Word index of the fitted tokenizer and the padded id sequences
vocab, data = createVocabAndData(sentences=sentences)

In [28]:
# Hold out a TEST_SPLIT fraction of the samples; the fixed random_state keeps
# the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=TEST_SPLIT,
    random_state=42,
)

In [33]:
# Stop once validation accuracy has not improved for 3 epochs, and roll the
# model back to the weights of its best epoch instead of keeping the weights
# from the last (worse) epoch.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max',
                               restore_best_weights=True)
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Model: "rcnn_variant_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  3000000   
_________________________________________________________________
bidirectional_2 (Bidirection multiple                  439296    
_________________________________________________________________
concatenate_2 (Concatenate)  multiple                  0         
_________________________________________________________________
conv1d_10 (Conv1D)           multiple                  71296     
_________________________________________________________________
conv1d_11 (Conv1D)           multiple                  142464    
_________________________________________________________________
conv1d_12 (Conv1D)           multiple                  213632    
_________________________________________________________________
conv1d_13 (C