In [1]:
from __future__ import division, print_function
from gensim import models
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import imdb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, EarlyStopping
import time
import numpy as np
import pandas as pd
import os
import collections
import re
import string

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# from google.colab import drive
# drive.mount('/content/gdrive')

### Read data

In [5]:
# IMDB, TWITTER
DATA = 'TWITTER'

In [6]:
TRAIN_TEST_SPLIT = 0.7

In [7]:
if(DATA == 'TWITTER'):
    data = pd.read_csv('../input/sentiment140/training.1600000.processed.noemoticon.csv', encoding='latin-1')
    data.columns=['Label', 'id', 'Date', 'Query', 'User', 'Text']
    data = data.drop(columns=['id', 'Date', 'Query', 'User'], axis=1)
    data['Label'] = data['Label'].map({0:'negative', 4:'positive'})
    # data = data.sample(frac = 0.6, random_state = 42)
elif(DATA == 'IMDB'):
    data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
    data.columns = ['Text', 'Label']

data.head()

Unnamed: 0,Label,Text
0,negative,is upset that he can't update his Facebook by ...
1,negative,@Kenichan I dived many times for the ball. Man...
2,negative,my whole body feels itchy and like its on fire
3,negative,"@nationwideclass no, it's not behaving at all...."
4,negative,@Kwesidei not the whole crew


In [8]:
data.Label.unique()

array(['negative', 'positive'], dtype=object)

In [9]:
pos = []
neg = []

for l in data.Label:
    if l == 'negative':
        pos.append(0)
        neg.append(1)
    elif l == 'positive':
        pos.append(1)
        neg.append(0)

data['Pos'] = pos
data['Neg'] = neg

data.head()

Unnamed: 0,Label,Text,Pos,Neg
0,negative,is upset that he can't update his Facebook by ...,0,1
1,negative,@Kenichan I dived many times for the ball. Man...,0,1
2,negative,my whole body feels itchy and like its on fire,0,1
3,negative,"@nationwideclass no, it's not behaving at all....",0,1
4,negative,@Kwesidei not the whole crew,0,1


### Clean data

In [10]:
def remove_punct(text):
    return re.sub('['+string.punctuation+']', '', text)

def lower_token(tokens): 
    return [w.lower() for w in tokens]  

def remove_stop_words(tokens): 
    return [word for word in tokens if word not in stoplist]

data['Text_Clean'] = data['Text'].apply(lambda x: remove_punct(x))
tokens = [word_tokenize(sen) for sen in data.Text_Clean] 
lower_tokens = [lower_token(token) for token in tokens] 
filtered_words = [remove_stop_words(sen) for sen in lower_tokens] 
result = [' '.join(sen) for sen in filtered_words] 

In [11]:
data['Text_Final'] = result
data['tokens'] = filtered_words
data = data[['Text_Final', 'tokens', 'Label', 'Pos', 'Neg']]
data.head()

Unnamed: 0,Text_Final,tokens,Label,Pos,Neg
0,upset cant update facebook texting might cry r...,"[upset, cant, update, facebook, texting, might...",negative,0,1
1,kenichan dived many times ball managed save 50...,"[kenichan, dived, many, times, ball, managed, ...",negative,0,1
2,whole body feels itchy like fire,"[whole, body, feels, itchy, like, fire]",negative,0,1
3,nationwideclass behaving im mad cant see,"[nationwideclass, behaving, im, mad, cant, see]",negative,0,1
4,kwesidei whole crew,"[kwesidei, whole, crew]",negative,0,1


### Split data into test and train

In [12]:
data_train, data_test = train_test_split(data, test_size = TRAIN_TEST_SPLIT, random_state = 42)

### Load Google News Word2Vec model

In [13]:
word2vec_path = '../input/word2vec/GoogleNews-vectors-negative300.bin'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

### Tokenize and Pad sequences

In [14]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
TRAINING_VOCAB = sorted(list(set(all_training_words)))

tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())

training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

train_word_index = tokenizer.word_index

In [15]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))

for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)

In [16]:
del tokens
del lower_tokens
del stoplist
del filtered_words
del result
del word2vec
del data

In [17]:
label_names = ['Pos', 'Neg']

x_train = train_cnn_data
y_train = data_train[label_names].values

x_test = test_cnn_data
y_test = data_test[label_names].values

### Define CNN

In [18]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    
    x = Embedding(num_words, embedding_dim,
                                   weights=[embeddings], input_length = max_sequence_length, trainable=True)(sequence_input)

    x = Conv1D(filters=200, kernel_size=2, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)

    x = Conv1D(filters=400, kernel_size=3, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)

    x = Conv1D(filters=800, kernel_size=4, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)

    x = Flatten()(x)
    x = Dropout(0.5)(x) 

    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)

    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)

    preds = Dense(labels_index, activation='softmax')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [19]:
class TimeHistory(Callback):
    def on_train_begin(self, logs={}):
        self.times = []

    def on_epoch_begin(self, batch, logs={}):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

time_callback = TimeHistory()

In [20]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, len(list(label_names)))
model.summary()

2021-11-13 06:52:21.607457: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-13 06:52:21.708609: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-13 06:52:21.709317: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-13 06:52:21.710533: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 300)           107214600 
_________________________________________________________________
conv1d (Conv1D)              (None, 49, 200)           120200    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 24, 200)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 22, 400)           240400    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 11, 400)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 8, 800)            128080

### Train CNN

In [21]:
batch_size = 64
rlrop = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose = 1)

In [22]:
def getMetrics(num_of_epochs):
    print(f'Epochs: {num_of_epochs}')
    
    model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, len(list(label_names)))
    hist = model.fit(x_train, y_train, epochs=num_of_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size, callbacks = [time_callback, rlrop], verbose = 0)
    total_time = sum(time_callback.times)
    
    print(f'Total Time Taken: {total_time} s')
    
    start = time.time()
    
    predictions = model.predict(x_test, verbose=1)
    prediction_labels = []
    
    for p in predictions:
        prediction_labels.append(labels[np.argmax(p)])
    
    end = time.time()
    total_inference_time = end - start
    
    print(f'Test Accuracy: {accuracy_score(data_test.Label, prediction_labels)}')
    print(f'Total Inference Time: {total_inference_time}')
    
    cce = tf.keras.losses.CategoricalCrossentropy()
    cce_loss = cce(y_test, predictions).numpy()
    print(f'Total Loss: {cce_loss}')
    
    print('-----')

In [23]:
epochs = [2, 5, 10]
labels = ['positive', 'negative']

for epoch in epochs:
    getMetrics(epoch)

Epochs: 2


2021-11-13 06:52:25.576220: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-13 06:52:27.058208: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Total Time Taken: 423.4450204372406 s
Test Accuracy: 0.7336357142857143
Total Inference Time: 85.28436613082886
Total Loss: 0.5422611832618713
-----
Epochs: 5

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Total Time Taken: 1036.5236241817474 s
Test Accuracy: 0.7114410714285714
Total Inference Time: 85.1926097869873
Total Loss: 0.9328731298446655
-----
Epochs: 10

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.

Epoch 00009: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Total Time Taken: 2079.5017058849335 s
Test Accuracy: 0.6942303571428572
Total Inference Time: 86.01497077941895
Total Loss: 2.297076463699341
-----


In [24]:
del data_train
del data_test