## Importing Necessities

In [None]:
import re
import string
import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

from collections import Counter

from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential 
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

## Reading the Data

In [None]:
# Load the sentence corpus from disk and preview its first five rows
# (five is the default for DataFrame.head).
csv_path = '../data/sentences123456_no_but.csv'
data = pd.read_csv(csv_path)
data.head()

## Text Preprocessing

### Cleaning the Corpus

In [None]:
# Patterns are written as raw strings so the backslashes reach the regex
# engine untouched (non-raw '\[' / '\S' / '\w' are invalid string escapes),
# and are compiled once because clean_text is applied row by row.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')
# A lone ASCII letter delimited by whitespace or by the start/end of the text.
# Lookarounds do not consume the surrounding whitespace, so consecutive single
# letters ("a b c") and letters at the string edges are all matched.
_SINGLE_LETTER_RE = re.compile(r'(?<!\S)[a-zA-Z](?!\S)')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise one raw sentence for the downstream NLP steps.

    Steps, in order: lowercase; remove text in square brackets; remove links;
    remove HTML tags; turn newlines into spaces; remove punctuation; remove
    words containing digits; remove single-letter tokens; collapse runs of
    whitespace into one space and strip the ends.

    Args:
        text: The value to clean. It is passed through ``str()`` first, so
            non-string values are cleaned as their string representation.

    Returns:
        The cleaned, lowercase string (possibly empty).
    """
    text = str(text).lower()
    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    # Replace (rather than delete) newlines so that the last word of one line
    # is not glued to the first word of the next.
    text = text.replace('\n', ' ')
    text = _PUNCTUATION_RE.sub('', text)
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    text = _SINGLE_LETTER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Delete rows with missing values *before* cleaning: clean_text() passes its
# input through str(), which would turn a missing sentence (NaN) into the
# literal text 'nan' and hide it from dropna().
data = data.dropna(axis=0)

data['sentence'] = data['sentence'].apply(clean_text)

data.head(5)

### Lemmatization

In [None]:
def lemmatize_all(sentence):
    """Yield each token of ``sentence``, lemmatized according to its POS tag.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Tokens whose tag starts with "NN", "VB", "JJ" or "R" are
    lemmatized as noun, verb, adjective or adverb respectively; every other
    token is yielded unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    # Tag prefix -> WordNet part-of-speech code; checked in this order.
    prefix_to_pos = (("NN", 'n'), ('VB', 'v'), ('JJ', 'a'), ('R', 'r'))
    for token, tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = next(
            (pos for prefix, pos in prefix_to_pos if tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

In [None]:
# Replace every sentence with its lemmatized tokens, re-joined by single spaces.
lemmatized = data['sentence'].apply(
    lambda sentence: " ".join(lemmatize_all(str(sentence)))
)
data['sentence'] = lemmatized
data.head()

### Tokenizing Data

In [None]:
# Split each sentence on whitespace into a list of word tokens.
data['x_temp'] = data['sentence'].apply(lambda sentence: str(sentence).split())

### Removing Stopwords

In [None]:
# Tally every token over the whole corpus, then show the 20 most frequent.
top = Counter()
for tokens in data['x_temp']:
    top.update(tokens)
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')

In [None]:
# Cache for the stop-word set, filled on first use.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopword(x):
    """Return the tokens of ``x`` that are not English stop words.

    Args:
        x: An iterable of word tokens.

    Returns:
        A new list with the stop words filtered out, order preserved.
    """
    # Look the set up once per call instead of calling stopwords.words()
    # for every token, which rebuilt the list and scanned it linearly.
    stop_words = _english_stopwords()
    return [y for y in x if y not in stop_words]

In [None]:
# Strip stop words from every token list, then recount the 20 most frequent tokens.
data['x_temp'] = data['x_temp'].apply(remove_stopword)
top = Counter()
for tokens in data['x_temp']:
    top.update(tokens)
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Purples')

### One-hot Encoding

In [None]:
def one_hot(x):
    """Map a sentiment label to a one-hot list, or None if it is not recognised.

    0 / '0' (neutral or unknown) -> [1, 0, 0]
    1 / '1' (positive)           -> [0, 1, 0]
    2 / '2' (negative)           -> [0, 0, 1]
    """
    # Each class is accepted both as a number and as its string form.
    accepted = ((0, '0'), (1, '1'), (2, '2'))
    for position, (as_number, as_text) in enumerate(accepted):
        if x == as_number or x == as_text:
            encoded = [0, 0, 0]
            encoded[position] = 1
            return encoded
    return None

In [None]:
# Encode every sentiment label; one_hot() yields None for labels it does not know.
data['y_temp'] = data['sentiment'].apply(one_hot)

# Those None entries count as missing, so this removes the poorly
# classified rows (along with any other row that has a missing value).
data = data.dropna(axis='index')

data.head(n=10)

### Data Splitting

In [None]:
# 80/20 split with a fixed seed so the partition is reproducible.
train_data, test_data = train_test_split(data, train_size=0.80, random_state=0)

# Inputs stay as pandas Series of token lists for the tokenizer cells below.
x_train = train_data['x_temp']
x_test = test_data['x_temp']

# Targets: gather the per-row one-hot lists and stack them into arrays.
y_train = np.array(train_data['y_temp'].tolist())
y_test = np.array(test_data['y_temp'].tolist())

### Integer Encoding

In [None]:
# First pass: fit a Tokenizer with no vocabulary limit on the training tokens
# so that its word counts can be inspected in the next cell.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [None]:
# Words seen fewer than `threshold` times in the training data count as rare.
threshold = 3
word_frequencies = list(tokenizer.word_counts.values())
rare_frequencies = [freq for freq in word_frequencies if freq < threshold]

total_cnt = len(tokenizer.word_index)  # total word count
rare_cnt = len(rare_frequencies)       # number of rare words
total_freq = sum(word_frequencies)     # occurrences of all words
rare_freq = sum(rare_frequencies)      # occurrences of rare words only

print('total word count:',total_cnt)
print('count of rare words with a frequency of %s or less.: %s'%(threshold - 1, rare_cnt))
print("proportion of rare words in a word set:", (rare_cnt / total_cnt)*100)
print("rare word frequency ratio to total frequency of occurrence:", (rare_freq / total_freq)*100)

In [None]:
# Keep only the words that occur at least `threshold` times (i.e. drop the rare ones).
# Two extra slots are reserved: index 0 for padding and index 1 for the OOV token.
kept_words = total_cnt - rare_cnt
vocab_size = kept_words + 2
print('size of word set:',vocab_size)

In [None]:
# Second pass: refit with the vocabulary capped at vocab_size and an explicit
# out-of-vocabulary token, then turn both splits into integer sequences.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

# Peek at the first three encoded samples of each split.
for encoded_split in (x_train, x_test):
    print(encoded_split[:3])
print(len(x_train))

In [None]:
# Positions of training samples that have no tokens left after encoding.
drop_train = [index for index, sentence in enumerate(x_train) if len(sentence) < 1]

# Remove empty samples.
# x_train is a ragged list of variable-length sequences, so it is filtered as
# a plain list: np.delete would first convert it to an array, which NumPy
# >= 1.24 refuses for inhomogeneous shapes. y_train is a regular 2-D array,
# so np.delete is fine there and keeps both in step.
x_train = [sentence for sentence in x_train if len(sentence) >= 1]
y_train = np.delete(y_train, drop_train, axis=0)

print(len(x_train))
print(len(y_train))

### Padding

In [None]:
# Per-sample sequence lengths for both splits.
train_lengths = [len(sample) for sample in x_train]
test_lengths = [len(sample) for sample in x_test]

train_max_len = max(train_lengths)
train_average_len = sum(train_lengths) / len(train_lengths)
test_max_len = max(test_lengths)
test_average_len = sum(test_lengths) / len(test_lengths)

print("=====Train Data=====")
print("max length: ", train_max_len)
print("average length: ", train_average_len)
print("\n=====Test Data=====")
print("max length: ", test_max_len)
print("average length: ", test_average_len)

# Distribution of training sample lengths.
plt.hist(train_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
    """Print the percentage of samples whose length is at most ``max_len``.

    Args:
        max_len: Length limit to test against (inclusive).
        nested_list: A sized collection of sized samples.

    Returns:
        None; the result is only printed.
    """
    within_limit = sum(1 for sample in nested_list if len(sample) <= max_len)
    percentage = (within_limit / len(nested_list))*100
    print('percentage of examples with length %s or less in the total example: %.3f'
          %(max_len,percentage))

In [None]:
# Target sequence length used by the padding cell below; report what share of
# the training samples already fits within it.
max_len = 22
below_threshold_len(max_len, x_train)

In [None]:
# Bring every encoded sample of both splits to exactly max_len entries,
# using pad_sequences' default padding/truncating settings.
x_train = pad_sequences(x_train, maxlen=max_len) 
x_test = pad_sequences(x_test, maxlen=max_len)

## Model Development

In [None]:
# Embedding -> LSTM -> softmax over the three sentiment classes.
model = Sequential([
    Embedding(vocab_size, 100),
    LSTM(128),
    Dense(3, activation='softmax'),
])

In [None]:
# Stop training once the validation loss has gone 4 epochs without improving.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
)
# Save the model to disk each time the validation accuracy reaches a new best.
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [None]:
# `learning_rate` is the supported keyword; the legacy `lr` alias is
# deprecated and rejected by current Keras releases.
opt = optimizers.RMSprop(learning_rate=0.0001)

# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
# 10% of the training data is held out for validation; the callbacks handle
# early stopping and checkpointing of the best model.
history = model.fit(x_train, y_train, batch_size=10, epochs=50, callbacks=[es, mc], validation_split=0.1)

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Reload the checkpointed model and score it on the held-out test split.
loaded_model = load_model('best_model.h5')
test_metrics = loaded_model.evaluate(x_test, y_test)
# evaluate() returns the loss first and the accuracy metric second.
print("Test Accuracy: {:.4f}%".format(test_metrics[1]*100))