## Importing Necessities

In [None]:
import re
import string
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

from collections import Counter

from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split

from keras.layers import Embedding, Dense, LSTM 
from keras.models import Sequential 
from keras.preprocessing.sequence import pad_sequences

## Reading the Data

In [None]:
# Load the review CSV into a DataFrame and preview its first five rows.
csv_path = '../data/review_seperated_by_sentences_conclusion.csv'
data = pd.read_csv(csv_path)
data.head(n=5)

## Text Preprocessing

### Cleaning the Corpus

In [None]:
def clean_text(text):
    """Normalise a raw sentence before tokenisation.

    The input is converted with ``str()`` and lowercased; then, in order:
    text in square brackets, links, and HTML tags are removed, newlines are
    turned into spaces, punctuation and words containing digits are removed,
    single ASCII letters standing between whitespace are dropped, and every
    run of whitespace is collapsed to a single space.

    Leading/trailing whitespace is not stripped, and a single letter at the
    very start or end of the text is kept.

    Args:
        text: Value to clean. Non-string values are stringified first, so
            ``None`` becomes ``'none'`` and a float NaN becomes ``'nan'``.

    Returns:
        The cleaned, lowercased string.
    """
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines become spaces (not ''), otherwise the last word of one line
    # and the first word of the next are fused into a single token.
    text = text.replace('\n', ' ')
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # The lookahead leaves the trailing whitespace unconsumed so that several
    # single letters in a row ("a b c") are all removed, not every other one.
    text = re.sub(r'\s+[a-zA-Z](?=\s)', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# delete row with missing values
# This must run BEFORE cleaning: clean_text() calls str() on its input, which
# would turn a missing sentence into the literal text 'nan' and hide it from
# dropna().
data = data.dropna(axis=0)

data['sentence'] = data['sentence'].apply(clean_text)

data.head(5)

### Tokenizing Data

In [None]:
# Split every (stringified) sentence on whitespace into a list of tokens.
sentence_tokens = data['sentence'].apply(lambda sentence: str(sentence).split())
data['x_temp'] = sentence_tokens

### Removing Stopwords

In [None]:
# Count every token across all rows and show the 20 most frequent ones.
top = Counter()
for tokens in data['x_temp']:
    top.update(tokens)

temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Blues')

In [None]:
# Lazily-built cache of the English stopword set, shared by all calls.
_ENGLISH_STOPWORDS = None


def remove_stopword(x):
    """Return the tokens of *x* that are not English stopwords.

    The stopword list is fetched from ``stopwords.words('english')`` only on
    the first call and kept as a frozenset, instead of being re-fetched and
    scanned as a list for every single token.

    Args:
        x: Iterable of tokens.

    Returns:
        A new list with the stopwords removed, in the original order.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return [y for y in x if y not in _ENGLISH_STOPWORDS]

In [None]:
# Strip stopwords from every token list, then recount and show the 20 most
# frequent remaining tokens.
data['x_temp'] = data['x_temp'].apply(remove_stopword)

top = Counter()
for tokens in data['x_temp']:
    top.update(tokens)

temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Purples')

### One-hot Encoding

In [None]:
def one_hot(x):
    """Encode a sentiment label as a 3-element one-hot list.

    Labels are accepted either as numbers or as their string form:
    0 / '0' (neutral or unknown), 1 / '1' (positive), 2 / '2' (negative).

    Returns:
        A fresh list ``[1, 0, 0]``, ``[0, 1, 0]`` or ``[0, 0, 1]``, or
        ``None`` when the label is not one of the values above.
    """
    # (numeric label, string label, encoding); built per call so that every
    # caller receives its own list object.
    encodings = (
        (0, '0', [1, 0, 0]),  # neutral or unknown
        (1, '1', [0, 1, 0]),  # positive
        (2, '2', [0, 0, 1]),  # negative
    )
    for as_number, as_text, vector in encodings:
        if x == as_number or x == as_text:
            return vector
    return None

In [None]:
# Encode each sentiment label; labels one_hot() does not recognise become None.
data['y_temp'] = data['sentiment'].apply(one_hot)

# delete row with poorly classified sentiment
data = data.dropna()

data.head(n=10)

### Data Splitting

In [None]:
# Keep 80% of the rows for training and the rest for testing; the fixed
# random_state makes the split reproducible.
train_data, test_data = train_test_split(data, train_size = 0.80, random_state = 0)

x_train = train_data['x_temp']
x_test = test_data['x_temp']

# Stack the per-row label lists into one array per split (one row per sample).
y_train = np.array(train_data['y_temp'].tolist())
y_test = np.array(test_data['y_temp'].tolist())

### Integer Encoding

In [None]:
# Vocabulary cap handed to the tokenizer as num_words.
max_words = 35000

# Fit the vocabulary on the training split only, then convert both splits
# to sequences of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)
x_train, x_test = train_sequences, test_sequences

### Padding

In [None]:
# Maximum and mean sequence length for each split.
train_lengths = [len(sequence) for sequence in x_train]
train_max_len = max(train_lengths)
train_average_len = sum(train_lengths) / len(train_lengths)

test_lengths = [len(sequence) for sequence in x_test]
test_max_len = max(test_lengths)
test_average_len = sum(test_lengths) / len(test_lengths)

# Report both splits in the same layout.
report = (
    ("=====Train Data=====", train_max_len, train_average_len),
    ("\n=====Test Data=====", test_max_len, test_average_len),
)
for header, longest, average in report:
    print(header)
    print("max length: ", longest)
    print("average length: ", average)

In [None]:
# Pad both splits to the longest sequence found in either of them.
max_len = max(train_max_len, test_max_len)

x_train, x_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (x_train, x_test)
)

## Model Development

In [None]:
# Embedding (max_words vocabulary, 100-dim vectors) -> LSTM with 128 units
# -> 3-way softmax, one output per one-hot sentiment class.
model = Sequential([
    Embedding(max_words, 100),
    LSTM(128),
    Dense(3, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Train for 10 epochs in batches of 10, holding out 10% of the training data
# for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=10,
    validation_split=0.1,
)

In [None]:
# model.evaluate() returns the loss followed by the compiled metrics, so
# index 1 is the accuracy.
test_scores = model.evaluate(x_test, y_test)
test_accuracy_percent = test_scores[1] * 100
print("Test Accuracy: {:.2f}%".format(test_accuracy_percent))