## Importing Necesseties

In [1]:
import re
import string
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

from collections import Counter

from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split

from keras.layers import Embedding, Dense, LSTM 
from keras.models import Sequential 
from keras.preprocessing.sequence import pad_sequences

## Reading the Data

In [2]:
data = pd.read_csv('../data/review_seperated_by_sentences_conclusion.csv')
data.head(5)

Unnamed: 0,sentence,sentiment
0,"It's a pretty good, inexpensive casual or busi...",1
1,Basic with few extras but pretty good,1
2,It fits as expected and is comfortable but the...,2
3,"For example, it has no side pleats in back and...",0
4,It also doesn't have a hanging loop but I don'...,0


## Text Preprocessing

### Cleaning the Corpus

In [3]:
'''
Make text lowercase, remove text in square brackets, remove links, remove HTML tags,
remove punctuation, remove words containing numbers, remove all single characters, 
and substitute multiple spaces with single space.
'''
def clean_text(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    return text

data['sentence'] = data['sentence'].apply(lambda x:clean_text(x))

# delete row with missing values
data = data.dropna(axis=0)

data.head(5)

Unnamed: 0,sentence,sentiment
0,its pretty good inexpensive casual or business...,1
1,basic with few extras but pretty good,1
2,it fits as expected and is comfortable but the...,2
3,for example it has no side pleats in back and ...,0
4,it also doesnt have hanging loop but dont care...,0


### Tokenizing Data

In [4]:
data['x_temp'] = data['sentence'].apply(lambda x:str(x).split())

### Removing Stopwords

In [5]:
top = Counter([item for sublist in data['x_temp'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')

Unnamed: 0,Common_words,count
0,the,1260
1,and,895
2,shirt,852
3,it,605
4,is,575
5,to,473
6,fit,432
7,this,423
8,i,411
9,for,395


In [6]:
def remove_stopword(x):   
    return [y for y in x if y not in stopwords.words('english')]

In [9]:
data['x_temp'] = data['x_temp'].apply(lambda x:remove_stopword(x))
top = Counter([item for sublist in data['x_temp'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Purples')

Unnamed: 0,Common_words,count
0,shirt,852
1,fit,432
2,good,380
3,great,292
4,shirts,252
5,quality,236
6,nice,214
7,like,181
8,well,177
9,fits,153


### One-hot Encoding

In [10]:
def one_hot(x):
    if x == 0: # neutral or unknown
        x = [1, 0, 0]
    elif x == 1: # positive
        x = [0, 1, 0]
    elif x == 2: # negative
        x = [0, 0, 1]
    return x

In [11]:
data['y_temp'] = data['sentiment'].apply(lambda x:one_hot(x))
data.head(10)

Unnamed: 0,sentence,sentiment,x_temp,y_temp
0,its pretty good inexpensive casual or business...,1,"[pretty, good, inexpensive, casual, business, ...","[0, 1, 0]"
1,basic with few extras but pretty good,1,"[basic, extras, pretty, good]","[0, 1, 0]"
2,it fits as expected and is comfortable but the...,2,"[fits, expected, comfortable, minor, finish, i...","[0, 0, 1]"
3,for example it has no side pleats in back and ...,0,"[example, side, pleats, back, doesnt, box, ple...","[1, 0, 0]"
4,it also doesnt have hanging loop but dont care...,0,"[also, doesnt, hanging, loop, dont, care]","[1, 0, 0]"
5,if you do youll be disappointed,2,"[youll, disappointed]","[0, 0, 1]"
6,so the back is plain back which is not problem...,0,"[back, plain, back, problem, wear, work, want,...","[1, 0, 0]"
7,if youre wearing it under coat no problem of c...,0,"[youre, wearing, coat, problem, course, youll,...","[1, 0, 0]"
8,the right cuff buttons are just tad off so the...,2,"[right, cuff, buttons, tad, cuff, hem, doesnt,...","[0, 0, 1]"
9,this is only noticeable to me and only because...,0,"[noticeable, seem, habit, messing]","[1, 0, 0]"


### Data Splitting

In [12]:
train_data, test_data = train_test_split(data, train_size = 0.80, random_state = 0)

x_train = train_data['x_temp']
x_test = test_data['x_temp']
y_train = np.array(train_data['y_temp'])
y_test = np.array(test_data['y_temp'])

### Integer Encoding

In [13]:
max_words = 35000 
tokenizer = Tokenizer(num_words = max_words) 
tokenizer.fit_on_texts(x_train) 
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test) 

### Padding

In [14]:
train_max_len = max(len(l) for l in x_train)
train_average_len = sum(map(len, x_train))/ len(x_train)
test_max_len = max(len(l) for l in x_test)
test_average_len = sum(map(len, x_test))/ len(x_test)

print("=====Train Data=====")
print("max length: ", train_max_len) 
print("average length: ", train_average_len)
print("\n=====Test Data=====")
print("max length: ", test_max_len) 
print("average length: ", test_average_len)

=====Train Data=====
max length:  39
average length:  4.788148721920991

=====Test Data=====
max length:  19
average length:  4.21671826625387


In [16]:
max_len = max(train_max_len, test_max_len)

x_train = pad_sequences(x_train, maxlen=max_len) 
x_test = pad_sequences(x_test, maxlen=max_len)