## Importing Necessities

In [None]:
import re
import string
import pandas as pd
import numpy as np

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

import pickle

from keras.preprocessing.sequence import pad_sequences

from keras.models import load_model

## Loading Data

In [None]:
# Ask for the base name of the CSV file (the '.csv' suffix is appended below).
print('Enter the name of file: ')
file_name = input()

# The file is looked up in ../data/ and decoded with the cp949 codec.
location = f'../data/{file_name}.csv'
data = pd.read_csv(location, encoding='cp949')
data.head(5)

## Text Preprocessing

### Cleaning the Corpus

In [None]:
# Patterns are compiled once instead of on every call.  Raw string literals
# are used so that sequences such as "\S" or "\d" reach the regex engine
# unchanged and do not trigger Python's invalid-escape-sequence warning.
_SQUARE_BRACKETS = re.compile(r'\[.*?\]')
_LINKS = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAGS = re.compile(r'<.*?>+')
_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
_WORDS_WITH_DIGITS = re.compile(r'\w*\d\w*')
_SINGLE_LETTERS = re.compile(r'\s+[a-zA-Z]\s+')
_WHITESPACE_RUNS = re.compile(r'\s+')


def clean_text(text):
    """Return a normalized, lowercase version of ``text``.

    The steps are applied in this order:

    1. convert the input to ``str`` and lowercase it;
    2. remove text enclosed in square brackets;
    3. remove links (``http://``, ``https://`` and ``www.`` prefixes);
    4. remove HTML tags;
    5. remove newline characters (nothing is inserted in their place);
    6. remove every character in ``string.punctuation``;
    7. remove words that contain a digit;
    8. replace a single ASCII letter surrounded by whitespace with one space;
    9. collapse every run of whitespace into one space.

    Leading or trailing whitespace is collapsed but not stripped.

    Args:
        text: Value to clean.  Non-string values are passed through ``str()``
            first, so e.g. ``None`` becomes the text ``'none'``.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()
    text = _SQUARE_BRACKETS.sub('', text)
    text = _LINKS.sub('', text)
    text = _HTML_TAGS.sub('', text)
    text = text.replace('\n', '')
    text = _PUNCTUATION.sub('', text)
    text = _WORDS_WITH_DIGITS.sub('', text)
    text = _SINGLE_LETTERS.sub(' ', text)
    text = _WHITESPACE_RUNS.sub(' ', text)
    return text

In [None]:
# Delete rows with missing values *before* cleaning: clean_text() passes its
# input through str(), which would turn a missing sentence (NaN) into the
# literal text 'nan' and hide it from dropna().
data = data.dropna(axis=0)

data['sentence'] = data['sentence'].apply(clean_text)

data.head(5)

### Lemmatization

In [None]:
def lemmatize_all(sentence):
    """Yield the tokens of ``sentence`` one at a time, lemmatized where possible.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``.  A token whose tag starts with "NN", "VB", "JJ" or "R" is
    lemmatized as a noun, verb, adjective or adverb respectively; any other
    token is yielded unchanged.
    """
    # POS-tag prefix -> part-of-speech code handed to the lemmatizer.
    prefix_to_pos = (("NN", 'n'), ('VB', 'v'), ('JJ', 'a'), ('R', 'r'))
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(word_tokenize(sentence)):
        wordnet_pos = next(
            (pos for prefix, pos in prefix_to_pos if tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

In [None]:
# Replace every sentence with its lemmatized tokens joined by single spaces.
lemmatized_sentences = data['sentence'].apply(
    lambda raw: " ".join(lemmatize_all(str(raw)))
)
data['sentence'] = lemmatized_sentences
data.head(5)

### Tokenizing Data

In [None]:
# Split each sentence on whitespace and keep the token lists in 'x_temp'.
sentences = data['sentence']
data['x_temp'] = sentences.apply(lambda text: str(text).split())

### Removing Stopwords

In [None]:
# Cache for the English stopword set; filled on first use so that merely
# defining the function does not touch the NLTK corpus.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the cached English stopword set, with 'not' excluded."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # 'not' is taken out of the set because it must always be kept.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english')) - {'not'}
    return _ENGLISH_STOPWORDS


def remove_stopword(x):
    """Return the tokens of ``x`` that are not English stopwords.

    The word 'not' is always kept, even though it is a stopword.

    The stopword list is loaded once and stored as a set, instead of calling
    ``stopwords.words('english')`` again for every single token.

    Args:
        x: Iterable of string tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    stop_set = _english_stopwords()
    return [y for y in x if y not in stop_set]

In [None]:
# Filter the stopwords out of every token list, then drop rows with missing values.
filtered_tokens = data['x_temp'].apply(remove_stopword)
data['x_temp'] = filtered_tokens
data = data.dropna(axis=0)

### Integer Encoding

In [None]:
# Load the tokenizer object stored in 'tokenizer.pickle' (current directory).
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Convert every token list in 'x_temp' into a sequence using the loaded tokenizer.
predict_data = tokenizer.texts_to_sequences(data['x_temp'])

### Padding

In [None]:
# NOTE(review): 22 presumably matches the input length used in training — confirm.
max_sequence_length = 22
predict_data = pad_sequences(predict_data, maxlen=max_sequence_length)

## Loading Model

In [None]:
# Model file is read from the current working directory.
model_path = 'review_sentiment_analysis_model.h5'
loaded_model = load_model(model_path)

## Prediction

In [None]:
# Run the loaded model on the padded sequences.
# NOTE(review): presumably returns one row of class scores per input sentence — confirm.
prediction = loaded_model.predict(predict_data)

In [None]:
# For every row of `prediction`, take the index of its largest value and
# wrap the resulting label array in a DataFrame.
predicted_labels = pd.DataFrame(np.argmax(prediction, axis=1))

In [None]:
# Assign the labels by position, not by index label.  `data` may have gaps in
# its index after dropna(), whereas `predicted_labels` is numbered 0..n-1;
# assigning the DataFrame directly would align on index labels and attach
# predictions to the wrong rows (or leave NaN).  A flat array is matched
# row-by-row and raises if the lengths differ.
data['sentiment'] = np.asarray(predicted_labels).ravel()
data

In [None]:
# Write only the sentence and its predicted sentiment, without the index,
# to 'sentiment_labeled_<file_name>.csv' in the current directory.
labeled_file_name = f'sentiment_labeled_{file_name}.csv'
data.to_csv(labeled_file_name, index=False, columns=['sentence', 'sentiment'])