# **Get the The library for the sentiment analysis**

In [23]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## **Intiate objects**

In [24]:
models=tf.keras.models
layers=tf.keras.layers
preprocessing=tf.keras.preprocessing
pading=tf.keras.preprocessing.sequence
uility=tf.keras.utils

# **Load and Clean the data**


In [25]:
train_text=open('train.ft.txt',encoding="utf8").readlines()

In [26]:
label=[int(i[0:11].replace("__label__","")) for i in train_text]
input_text=[i[11:] for i in train_text]

# as the data is huge we are taking only 10000 samples
label=label[:10000]
input_text=input_text[:10000]

In [27]:
input_text[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

In [28]:
num_of_classes=len(set(label))
print("Number of classes: ",num_of_classes)

Number of classes:  2


In [29]:
def clean_text(text):
    text=re.sub(r'[^\w\s]','',text, re.UNICODE)
    text=re.sub(r'\d+','',text)
    text=text.lower()
    text=[word for word in text.split() if word not in stopwords.words("english")]
    lemmatizer=WordNetLemmatizer()
    text=[lemmatizer.lemmatize(word) for word in text]
    text=" ".join(text)
    return text

cleaned_input_text=[clean_text(i) for i in input_text]

In [30]:
cleaned_input_text[0]

'stuning even nongamer sound track beautiful paint senery mind well would recomend even people hate vid game music played game chrono cross game ever played best music back away crude keyboarding take fresher step grate guitar soulful orchestra would impress anyone care listen _'

In [31]:
tokenizer=preprocessing.text.Tokenizer()
#prepare the data for training
def prepare(input_data:list)->list:
    input_sequences=[]
    tokenizer.fit_on_texts(input_data)
    for i in input_text:
        token_list=tokenizer.texts_to_sequences([i])[0]
        input_sequences.append(token_list)
    max_len=max([len(x) for x in input_sequences])
    input_sequences=pading.pad_sequences(input_sequences,maxlen=max_len,padding="pre")
    return input_sequences,len(tokenizer.word_index)#number of unique words in the text


input_data,voc=prepare(input_text)

In [32]:
for i in range(len(label)):
    if label[i]==2:
        label[i]=1
    else:
        label[i]=0
    
labels=uility.to_categorical(y=label,num_classes=num_of_classes)

# **Create and train the model**

In [33]:
max_len=max([len(x) for x in input_data])
model=models.Sequential()
model.add(layers.Embedding(input_dim=voc+1,output_dim=100,input_length=max_len))
model.add(layers.LSTM(150,return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(100))
model.add(layers.Dense(num_of_classes,activation='softmax'))#dense layer with softmax activation as we have to predict the next word
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 215, 100)          3347000   
                                                                 
 lstm_2 (LSTM)               (None, 215, 150)          150600    
                                                                 
 dropout_1 (Dropout)         (None, 215, 150)          0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               100400    
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 3,598,202
Trainable params: 3,598,202
Non-trainable params: 0
_________________________________________________________________


In [34]:
history=model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history=model.fit(input_data,labels,epochs=5,verbose=1,batch_size=16,validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# **Test the model**

## bad text

In [44]:
#save the model
model.save('sentiment.h5')
text='this a very bad product i have ever seen, i will never buy this product again'

text=clean_text(text)
token_list=tokenizer.texts_to_sequences([text])[0]
token_list=pading.pad_sequences([token_list],maxlen=max_len,padding="pre")
pred=model.predict(token_list,verbose=0)
np.argmax(pred)
if np.argmax(pred)==0:
    print("negative")
elif np.argmax(pred)==1:
    print("positive")

negative


## good text

In [42]:
text='this a very good product i have ever seen, i will buy this product again'

text=clean_text(text)
token_list=tokenizer.texts_to_sequences([text])[0]
token_list=pading.pad_sequences([token_list],maxlen=max_len,padding="pre")
pred=model.predict(token_list,verbose=0)
np.argmax(pred)
if np.argmax(pred)==0:
    print("negative")
elif np.argmax(pred)==1:
    print("positive")

positive
