<h1><center>Sentiment Analysis</center></h1>

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt

from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from gensim.models import word2vec

import re
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [16]:
# --- Word2Vec parameters -------------------------------------------------
W2V_SIZE = 300       # embedding dimensionality (also the width of the embedding matrix)
W2V_WINDOW = 7       # passed as `window=` to Word2Vec
W2V_MIN_COUNT = 10   # passed as `min_count=` to Word2Vec
W2V_WORKERS = 8      # passed as `workers=` to Word2Vec
W2V_EPOCH = 32       # passed as `epochs=` to Word2Vec.train

# --- Tokenizer parameters ------------------------------------------------
SEQUENCE_LENGTH = 300   # `maxlen` used when padding the token-id sequences

# --- Classifier training -------------------------------------------------
EPOCHS = 1
BATCH_SIZE = 1024

In [17]:
# Column labels are supplied explicitly through `names=`.
# NOTE(review): the path is an absolute, machine-specific Windows location —
# consider a configurable data directory.
col_names = ["Sentiment", "Text"]
data = pd.read_csv(
    r'D:\CSC590_Design_Project\Data\data.csv',
    names=col_names,
    encoding="ISO-8859-1",
)

In [18]:
# Module-level resources read by process_text(): the English stop-word set
# and the Snowball stemmer.
stops = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# NOTE(review): this punkt tokenizer is not referenced by process_text(), and
# the name `tokenizer` is rebound to a Keras Tokenizer in a later cell — it
# looks unused; confirm before removing.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def process_text(text, remove_stops=False, stem=False):
    """Normalise a raw text into a list of lowercase alphabetic tokens.

    Steps: lowercase and strip, delete URLs (``http(s)://...`` and ``www....``),
    delete ``@user`` mentions, replace every non-letter with a space, then
    split on whitespace.

    Parameters
    ----------
    text : object
        Raw text; non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing and yield ``[]``.
    remove_stops : bool, default False
        If true, drop tokens found in the module-level ``stops`` set.
    stem : bool, default False
        If true, stem each kept token with the module-level ``stemmer``.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" (NaN is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower().strip()
    # Raw strings: '\S', '\.' and '\w' are invalid escape sequences in a
    # normal string literal and trigger warnings on modern Python.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove url links
    text = re.sub(r'@\w*', '', text)                   # remove "@user"
    text = re.sub(r'[^a-zA-Z]', ' ', text)             # leave only letters

    words = text.split()
    if remove_stops:
        # Stop words are filtered before stemming, i.e. on the unstemmed token.
        words = [word for word in words if word not in stops]
    if stem:
        words = [stemmer.stem(word) for word in words]
    return words

# Replace each raw text with its token list; stop words are removed, stemming
# stays off. Series.apply forwards the keyword argument to process_text.
data['Text'] = data['Text'].apply(process_text, remove_stops=True)

In [19]:
# 60 % / 20 % / 20 % split, taken positionally in the current row order.
# NOTE(review): nothing shuffles the rows before slicing — if data.csv is
# grouped by label the three splits will have very different class balance;
# confirm how the file is ordered.
train_rows = round(len(data) * 0.6)
val_rows = round(len(data) * 0.2)
test_rows = len(data) - (val_rows + train_rows)

# reset_index(drop=True) returns a fresh frame with a 0..n-1 index. The
# original called it with inplace=True on an iloc slice, which mutates an
# object derived from `data` and can raise SettingWithCopyWarning.
train = data.iloc[:train_rows].reset_index(drop=True)
val = data.iloc[train_rows:train_rows + val_rows].reset_index(drop=True)
test = data.iloc[train_rows + val_rows:].reset_index(drop=True)

# Sanity check: the three slices must partition the data exactly.
assert len(train) + len(val) + len(test) == len(data)

In [None]:
# Create the Word2Vec model without a corpus; the vocabulary and the weights
# are filled in by the build_vocab / train cells.
# NOTE(review): `size=` is the gensim 3.x keyword — confirm the installed
# gensim version before upgrading.
w2v_model = word2vec.Word2Vec(
    size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    workers=W2V_WORKERS,
)

In [None]:
# One entry per training row; the vocabulary is built from the training
# split only.
train_sentences = list(train['Text'])
w2v_model.build_vocab(train_sentences)

In [None]:
# Train on the same sentences that were used to build the vocabulary.
# W2V_EPOCH comes from the parameters cell; it used to be re-assigned here,
# which silently overrode any change made in that cell.
w2v_model.train(
    train_sentences,
    total_examples=len(train_sentences),
    epochs=W2V_EPOCH,
)

In [None]:
# Persist the trained model (path is relative to the working directory); the
# load cell reads this same file back.
w2v_model.save("w2v_model.model")

In [20]:
# Reload the Word2Vec model from the file written by the save cell, so the
# training cells do not have to be re-run.
w2v_model = word2vec.Word2Vec.load('w2v_model.model')

2021-05-13 18:58:53,675 : INFO : loading Word2Vec object from w2v_model.model
2021-05-13 18:58:54,296 : INFO : loading wv recursively from w2v_model.model.wv.* with mmap=None
2021-05-13 18:58:54,296 : INFO : setting ignored attribute vectors_norm to None
2021-05-13 18:58:54,297 : INFO : loading vocabulary recursively from w2v_model.model.vocabulary.* with mmap=None
2021-05-13 18:58:54,298 : INFO : loading trainables recursively from w2v_model.model.trainables.* with mmap=None
2021-05-13 18:58:54,299 : INFO : setting ignored attribute cum_table to None
2021-05-13 18:58:54,300 : INFO : loaded w2v_model.model


In [32]:
# Keras tokenizer fitted on the training split only; from here on the name
# `tokenizer` refers to this object.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['Text'])

# One extra slot on top of the number of known words: row 0 of the embedding
# matrix is never assigned from word_index (Keras indices start at 1).
vocab_size = 1 + len(tokenizer.word_index)

In [33]:
def _encode_texts(texts):
    """Map token lists to integer ids and pad/truncate them to SEQUENCE_LENGTH."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)


x_train = _encode_texts(train['Text'])
x_val = _encode_texts(val['Text'])

In [34]:
# Fit the label encoder on the training labels only, then encode both splits
# with that same mapping.
encoder = LabelEncoder()
encoder.fit(train['Sentiment'].tolist())

# Reshape each encoded vector into a single-column 2-D array.
y_train = encoder.transform(train['Sentiment'].tolist()).reshape(-1, 1)
y_val = encoder.transform(val['Sentiment'].tolist()).reshape(-1, 1)

In [35]:
# One row per tokenizer index. Rows for words missing from the Word2Vec
# vocabulary (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word not in w2v_model.wv:
        continue
    embedding_matrix[i] = w2v_model.wv[word]

In [36]:
# Embedding layer initialised from the Word2Vec matrix and frozen
# (trainable=False), so its weights are not updated while the classifier trains.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

In [37]:
# Frozen embeddings -> dropout -> single LSTM -> one sigmoid unit for the
# binary sentiment output.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 300)          58473900  
_________________________________________________________________
dropout_2 (Dropout)          (None, 300, 300)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 58,634,401
Trainable params: 160,501
Non-trainable params: 58,473,900
_________________________________________________________________


In [38]:
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

callbacks = [
    # Lower the learning rate when validation loss stops improving.
    ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    # The training log reports the metric as `accuracy`, so the validation key
    # is `val_accuracy`. The previous `val_acc` never matched a logged metric,
    # which left early stopping permanently inactive.
    EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5),
]

history = model.fit(
    x_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_val, y_val),
    verbose=1,
    callbacks=callbacks,
)

# Raw string: '\C', '\M' and '\m' are invalid escape sequences in a normal
# literal. The resulting path is unchanged.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
model.save(r'D:\CSC590_Design_Project\Model\model.model')

Train on 960000 samples, validate on 320000 samples
Epoch 1/8
157696/960000 [===>..........................] - ETA: 1:18:36 - loss: 0.5535 - accuracy: 0.7120

KeyboardInterrupt: 