# **Sentiment Analysis using Feedforward Neural Network and Word2Vec Embeddings**

### **Importing Libraries**

In [29]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import gensim.downloader as api
import warnings

nltk.download('stopwords',quiet = True)
nltk.download('wordnet',quiet = True)
warnings.filterwarnings('ignore')

### **Importing Dataset**

In [13]:
data = pd.read_csv("./SentimentData.csv")
data.shape

(50000, 2)

In [14]:
data.sample(10)

Unnamed: 0,review,sentiment
12589,"The Japanese ""Run Lola Run,"" his is one offbea...",positive
3511,It must have been excruciating to attend the d...,negative
41657,"So, where are the cannibals? Those intrigued b...",negative
15098,"Just read through the other comments here, and...",negative
48994,Rarely does one find a movie so bad that it ac...,negative
32412,It's refreshing to see a movie that you think ...,positive
38907,I too had waited a long time to see this film....,negative
35012,I never met a single person (out of hundreds) ...,positive
6764,"After the mysterious death of an old friend,a ...",negative
43324,It was agonizingly bad movie. It will eat your...,negative


In [15]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

### **Text Preprocessing**

In [16]:
def preprocess_text(text):
    #Removing special characters, punctuation, numbers, URLs, and extra spaces.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)#Removes URLs from the given text.
    text = re.sub(r'[^a-zA-Z\s]', '', text)#Removes special characters, punctuations, numbers
    text = re.sub(r'\s+', ' ', text).strip()#Removes extra spaces

    #Converting all text to lower case.
    text = text.lower()

    #Splitting the text into individual tokens (words).
    tokens = text.split()

    #Removing common stopwords (for example, ”the”, ”of”, ”and”,...).
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    #Coverting verbs into their base forms.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

#Preprocessing the 'Reviews' in the data frame and storing them.
data['preprocessed_reviews'] = data['review'].apply(preprocess_text)

# Convert sentiment labels from categorical strings to numerical values
# Mapping: 'positive' -> 1, 'negative' -> 0
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})  # Adjust the mapping as needed based on your actual labels

In [17]:
data.sample(10)

Unnamed: 0,review,sentiment,preprocessed_reviews
14412,Jon Voight plays a man named Joe. Joe is shook...,1,jon voight play man named joe joe shook haunti...
26100,"This film is a travesty, and isn't fit to keep...",0,film travesty isnt fit keep company superior o...
6096,Keys to the VIP is just another one of the hor...,0,key vip another one horrible tv show see stati...
19420,Pressburger and Powell's greatest movie. David...,1,pressburger powell greatest movie david niven ...
41907,The morbid Catholic writer Gerard Reve (Jeroen...,1,morbid catholic writer gerard reve jeroen krab...
9316,The decline series is amazing and director PS ...,1,decline series amazing director p cant get eno...
33000,I was permanently scarred by this terrible fil...,0,permanently scarred terrible filmbr br main ac...
12775,I saw this movie for the first time on a sick ...,1,saw movie first time sick day school ten year ...
5895,"First and foremost I am a gay man, although do...",0,first foremost gay man although live life with...
3489,Lovely piece of good cinema. This is one of th...,1,lovely piece good cinema one film see smiling ...


### **Word Embeddings with Word2Vec**

In [18]:
#Loading the pre-trained Word2Vec model (Google News)
model = api.load("word2vec-google-news-300")

# Function to create embeddings
def get_review_embedding(review):
    words = review.split()  # Assuming the reviews are space-separated
    word_vectors = []
    
    for word in words:
        if word in model:  # Check if the word is in the model
            word_vectors.append(model[word])
    
    if word_vectors:
        return np.mean(word_vectors, axis=0)  # Average vector
    else:
        return np.zeros(model.vector_size)  # Zero vector if no words are found

# Apply the function to create word embeddings
data['embeddings'] = data['preprocessed_reviews'].apply(get_review_embedding)

#Displaying the data frame after creating embeddings for processed reviews
data.sample(10)

Unnamed: 0,review,sentiment,preprocessed_reviews,embeddings
31789,I was going to use 'The German Scream' as a su...,1,going use german scream summary already taken ...,"[0.059794337, 0.020221801, -5.5403934e-06, 0.0..."
30397,After being forced to sit through some real st...,1,forced sit real stinker racing stripe shark bo...,"[0.07020223, 0.011536153, 0.03207957, 0.091129..."
18471,We can start with the wooden acting but this f...,0,start wooden acting film disaster grown ny tel...,"[0.044482287, 0.020468945, 0.025942635, 0.0817..."
19857,"This game is one of the best RPG. Fist, It is ...",1,game one best rpg fist actually amusing battle...,"[0.089101896, 0.043404475, 0.019463645, 0.0744..."
4835,For me personally this film goes down in my to...,1,personally film go top four time exception jam...,"[0.036895655, 0.025342613, -0.03427226, 0.0757..."
25590,Quentin in my opinion has written and directed...,0,quentin opinion written directed really one go...,"[0.060750842, -0.00020087519, -0.021714706, 0...."
35994,This is the ultimate Kung Fu movie! This is th...,0,ultimate kung fu movie kung fu movie kung fu m...,"[0.08563451, 0.025835427, -0.0016312461, 0.100..."
30877,I had to watch this film because the plot was ...,0,watch film plot outrageous film lived expectat...,"[0.077582434, 0.028249007, 0.013588538, 0.0659..."
32857,Forget about the plot of this movie. Forget ab...,1,forget plot movie forget fact wonderfully acte...,"[0.06402447, 0.0041672927, 0.011843051, 0.0707..."
8952,I really do not know what people have against ...,1,really know people film definitely one favouri...,"[0.054865334, 0.023559848, -0.009990472, 0.085..."


In [19]:
len(data['embeddings'][10])

300

### **Splitting the dataset into training, testing and validation sets**

In [20]:
# Preparing the dataset
X = np.array(data['embeddings'].tolist())
y = np.array(data['sentiment'])
# Split the dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### **Building and training Feed Forward Neural Network**

In [32]:
# Build the model
model = keras.Sequential([
    layers.Input(shape=(300,)),  # Adjust for your embedding size
    layers.Dense(512, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=20,
                    batch_size=32)

Epoch 1/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7971 - loss: 0.4346 - val_accuracy: 0.8195 - val_loss: 0.4138
Epoch 2/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8475 - loss: 0.3530 - val_accuracy: 0.8321 - val_loss: 0.3689
Epoch 3/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8502 - loss: 0.3460 - val_accuracy: 0.8513 - val_loss: 0.3425
Epoch 4/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8594 - loss: 0.3339 - val_accuracy: 0.8160 - val_loss: 0.3941
Epoch 5/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8573 - loss: 0.3333 - val_accuracy: 0.8493 - val_loss: 0.3431
Epoch 6/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.8646 - loss: 0.3196 - val_accuracy: 0.8497 - val_loss: 0.3396
Epoch 7/20
[1m1

### **Evaluation**

In [33]:
# Get predicted probabilities
y_pred_prob = model.predict(X_test)

# Convert probabilities to binary predictions
y_pred = (y_pred_prob > 0.5).astype(int)

# True labels (ensure y_test is in the same format)
# Assuming y_test is already binary (0 or 1)

# Calculate the accuracy
correct_predictions = np.sum(y_pred.flatten() == y_test.flatten())  # Count correct predictions
T = len(y_test)  # Total number of instances in the test set

# Calculate accuracy
accuracy = correct_predictions / T

print(f"Accuracy on D_test: {accuracy:.4f}")

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)

# Print the results
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy on D_test: 0.8593
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8636 - loss: 0.3771
Test Loss: 0.3927
Test Accuracy: 0.8593
