# **Sentiment Analysis using Feedforward Neural Network and Word2Vec Embeddings**

### **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import gensim.downloader as api
import warnings

nltk.download('stopwords',quiet = True)
nltk.download('wordnet',quiet = True)
warnings.filterwarnings('ignore')

### **Importing Dataset**

In [2]:
data = pd.read_csv("./SentimentData.csv")
data.shape

(50000, 2)

In [3]:
data.sample(10)

Unnamed: 0,review,sentiment
18203,It's a shame this movie didn't get more play i...,positive
10677,"It took us a couple of episodes to ""get into"" ...",positive
10249,Not the best of actors' movies.The director ha...,negative
12053,This DVD usually sells for around $20. I would...,positive
32502,I see a lot of folks on this site wishing AG w...,positive
4587,"Y'know, it's very interesting watching this......",positive
2861,"""The China Syndrome"" could not have been relea...",positive
2202,Another Day - this movie requires you to watch...,negative
27281,IT was no sense and it was so awful... i think...,negative
41673,"On one level, this film can bring out the chil...",positive


In [4]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

### **Text Preprocessing**

In [5]:
def preprocess_text(text):
    #Removing special characters, punctuation, numbers, URLs, and extra spaces.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)#Removes URLs from the given text.
    text = re.sub(r'[^a-zA-Z\s]', '', text)#Removes special characters, punctuations, numbers
    text = re.sub(r'\s+', ' ', text).strip()#Removes extra spaces

    #Converting all text to lower case.
    text = text.lower()

    #Splitting the text into individual tokens (words).
    tokens = text.split()

    #Removing common stopwords (for example, ”the”, ”of”, ”and”,...).
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    #Coverting verbs into their base forms.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

#Preprocessing the 'Reviews' in the data frame and storing them.
data['preprocessed_reviews'] = data['review'].apply(preprocess_text)

# Convert sentiment labels from categorical strings to numerical values
# Mapping: 'positive' -> 1, 'negative' -> 0
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})  # Adjust the mapping as needed based on your actual labels

In [6]:
data.sample(10)

Unnamed: 0,review,sentiment,preprocessed_reviews
2901,Since my third or fourth viewing some time ago...,1,since third fourth viewing time ago ive abstai...
2800,and anyone who watches this film will agree. T...,1,anyone watch film agree film directed day plot...
19793,This is such a great film! Never mind the low ...,1,great film never mind low rating really idea c...
27141,Corean cinema can be quite surprising for an o...,1,corean cinema quite surprising occidental audi...
24946,Spoiler Alert Well I think this movie is proba...,0,spoiler alert well think movie probably worst ...
34606,"I find I enjoy this show, but the format needs...",1,find enjoy show format need work first good at...
46070,alright this movie might have been good if the...,0,alright movie might good plot behind title did...
23116,I went to see this because of snipes / statham...,0,went see snipe statham honestly chaos terrible...
17114,"Mean spirited, and down right degrading adapta...",0,mean spirited right degrading adaptation class...
21232,"if i could rate it a zero i would , coming fro...",0,could rate zero would coming someone like shoc...


### **Word Embeddings with Word2Vec**

In [7]:
#Loading the pre-trained Word2Vec model (Google News)
model = api.load("word2vec-google-news-300")

# Function to create embeddings
def get_review_embedding(review):
    words = review.split()  # Assuming the reviews are space-separated
    word_vectors = []
    
    for word in words:
        if word in model:  # Check if the word is in the model
            word_vectors.append(model[word])
    
    if word_vectors:
        return np.mean(word_vectors, axis=0)  # Average vector
    else:
        return np.zeros(model.vector_size)  # Zero vector if no words are found

# Apply the function to create word embeddings
data['embeddings'] = data['preprocessed_reviews'].apply(get_review_embedding)

#Displaying the data frame after creating embeddings for processed reviews
data.sample(10)

Unnamed: 0,review,sentiment,preprocessed_reviews,embeddings
29873,"Joe Don Baker. He was great in ""Walking Tall"" ...",0,joe baker great walking tall good bitpart gold...,"[0.029917685, 0.022685226, -0.013670604, 0.096..."
14118,Leave it to Braik to put on a good show. Final...,1,leave braik put good show finally zorak living...,"[0.04333496, 0.015791636, -0.0036208138, 0.095..."
40808,"When I first got wind of this picture, it was ...",1,first got wind picture called shepherd suppose...,"[0.07394431, 0.05116824, -0.000601153, 0.06174..."
37787,It is a truism that it takes a lot of effort t...,0,truism take lot effort make bad movie one exce...,"[0.05006476, 0.0034094702, -0.04726294, 0.0843..."
28839,Pretty poor Firestarter clone that seems more ...,0,pretty poor firestarter clone seems like bad t...,"[0.09375922, 0.030691028, -0.01133128, 0.08002..."
30304,So umm this woman has a vagina that sucks peop...,0,umm woman vagina suck people umm there dude li...,"[0.048493944, 0.016780438, 0.010638511, 0.1359..."
14224,"Ever since I started visiting this site, and v...",0,ever since started visiting site voting movie ...,"[0.05767489, 0.027181417, 0.010050765, 0.12420..."
42373,I rented this movie simply because Rosario Daw...,1,rented movie simply rosario dawson sat watch b...,"[0.052316885, 0.037958153, 0.015223141, 0.0830..."
28154,"The most amazing, spiritually uplifting movie ...",1,amazing spiritually uplifting movie restoratio...,"[0.09076538, 0.018694958, -0.022803752, 0.1261..."
24277,**1/2 for this Diane Keaton farce.<br /><br />...,0,diane keaton farcebr br someone tell m keaton ...,"[0.03529368, 0.027727127, -0.005451353, 0.0747..."


In [8]:
len(data['embeddings'][10])

300

### **Splitting the dataset into training, testing and validation sets**

In [9]:
# Preparing the dataset
X = np.array(data['embeddings'].tolist())
y = np.array(data['sentiment'])
# Split the dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### **Building and training Feed Forward Neural Network**

In [10]:
# Build the model
model = keras.Sequential([
    layers.Input(shape=(300,)),  # Adjust for your embedding size
    layers.Dense(512, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=20,
                    batch_size=32)

Epoch 1/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7796 - loss: 0.4375 - val_accuracy: 0.8165 - val_loss: 0.4153
Epoch 2/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8426 - loss: 0.3619 - val_accuracy: 0.8397 - val_loss: 0.3622
Epoch 3/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8550 - loss: 0.3365 - val_accuracy: 0.8492 - val_loss: 0.3419
Epoch 4/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8561 - loss: 0.3305 - val_accuracy: 0.8231 - val_loss: 0.3984
Epoch 5/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8544 - loss: 0.3356 - val_accuracy: 0.8393 - val_loss: 0.3573
Epoch 6/20
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8586 - loss: 0.3251 - val_accuracy: 0.8508 - val_loss: 0.3361
Epoch 7/20
[1m1

### **Evaluation**

In [11]:
# Get predicted probabilities
y_pred_prob = model.predict(X_test)

# Convert probabilities to binary predictions
y_pred = (y_pred_prob > 0.5).astype(int)

# True labels (ensure y_test is in the same format)
# Assuming y_test is already binary (0 or 1)

# Calculate the accuracy
correct_predictions = np.sum(y_pred.flatten() == y_test.flatten())  # Count correct predictions
T = len(y_test)  # Total number of instances in the test set

# Calculate accuracy
accuracy = correct_predictions / T

print(f"Accuracy on D_test: {accuracy:.4f}")

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)

# Print the results
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy on D_test: 0.8476
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8524 - loss: 0.3698
Test Loss: 0.3782
Test Accuracy: 0.8476
