# Sentiment Analysis
------------
### A01378965 - Emilio Rios Ochoa
### A01379868 - Jared Abraham Flores Guarneros

# Naive Bayes method
## *(Reference Only)*

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [None]:
# Load the IMDB reviews CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

In [None]:
# Fraction of the dataset held out for testing (the remainder is used for training)
test_group = 0.5
# Partition the review texts and their sentiment labels into train/test subsets;
# the fixed random_state makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df["review"],
    df["sentiment"],
    test_size=test_group,
    random_state=42,
)

In [None]:
# Two-stage pipeline: the text is first turned into token counts and only then classified,
# so the same vectorization is applied whenever the model is fitted or used for prediction
model = Pipeline(
    steps=[
        ("vectorizer", CountVectorizer()),  # text -> token-count features
        ("classifier", MultinomialNB()),    # Multinomial Naive Bayes on those counts
    ]
)

In [None]:
# Fit the pipeline on the training split and immediately predict the held-out reviews
# (fit returns the fitted pipeline itself, so `model` stays usable afterwards)
y_pred = model.fit(X_train, y_train).predict(X_test)
# Compare predictions against the true labels: overall accuracy plus per-class metrics
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Show the accuracy as a percentage with two decimals, followed by the per-class report
print("Accuracy: {:.2f}%".format(100 * accuracy))
print("Report:\n", report)

Accuracy: 84.75%
Report:
               precision    recall  f1-score   support

    negative       0.83      0.87      0.85     12483
    positive       0.87      0.82      0.84     12517

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



In [None]:
# Hand-written reviews that are not part of the dataset, used as a quick sanity check
new_comments = [
    "It was really awful mate",
    "A bit slow, I loved it though",
    "I hated it, the actors were as bad as my back pain",
]
# The pipeline vectorizes the raw strings itself before classifying them
new_pred = model.predict(new_comments)

print("New review test:", new_comments)
print("Test prediction:", new_pred)


New review test: ['It was really awful mate', 'A bit slow, I loved it though', 'I hated it, the actors were as bad as my back pain']
Test prediction: ['negative' 'positive' 'negative']


# SVM Method

In [None]:
# For source data manipulation
import pandas as pd
# For regex manipulation in data cleaning
import re
# For plotting the confusion matrix
import matplotlib.pyplot as plt
# For model construction
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, classification_report
# NLTK for language manipulation
from nltk import download as nltk_download
# Fetch the NLTK resources used by the cleaning step: the stop-word list, the
# tokenizer data and the WordNet database used by the lemmatizer
nltk_download("stopwords")
nltk_download("punkt")
# Newer NLTK releases load the tokenizer data used by word_tokenize from "punkt_tab"
# instead of "punkt"; without it word_tokenize raises LookupError on a fresh environment
nltk_download("punkt_tab")
nltk_download("wordnet")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [24]:
# Load the IMDB reviews CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

In [25]:
# Function to remove any unnecessary information from each comment
# Words that introduce a condition; clean_text rewrites each of them as _NEGATION_TOKEN
_CONDITIONAL_WORDS = frozenset({"unless", "if", "until", "except"})
_NEGATION_TOKEN = "not"
# Patterns are compiled once here instead of being looked up on every call
_HTML_TAG_PATTERN = re.compile("<.*?>")
_NON_LETTER_PATTERN = re.compile("[^a-zA-Z]")
# Lazily filled cache for the NLTK resources, so they are built once rather than once per review
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (stop_words, lemmatizer) pair used by clean_text, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # The negation token is kept out of the stop-word set: "not" is part of NLTK's English
        # stop words, so the "not" tokens inserted by clean_text would otherwise be removed
        # again by its own stop-word filter
        _CLEAN_TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english")) - {_NEGATION_TOKEN}
        _CLEAN_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES["stop_words"], _CLEAN_TEXT_RESOURCES["lemmatizer"]


# Function to remove any unnecessary information from each comment
def clean_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps, in order: strip HTML tags, replace every non-letter character with a
    space, lowercase, tokenize, rewrite conditional words ("unless", "if",
    "until", "except") as "not", drop English stop words (except "not") and
    lemmatize what is left.

    Args:
        text: The raw review text (a string).

    Returns:
        The cleaned review as a single string with tokens joined by one space.
    """
    stop_words, lemmatizer = _get_clean_text_resources()
    # Replace HTML tags with a space (not with nothing) so that the words on either
    # side of a tag such as "<br />" are not glued together into one token
    text = _HTML_TAG_PATTERN.sub(" ", text)
    # Replace anything that is not a letter (numbers, apostrophes, etc.) with a space
    text = _NON_LETTER_PATTERN.sub(" ", text)
    text = text.lower()
    # Tokenize text (split sentences into words)
    tokens = word_tokenize(text)
    # Replace conditional statements with the negation word; this must happen before
    # stop-word removal because "if" and "until" are themselves stop words
    negated_tokens = [_NEGATION_TOKEN if token in _CONDITIONAL_WORDS else token for token in tokens]
    # Remove stop words (such as the, for, etc.) and lemmatize the remaining tokens
    # (consider similar words as just one)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in negated_tokens if token not in stop_words
    ]
    # Create a new cleaned comment/review string
    return " ".join(lemmatized_tokens)


In [None]:
# Split the data into the text and labels for training
X = df["review"]
y = df["sentiment"]

# Display the original text before cleaning
print("=====> BEFORE CLEANING:")
print(X)

# Clean every review in a single pass. Series.map applies clean_text element by element
# and keeps the original index, so this does not rely on the dataframe having the default
# 0..n-1 row labels and avoids one label-based dataframe write per row.
# The cleaned text is written back to the "review" column, as before.
df["review"] = X.map(clean_text)

# "Reloads" the review column since it was replaced during the cleaning
X = df["review"]
# Display the text after cleaning
print("=====> AFTER CLEANING:")
print(X)


In [27]:
# Convert text data to numerical values to be interpreted by the model
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [28]:
# Percentage of the dataset to be used as part of the testing
test_group = 0.25
# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_group, random_state=42)

In [29]:
# Creates the model using a linear kernel and trains it
svm = LinearSVC()
svm.fit(X_train, y_train)

# The model is tested to validate its accuracy
y_pred = svm.predict(X_test)

# The accuracy and other metrics are reported to evaluate its performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [None]:
# Show the accuracy as a percentage with two decimals, followed by the per-class report
print("Accuracy: {:.2f}%".format(100 * accuracy))
print("Report:\n", report)

In [None]:
# Executes a few more tests (not seen in the dataset) to evaluate the prediction's accuracy
new_comments = ["It was really awful mate", "A bit slow, I loved it though", "I hated it, the actors were as bad as my back pain"]
# The vectorizer was fitted on reviews that went through clean_text, so new comments
# must receive the same cleaning before being vectorized; otherwise words that only
# exist in the vocabulary in their cleaned form (e.g. "actors" -> "actor") are ignored
new_pred = svm.predict(
    vectorizer.transform([clean_text(comment) for comment in new_comments])
)

print("New review test:", new_comments)
print("Test prediction:", new_pred)

In [None]:
# Build the confusion matrix from the test labels and the SVM predictions
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
# Wrap it in a display object that labels the two classes on the axes
cm_graph = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=["Negative", "Positive"],
)
# Draw the matrix without the colour bar and render the figure
cm_graph.plot(colorbar=False)
plt.show()