SENTIMENTAL ANALYSIS ON MOVIE REVIEWS

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
review_df = pd.read_csv("https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/X_train.csv")
review_df

Unnamed: 0,review
0,"Shame, is a Swedish film in Swedish with Engli..."
1,I know it's rather unfair to comment on a movi...
2,"""Bread"" very sharply skewers the conventions o..."
3,After reading tons of good reviews about this ...
4,During the Civil war a wounded union soldier h...
...,...
39995,"As a Pagan, I must say this movie has little i..."
39996,A lot of the comments seem to treat this film ...
39997,I've only seen most of the series since I leav...
39998,"The ""all I have is 5 dollars and my wedding ri..."


DATA PREPROCESSING

In [3]:
def clean_text(text):
    text = re.sub(r"[^a-zA-Z]", " ", text)  # Remove non-alphabetic characters
    return text

review_df["review"] = review_df["review"].apply(clean_text)

In [4]:
review_df["review"] = review_df["review"].str.lower()

In [5]:
nltk.download('punkt')
def tokenize_text(text):
    return nltk.word_tokenize(text)

review_df["review"] = review_df["review"].apply(tokenize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

review_df["review"] = review_df["review"].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

review_df["review"] = review_df["review"].apply(lemmatize_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


with the distribution its clear that the data is balanced data.

In [8]:
!pip install textblob



ADD SENTIMENT COLUMN

In [9]:
from textblob import TextBlob

sentiments = []
for review in review_df['review']:
    review_text = str(review)  # Convert the review to a string
    blob = TextBlob(review_text)
    polarity = blob.sentiment.polarity
    sentiment = 1 if polarity > 0 else 0  # Assign sentiment label based on polarity
    sentiments.append(sentiment)


In [10]:
# Add the sentiment column to the dataset
review_df['Sentiment'] = sentiments

# Save the updated dataset
review_df.to_csv("labeled_data.csv", index=False)

In [11]:
review_df

Unnamed: 0,review,Sentiment
0,"[shame, swedish, film, swedish, english, subti...",1
1,"[know, rather, unfair, comment, movie, without...",0
2,"[bread, sharply, skewer, convention, horror, m...",1
3,"[reading, ton, good, review, movie, decided, t...",1
4,"[civil, war, wounded, union, soldier, hide, is...",1
...,...,...
39995,"[pagan, must, say, movie, little, magickal, si...",1
39996,"[lot, comment, seem, treat, film, baseball, mo...",1
39997,"[seen, series, since, leave, tv, background, n...",0
39998,"[dollar, wedding, ring, scene, riot, also, guf...",1


In [12]:
# Assuming your labeled data is stored in a DataFrame called 'df_labels'
sentiment_counts = review_df['Sentiment'].value_counts()

print("Sentiment Distribution:")
print(sentiment_counts)

Sentiment Distribution:
Sentiment
1    29399
0    10601
Name: count, dtype: int64


In [13]:
X = review_df['review']
y = review_df['Sentiment']

BALANCE THE CLASS

In [14]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler

#Perform oversampling to balance the classes (assuming binary classification)
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(review_df['review'].apply(' '.join).values.reshape(-1, 1), review_df['Sentiment'])

In [15]:
# Create a balanced DataFrame
balanced_review_df = pd.DataFrame({'review': X_resampled.squeeze(), 'Sentiment': y_resampled})

TRAIN AND TEST SPLITTING

In [16]:
from sklearn.model_selection import train_test_split

#Split the data into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(balanced_review_df['review'], balanced_review_df['Sentiment'], test_size=0.2, random_state=42)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Feature extraction (TF-IDF)
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_data)
val_features = vectorizer.transform(val_data)

In [20]:
# # Count the number of features
# num_features = len(vectorizer.get_feature_names())
# print("Number of features:", num_features)
# Count the number of features
num_features = len(vectorizer.get_feature_names_out())
print("Number of features:", num_features)

Number of features: 75448


#NAVIE BAYES CLASSIFIER

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [22]:
# Step 6: Model Training and Evaluation (Naive Bayes)
nb_model = MultinomialNB()
nb_model.fit(train_features, train_labels)
nb_predictions = nb_model.predict(val_features)

In [23]:
# Step 7: Print classification report
print("Naive Bayes Classification Report:")
print(classification_report(val_labels, nb_predictions))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.89      0.84      5869
           1       0.87      0.77      0.82      5891

    accuracy                           0.83     11760
   macro avg       0.83      0.83      0.83     11760
weighted avg       0.83      0.83      0.83     11760



Overall, the Naive Bayes model achieved reasonable performance on the test set with an accuracy of 83%. The precision, recall, and F1-score indicate satisfactory performance for both class 0 and class 1.

SVM CLASSIFIER

In [24]:
from sklearn.svm import SVC

# Create an SVM classifier
svm = SVC()

# Train the SVM model
svm.fit(train_features, train_labels)

# Predict on the test data
svm_predictions = svm.predict(val_features)

# Calculate the evaluation metrics for SVM
svm_metrics = classification_report(val_labels, svm_predictions)

# Print the evaluation metrics for SVM
print("SVM Classification Report:")
print(svm_metrics)

RANDOM FOREST CLASSIFIER

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier
rf = RandomForestClassifier()

# Train the Random Forest model
rf.fit(train_features, train_labels)

# Predict on the test data
rf_predictions = rf.predict(val_features)

# Calculate the evaluation metrics for Random Forest
rf_metrics = classification_report(val_labels, rf_predictions)

# Print the evaluation metrics for Random Forest
print("Random Forest Classification Report:")
print(rf_metrics)


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      5869
           1       0.95      0.94      0.95      5891

    accuracy                           0.95     11760
   macro avg       0.95      0.95      0.95     11760
weighted avg       0.95      0.95      0.95     11760



LOGISTIC REGRESSION

In [27]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(train_features, train_labels)
logreg_predictions = logreg.predict(val_features)
logreg_report = classification_report(val_labels, logreg_predictions)
print("Logistic Regression Classification Report:")
print(logreg_report)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      5869
           1       0.96      0.92      0.94      5891

    accuracy                           0.94     11760
   macro avg       0.94      0.94      0.94     11760
weighted avg       0.94      0.94      0.94     11760



GRADIENT BOOSTING CLASSIFIER

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting
gradient_boosting = GradientBoostingClassifier()
gradient_boosting.fit(train_features, train_labels)
gb_predictions = gradient_boosting.predict(val_features)
gb_report = classification_report(val_labels, gb_predictions)
print("Gradient Boosting Classification Report:")
print(gb_report)

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      5869
           1       0.84      0.84      0.84      5891

    accuracy                           0.84     11760
   macro avg       0.84      0.84      0.84     11760
weighted avg       0.84      0.84      0.84     11760



PERFORMING HYPER PARAMETER TUNING

In [30]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Instantiate the Ridge model
ridge_model = Ridge()

# Define the parameter grid for hyperparameter tuning
param_grid = {'alpha': [0.1, 1.0, 10.0], 'solver': ['auto']}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, cv=5)

# Fit the Grid Search object to the data
grid_search.fit(train_features, train_labels)  # Replace X_train and y_train with your training data

# Print the best hyperparameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best parameters: {'alpha': 1.0, 'solver': 'auto'}
Best score: 0.6794608841192258


In [31]:
import joblib

# Save the model to a file
joblib.dump(logreg, 'linear_regression_model.pkl')

['linear_regression_model.pkl']