<a href="https://colab.research.google.com/github/cur10usityDrives/Sentiment-Analysis/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Read the train / validation / test splits from CSV files in the working directory.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("imdb_train.csv", "imdb_valid.csv", "imdb_test.csv")
)


def _split_text_and_labels(frame):
    """Return ``(first column, second column)`` of ``frame``, selected by position."""
    return frame.iloc[:, 0], frame.iloc[:, 1]


# For every split, column 0 is used as the text (X) and column 1 as the label (y).
X_train, y_train = _split_text_and_labels(train_data)
X_valid, y_valid = _split_text_and_labels(valid_data)
X_test, y_test = _split_text_and_labels(test_data)

# --- Model 1: TF-IDF over unigrams only ---
# The vocabulary and IDF weights are learned from the training texts alone;
# the validation and test texts are only projected onto that fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(texts) for texts in (X_valid, X_test)
)

# Fit a multinomial Naive Bayes classifier on the unigram TF-IDF matrix
# (fit() returns the estimator itself, so construction and fitting are chained).
naive_bayes = MultinomialNB().fit(X_train_tfidf, y_train)

# Accuracy on the held-out validation split; compared later against the n-gram model.
valid_accuracy = naive_bayes.score(X_valid_tfidf, y_valid)
print("Validation Accuracy (Unigram):", valid_accuracy)

# --- Model 2: TF-IDF over unigrams + bigrams ---
# Same recipe as the unigram model, but ngram_range=(1, 2) adds two-word
# phrases to the vocabulary. Fitted on the training texts only.
tfidf_vectorizer_ngram = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf_ngram = tfidf_vectorizer_ngram.fit_transform(X_train)
X_valid_tfidf_ngram, X_test_tfidf_ngram = map(
    tfidf_vectorizer_ngram.transform, (X_valid, X_test)
)

# Fit a second multinomial Naive Bayes classifier on the uni+bigram matrix.
naive_bayes_ngram = MultinomialNB().fit(X_train_tfidf_ngram, y_train)

# Validation accuracy for the uni+bigram model.
valid_accuracy_ngram = naive_bayes_ngram.score(X_valid_tfidf_ngram, y_valid)
print("Validation Accuracy (Unigram + Bigram):", valid_accuracy_ngram)

# Model selection: keep whichever classifier scored higher on the validation split.
# The comparison is strict, so a tie selects the unigram + bigram model.
unigram_wins = valid_accuracy > valid_accuracy_ngram
best_model, best_vectorizer = (
    (naive_bayes, tfidf_vectorizer)
    if unigram_wins
    else (naive_bayes_ngram, tfidf_vectorizer_ngram)
)

# Final evaluation: vectorize the test texts with the vectorizer that belongs
# to the selected model, predict, and report accuracy against the true labels.
X_test_best = best_vectorizer.transform(X_test)
test_pred = best_model.predict(X_test_best)
test_accuracy = accuracy_score(y_test, test_pred)
print("Test Accuracy (Best Model):", test_accuracy)
# Rank features by how far apart their per-class log probabilities are and
# report the ten with the largest gap, regardless of which class they favour.
import numpy as np

feature_names = best_vectorizer.get_feature_names_out()
feature_probs = best_model.feature_log_prob_

# Rows 0 and 1 of feature_log_prob_ hold the per-feature log probabilities of
# the first two classes; the magnitude of their gap is the ranking score.
log_prob_row0, log_prob_row1 = feature_probs[0], feature_probs[1]
differences = np.abs(log_prob_row1 - log_prob_row0)

# argsort is ascending, so reverse it and keep the first ten positions
# (largest gap first).
top_10_indices = differences.argsort()[::-1][:10]
top_10_features = list(feature_names[top_10_indices])
print("Top-10 Most Predictive Features:", top_10_features)

Validation Accuracy (Unigram): 0.8616
Validation Accuracy (Unigram + Bigram): 0.8846
Test Accuracy (Best Model): 0.8934
Top-10 Most Predictive Features: ['worst movie', 'waste of', 'waste your', 'waste', 'worst movies', 'don waste', 'worst film', 'the worst', 'worst', 'this crap']


In [None]:
# Split the most predictive features by direction: the ten that lean hardest
# towards row 1 of feature_log_prob_ and the ten that lean hardest towards row 0.
# NOTE(review): row 1 is treated as "positive" and row 0 as "negative"; the row
# order follows best_model.classes_, so confirm that matches the label encoding.
feature_names = best_vectorizer.get_feature_names_out()
feature_probs = best_model.feature_log_prob_  # available on MultinomialNB
log_prob_row0, log_prob_row1 = feature_probs[0], feature_probs[1]

# Positive side: largest (row 1 - row 0) gap first.
pos_margin = log_prob_row1 - log_prob_row0
top_10_indices_pos = pos_margin.argsort()[::-1][:10]
top_10_features_pos = list(feature_names[top_10_indices_pos])
print("Top-10 Most Predictive Features for positive sentiment:", top_10_features_pos)

# Negative side: largest (row 0 - row 1) gap first.
neg_margin = log_prob_row0 - log_prob_row1
top_10_indices_neg = neg_margin.argsort()[::-1][:10]
top_10_features_neg = list(feature_names[top_10_indices_neg])
print("Top-10 Most Predictive Features for negative sentiment:", top_10_features_neg)

Top-10 Most Predictive Features for positive sentiment: ['highly recommended', 'well worth', '10 10', 'loved this', 'is must', 'highly recommend', 'matthau', 'must see', 'definitely worth', 'loved it']
Top-10 Most Predictive Features for negative sentiment: ['worst movie', 'waste of', 'waste your', 'waste', 'worst movies', 'don waste', 'worst film', 'the worst', 'worst', 'this crap']


In [None]:
# Identify the top-10 most predictive features by the magnitude of each
# feature's class-separating score.
import numpy as np

feature_names = best_vectorizer.get_feature_names_out()
if hasattr(best_model, 'feature_log_prob_'):
    # Naive Bayes models (MultinomialNB, BernoulliNB, ...) expose one row of
    # per-feature log probabilities per class.
    feature_probs = best_model.feature_log_prob_
else:
    # Fallback for estimators that only expose coef_ (e.g. linear classifiers).
    # atleast_2d keeps the row indexing below valid even if coef_ is 1-D.
    feature_probs = np.atleast_2d(best_model.coef_)

if feature_probs.shape[0] == 1:
    # Binary linear classifiers store a single weight row of shape
    # (1, n_features); indexing row 1 would raise IndexError, and the single
    # row already is the per-feature score, so use its magnitude directly.
    differences = np.abs(feature_probs[0])
else:
    # Two or more rows: gap between the second and the first row.
    differences = np.abs(feature_probs[1] - feature_probs[0])

# argsort is ascending: take the last ten positions and reverse them so the
# feature with the largest score comes first.
top_10_indices = differences.argsort()[-10:][::-1]
top_10_features = [feature_names[idx] for idx in top_10_indices]
print("Top-10 Most Predictive Features:", top_10_features)


Top-10 Most Predictive Features: ['worst movie', 'waste of', 'waste your', 'waste', 'worst movies', 'don waste', 'worst film', 'the worst', 'worst', 'this crap']
