In [None]:
import pandas as pd

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Directories holding the labelled train/test splits of the aclImdb corpus,
# given relative to the notebook's working directory.
train_dir = './aclImdb/train'
test_dir = './aclImdb/test'

# Load both splits with identical options (train first, then test). Only the
# 'pos' and 'neg' sub-folders are read; any other folder is ignored.
data_train, data_test = (
    load_files(split_dir, categories=['pos', 'neg'], shuffle=True, encoding='utf-8')
    for split_dir in (train_dir, test_dir)
)

# Raw review texts (X_*) and their integer class labels (y_*).
X_train, X_test = data_train.data, data_test.data
y_train, y_test = data_train.target, data_test.target

In [None]:
# Baseline model: unigram + bigram counts (vocabulary capped at 20,000
# features) fed into a multinomial Naive Bayes classifier.
nb_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 2), max_features=20000)),
    ('nb', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline itself, so training and test-set
# prediction can be chained in one expression.
nb_preds = nb_pipeline.fit(X_train, y_train).predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=nb_preds))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=nb_preds))

In [None]:
# Line up every test review with its true label and the Naive Bayes prediction
# so the errors can be inspected by hand.
df_nb = pd.DataFrame({
    'review': X_test,
    'true_label': y_test,
    'predicted_label': nb_preds
})

# Keep only the rows where the prediction disagrees with the ground truth.
misclassified_nb = df_nb[df_nb['true_label'] != df_nb['predicted_label']]
print(f"Number of misclassified NB reviews: {len(misclassified_nb)}")

# Show a handful of errors. The seed is fixed so the same examples appear after
# "Restart & Run All", and the sample size is capped because DataFrame.sample
# raises ValueError when asked for more rows than exist.
nb_examples = misclassified_nb.sample(
    n=min(5, len(misclassified_nb)),
    random_state=42,
)
for _, row in nb_examples.iterrows():
    print(f"True label: {row['true_label']}, Predicted: {row['predicted_label']}")
    print(row['review'])
    print("-"*80)

In [None]:
# Second model: TF-IDF weighted unigrams + bigrams (vocabulary capped at
# 20,000 features) fed into logistic regression. max_iter is raised to 2000
# to give the solver more iterations to converge.
lr_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=20000)),
    ('lr', LogisticRegression(max_iter=2000)),
])

# Pipeline.fit returns the fitted pipeline itself, so training and test-set
# prediction can be chained in one expression.
lr_preds = lr_pipeline.fit(X_train, y_train).predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_true=y_test, y_pred=lr_preds))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=lr_preds))

In [None]:
# Line up every test review with its true label and the logistic regression
# prediction so the errors can be inspected by hand.
df_lr = pd.DataFrame({
    'review': X_test,
    'true_label': y_test,
    'predicted_label': lr_preds
})

# Keep only the rows where the prediction disagrees with the ground truth.
misclassified_lr = df_lr[df_lr['true_label'] != df_lr['predicted_label']]
print(f"Number of misclassified reviews: {len(misclassified_lr)}")

# Show a handful of errors. The seed is fixed so the same examples appear after
# "Restart & Run All", and the sample size is capped because DataFrame.sample
# raises ValueError when asked for more rows than exist.
lr_examples = misclassified_lr.sample(
    n=min(5, len(misclassified_lr)),
    random_state=42,
)
for _, row in lr_examples.iterrows():
    print(f"True label: {row['true_label']}, Predicted: {row['predicted_label']}")
    print(row['review'])
    print("-"*80)

In [None]:
# Ensemble model: Naive Bayes and logistic regression are trained on the same
# TF-IDF features (unigrams + bigrams, vocabulary capped at 20,000 features)
# and their outputs are combined by a VotingClassifier.
ensemble_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1,2))),
    ('ensemble', VotingClassifier(
        estimators=[
            ('nb', MultinomialNB()),
            ('lr', LogisticRegression(max_iter=2000))
        ],
        # Soft voting averages the two models' predicted class probabilities.
        # Hard (majority) voting is unsuitable with exactly two estimators:
        # every disagreement is a 1-1 tie, and VotingClassifier resolves ties
        # in favour of the lowest class label, so the ensemble would predict
        # class 1 only when both models agree on it.
        voting='soft'
    ))
])

ensemble_pipeline.fit(X_train, y_train)

ensemble_preds = ensemble_pipeline.predict(X_test)
print("Ensemble Model Accuracy:", accuracy_score(y_test, ensemble_preds))
print("\nClassification Report:\n", classification_report(y_test, ensemble_preds))

In [None]:
# Line up every test review with its true label and the ensemble prediction
# so the errors can be inspected by hand.
df_ensemble = pd.DataFrame({
    'review': X_test,
    'true_label': y_test,
    'predicted_label': ensemble_preds
})

# Keep only the rows where the prediction disagrees with the ground truth.
misclassified_ensemble = df_ensemble[df_ensemble['true_label'] != df_ensemble['predicted_label']]
print(f"Number of misclassified reviews in ensemble: {len(misclassified_ensemble)}")

# Show a handful of errors. The seed is fixed so the same examples appear after
# "Restart & Run All", and the sample size is capped because DataFrame.sample
# raises ValueError when asked for more rows than exist.
ensemble_examples = misclassified_ensemble.sample(
    n=min(5, len(misclassified_ensemble)),
    random_state=42,
)
for _, row in ensemble_examples.iterrows():
    print(f"True label: {row['true_label']}, Predicted: {row['predicted_label']}")
    print(row['review'])
    print("-"*80)

## Analysis of Misclassified Reviews

### From examining the reviews our models misclassified, we identified a couple of reasons why these basic models struggle to label them correctly:

#### 1. Word weighting in long reviews:
For lengthy reviews, our models treat all words equally, regardless of where they appear. When a review contains many positive words, these can outweigh the negative words (or vice versa), causing the model to misclassify the review. For example, a negative review with some early positive phrases might be labeled as positive because the total count of positive words dominates the prediction.

#### 2. Lack of contextual understanding:
Our models cannot capture context or sarcasm. As we read some of the misclassified reviews ourselves, a clear trend emerged: words were being scored without regard to how they were used.

Take this short snippet as an example: 
- "This film is about a bunch of misfits who are supposed to be assigned to a task that is expected to fail miserably. The misfits pull together to successfully complete their mission."

As you can see, this is very clearly a summary of the movie's plot, but our models see words like "misfits," "fail," and "miserably" and take them completely out of context, treating them as evidence for a negative prediction.