# Cell 1 — Imports

In [1]:
import pandas as pd
import numpy as np
from joblib import load


# Cell 2 — Load Test Data & Model

In [2]:
# Locations of the held-out split and the persisted classifier.
test_csv_path = "../data/processed/test.csv"
model_path = "../models/logistic_regression.pkl"

# Keep the full frame around (later cells attach columns to it) and pull out
# the text and label columns used for evaluation.
test_df = pd.read_csv(test_csv_path)
X_test_text, y_test = test_df["review"], test_df["sentiment"]

# joblib.load unpickles the file, so only point it at artifacts you trust.
lr_model = load(model_path)


# Cell 3 — Rebuild TF-IDF (same config)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer is re-fitted here rather than loaded from disk.
# The resulting feature space only lines up with `lr_model` if training used
# exactly these settings and this train.csv -- confirm, or persist and load the
# fitted vectorizer next to the model to remove the assumption.
tfidf = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    sublinear_tf=True,
)

X_train_text = pd.read_csv("../data/processed/train.csv")["review"]
tfidf.fit(X_train_text)

X_test = tfidf.transform(X_test_text)

# Fail fast, with a readable message, if the rebuilt feature space cannot
# belong to the loaded model. This only catches a size mismatch; an equal-sized
# but different vocabulary would still pass.
n_model_features = lr_model.coef_.shape[1]
if X_test.shape[1] != n_model_features:
    raise ValueError(
        f"Rebuilt TF-IDF produces {X_test.shape[1]} features but the loaded "
        f"model expects {n_model_features}; the vectorizer configuration or "
        "training data differs from what the model was trained with."
    )


# Cell 4 — Predictions & Errors

In [4]:
# Score every test review, then flag the rows where the predicted label
# agrees with the true one.
predicted_labels = lr_model.predict(X_test)
test_df["prediction"] = predicted_labels
test_df["correct"] = test_df["prediction"].eq(test_df["sentiment"])

# Quick look at the first rows with the two new columns attached.
test_df.head()


Unnamed: 0,review,sentiment,prediction,correct
0,although year still remember complete waste co...,0,0,True
1,blob great horror movie merely vividly horrifi...,1,1,True
2,saddest thing film people cared leave review n...,1,1,True
3,warning review reveal ending movie scoop dont ...,0,0,True
4,film garnered interest praise received simply ...,0,0,True


# Cell 5 — Misclassified Samples

In [5]:
# Select the rows the model got wrong by inverting the boolean "correct" mask.
is_wrong = ~test_df["correct"]
errors = test_df.loc[is_wrong]
errors.head(10)


Unnamed: 0,review,sentiment,prediction,correct
64,hollywood long love affair bogus arabian night...,0,1,False
74,michael feifer writes directs fictitious story...,0,1,False
84,watching john preform one kind show shareit re...,1,0,False
87,could breakout role valeria golino film instea...,0,1,False
111,preface stating big fan jjl one patrick theref...,0,1,False
121,known brad linaweaver florida state early insp...,0,1,False
130,purchased dvd recently totally awed rush song ...,1,0,False
131,spoiler includedalthough many commentator call...,1,0,False
137,back smalltime texas filmmaker named brownrigg...,1,0,False
138,agreeable boy paper nonsense sprightly perform...,1,0,False


# Cell 6 — Inspect False Positives & Negatives

In [6]:
# Every row in `errors` is a miss, so the true label alone determines the error
# type (assuming binary 0/1 labels, as the displayed rows show): a true 0 was
# predicted positive, a true 1 was predicted negative.
true_label = errors["sentiment"]
false_positive = errors.loc[true_label.eq(0)]
false_negative = errors.loc[true_label.eq(1)]

print("False Positives:", len(false_positive))
print("False Negatives:", len(false_negative))


False Positives: 263
False Negatives: 241


# Cell 7 — Read Failure Examples

In [7]:
# Print the first misclassified review of each kind, false positive first.
for banner, misses in (
    ("FALSE POSITIVE EXAMPLE:\n", false_positive),
    ("\nFALSE NEGATIVE EXAMPLE:\n", false_negative),
):
    print(banner)
    print(misses.iloc[0]["review"])


FALSE POSITIVE EXAMPLE:

hollywood long love affair bogus arabian night tale product stood test time memorable jon hall maria montez film long since become camp one filled dubbed song anachronistic slang slapstick truly bounteous crop mesopotamian corn pretty near intolerable today nominated imaginative special effect almost unnoticeable day age consisting mainly trick photography outstanding positive feature survives beautiful color clarity sad say many film made genre come alexander korda original thief baghdad almost arabian night film superior one though loser

FALSE NEGATIVE EXAMPLE:

watching john preform one kind show shareit really something watch grown man portray child like fact every character became could picture looked like entertaining understand individual freak real stand john real talent


# Cell 8 — Most Influential Words

In [8]:
feature_names = tfidf.get_feature_names_out()
coefficients = lr_model.coef_[0]

# Pairing terms with weights is only meaningful when both sides have the same
# length; zip() would silently stop at the shorter one and hide a mismatch
# between the rebuilt vectorizer and the loaded model.
if len(feature_names) != len(coefficients):
    raise ValueError(
        f"TF-IDF vocabulary has {len(feature_names)} terms but the model has "
        f"{len(coefficients)} coefficients; the vectorizer does not match the model."
    )

TOP_K = 20  # number of terms listed per direction

# Rank the weights once (ascending) and read both ends of the ranking, instead
# of sorting the full list of (coef, word) tuples twice. With a stable sort,
# exact ties are ordered by feature index.
ranked = np.argsort(coefficients, kind="stable")

# Both lists hold (coefficient, term) tuples: strongest positive weights first
# in `top_positive`, most negative weights first in `top_negative`.
top_positive = [(coefficients[i], feature_names[i]) for i in ranked[::-1][:TOP_K]]
top_negative = [(coefficients[i], feature_names[i]) for i in ranked[:TOP_K]]

print("Top Positive Words:")
for coef, word in top_positive:
    print(word, coef)

print("\nTop Negative Words:")
for coef, word in top_negative:
    print(word, coef)


Top Positive Words:
great 7.856554442887639
excellent 7.262269055002063
perfect 5.928969492552718
amazing 5.244440345851048
wonderful 4.9959918861562596
best 4.884512727963093
loved 4.699865433251272
hilarious 4.575713778247438
favorite 4.482138939731196
one best 4.4634617771016565
fun 4.333791614834957
enjoyed 4.293653008955886
brilliant 4.287512046917063
today 4.175665564026532
superb 4.099991009817821
definitely 3.87621370770862
fantastic 3.833289483818374
enjoyable 3.822798858881254
still 3.7969335589241564
love 3.7132761630322535

Top Negative Words:
worst -9.86758608197379
bad -8.488353908765859
awful -8.156611603381226
waste -7.3055838567088465
boring -6.753378415534074
poor -6.232080966387384
terrible -6.1283433568387355
nothing -5.926447719995649
dull -5.359802390686741
worse -5.25565781146149
horrible -5.121961354483391
poorly -5.106343039364744
stupid -4.887765409339379
fails -4.838007229860551
annoying -4.533221379988914
disappointing -4.488868210875732
supposed -4.47356814