In [3]:
# ============================================================
# Sentiment Analysis on Movie Reviews
# positive = 1, negative = 0
# BoW + TF-IDF + Logistic Regression (Pipeline)
# ============================================================

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion

# ------------------------------------------------------------
# 1. Movie review texts and labels
# ------------------------------------------------------------

# Each entry pairs a review snippet with its sentiment label
# (1 = positive, 0 = negative).
labelled_reviews = [
    ("One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked.", 1),
    ("Phil the Alien is one of those quirky films where the humour is based around the oddness of everything.", 0),
    ("I thought this was a wonderful way to spend time on a too hot summer weekend.", 1),
    ("Basically there's a family where a little boy thinks there's a zombie in his closet.", 0),
    ("Petter Mattei's Love in the Time of Money is a visually stunning film to watch.", 1),
    ("Encouraged by the positive comments about this film I was looking forward to watching it.", 0),
]

# Split the pairs back into the parallel containers used by the rest of the
# script: a list of raw texts and a NumPy array of integer labels.
texts = [review for review, _ in labelled_reviews]
labels = np.array([sentiment for _, sentiment in labelled_reviews])

# ------------------------------------------------------------
# 2. Train-test split
# ------------------------------------------------------------

# Hold out about a third of the reviews for testing. Stratifying on `labels`
# preserves the class balance in both splits, and the fixed seed makes the
# split reproducible between runs.
split_options = {"test_size": 0.33, "random_state": 42, "stratify": labels}
X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)

# ------------------------------------------------------------
# 3. Pipeline (BoW + TF-IDF + Logistic Regression)
# ------------------------------------------------------------

# Both vectorizers share the same preprocessing: lowercase the text and drop
# scikit-learn's built-in English stop words.
vectorizer_options = {"lowercase": True, "stop_words": "english"}

# Raw term counts ("bow") and TF-IDF weights ("tfidf") are computed side by
# side and concatenated column-wise into one feature matrix.
combined_features = FeatureUnion([
    ("bow", CountVectorizer(**vectorizer_options)),
    ("tfidf", TfidfVectorizer(**vectorizer_options)),
])

classifier = LogisticRegression(max_iter=1000, solver="liblinear")

# Full model: feature extraction followed by the classifier, so fit/predict
# accept raw strings directly.
pipeline = Pipeline([
    ("features", combined_features),
    ("clf", classifier),
])

# ------------------------------------------------------------
# 4. Train model
# ------------------------------------------------------------

# Fit the vectorizers and the classifier on the training split only, so the
# held-out reviews do not influence the learned vocabulary or weights.
pipeline.fit(X=X_train, y=y_train)

# ------------------------------------------------------------
# 5. Evaluation (zero_division=0 suppresses undefined-metric warnings)
# ------------------------------------------------------------

# Pin the label order explicitly so the report rows, `target_names`, and the
# confusion-matrix rows/columns always line up (0 -> negative, 1 -> positive).
# Without `labels=`, scikit-learn infers the label set from y_test/y_pred, so
# a class missing from both would misalign the names and shrink the matrix.
class_ids = [0, 1]
class_names = ["negative", "positive"]

y_pred = pipeline.predict(X_test)

print("\n=== Classification Report ===")
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        # Report 0 (instead of emitting a warning) when a metric is
        # undefined, e.g. precision for a class that was never predicted.
        zero_division=0,
    )
)

# Rows are true classes and columns are predicted classes, both in
# `class_ids` order.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# ------------------------------------------------------------
# 6. Predict on new reviews (IMPORTANT FIX)
# ------------------------------------------------------------

new_reviews = [
    "What an absolutely stunning movie, you won't regret watching it",
    "This movie was boring, slow and a complete waste of time"
]

new_preds = pipeline.predict(new_reviews)
new_probs = pipeline.predict_proba(new_reviews)

# The columns of predict_proba follow the order of `pipeline.classes_`.
# Look the column up by class label rather than using the label value itself
# as an index, which is only correct when the labels happen to be 0..n-1 in
# sorted order.
proba_column = {cls: idx for idx, cls in enumerate(pipeline.classes_)}

for review, label, prob in zip(new_reviews, new_preds, new_probs):
    sentiment = "positive" if label == 1 else "negative"
    # Probability the model assigns to the class it actually predicted.
    confidence = prob[proba_column[label]]
    print("\nReview:", review)
    print(f"Predicted sentiment: {sentiment} (confidence={confidence:.2f})")



=== Classification Report ===
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
    positive       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Confusion Matrix:
[[0 1]
 [1 0]]

Review: What an absolutely stunning movie, you won't regret watching it
Predicted sentiment: positive (confidence=0.54)

Review: This movie was boring, slow and a complete waste of time
Predicted sentiment: positive (confidence=0.59)
