This notebook uses logistic regression on TF-IDF features for sentiment analysis. The approach struggles with words that were not previously seen in the dataset; this can be greatly improved by adding many more new and unique reviews.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [8]:
# Toy dataset: each entry pairs a review with its sentiment label
# (1 = Positive, 0 = Negative), so text and label stay side by side.
labelled_reviews = [
    ("I love this movie", 1),
    ("I hate this movie", 0),
    ("Absolutely amazing", 1),
    ("Tom Cruise is a fantastic actor", 1),
    ("This movie is great", 1),
    ("Terrible acting and boring plot", 0),
    ("An absolute masterpiece!", 1),
    ("Waste of time", 0),
    ("Brilliant and heartwarming", 1),
    ("Boring, predictable, slow", 0),
    ("A rare movie where the sequel is actually better than the prequel", 1),
    ("Fantastic visuals and storytelling", 1),
    ("Not good, not bad, just average", 0),
]

# Column-oriented view of the same pairs: one list of texts, one list of labels.
data = {
    'review': [text for text, _ in labelled_reviews],
    'label': [sentiment for _, sentiment in labelled_reviews],
}

# Tabular form of the data, one row per review.
df = pd.DataFrame(data)


# Split Data
# Hold out 20% of the reviews for testing. With only 13 rows this is not an
# exact 80/20 split: it works out to 10 training and 3 test reviews.
# random_state is fixed so the same rows are selected on every run.
texts, targets = df['review'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts, targets, test_size=0.2, random_state=42
)

In [9]:
# ------------------ TF-IDF Vectorization ------------------
# Turn the raw review text into numeric feature vectors (default settings).
vectorizer = TfidfVectorizer()

# fit_transform is applied to the training reviews only. It:
#   - learns the vocabulary (all unique words) from the training data
#   - computes the IDF weight for each word and stores it internally for later use
#   - calculates the TF-IDF vector for each review in the training set,
#     i.e. converts each review into a numeric vector based on word importance
X_train_tfidf = vectorizer.fit_transform(X_train)


# transform does not re-fit: it reuses the vocabulary and IDF weights learned
# above, so the test reviews are mapped into the same feature space.
#   - calculates the TF-IDF vector for each review in the testing set
#   - words that were not in the training vocabulary contribute no feature
X_test_tfidf = vectorizer.transform(X_test)


In [10]:

# Train Logistic Regression Model
# The model learns one weight (θ) per TF-IDF feature plus a bias term (θ0):
#   z = θ0 + θ1*x1 + θ2*x2 + ... + θn*xn
# The sigmoid of z is the estimated probability of the positive class (label 1);
# predict() then reports 1 when that probability is above 0.5 and 0 otherwise.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train_tfidf, y_train)

In [11]:
# Evaluate Model
# Predict labels for the held-out reviews and compare them with the true labels.
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0: with a test set this small the model may never predict one
# of the classes, which leaves that class's precision undefined. Reporting it
# as 0 explicitly keeps the same numbers in the table while avoiding the
# repeated UndefinedMetricWarning messages the default setting emits.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Predict New Reviews
def predict_sentiment(review):
    """Classify a single review string as "Positive" or "Negative".

    The review is vectorized with the notebook-level ``vectorizer`` and scored
    by the notebook-level ``model``; both must already be fitted.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        "Positive" when the predicted label equals 1, otherwise "Negative".
    """
    # transform expects a collection of documents, hence the one-element list.
    features = vectorizer.transform([review])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"

# Try the classifier on a review that is not part of the dataset.
new_review = "This movie was absolutely amazing and touching"
predicted_sentiment = predict_sentiment(new_review)
print("New Review:", new_review)
print("Predicted Sentiment:", predicted_sentiment)

Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

New Review: This movie was absolutely amazing and touching
Predicted Sentiment: Positive


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
