In [1]:
# Install dependencies
# !pip install spacy pandas scikit-learn

import spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from spacy.training.example import Example

In [2]:
# Load your dataset
df = pd.read_csv("kalki_movie_reviews.csv")

In [3]:
# Show columns to verify
print("📋 Columns in file:", df.columns.tolist())

📋 Columns in file: ['Comments', 'Ratings']


In [4]:
# Try to automatically detect text and label columns
possible_text_cols = [c for c in df.columns if "review" in c.lower() or "text" in c.lower() or "comment" in c.lower()]
possible_label_cols = [c for c in df.columns if "sentiment" in c.lower() or "label" in c.lower() or "target" in c.lower() or "rating" in c.lower()]

if not possible_text_cols or not possible_label_cols:
    raise ValueError("❌ Could not detect text or sentiment columns. Please specify manually.")

TEXT_COL = possible_text_cols[0]
LABEL_COL = possible_label_cols[0]

print(f"✅ Using text column: {TEXT_COL}")
print(f"✅ Using label column: {LABEL_COL}")



✅ Using text column: Comments
✅ Using label column: Ratings


In [5]:
# Normalize labels
df[LABEL_COL] = df[LABEL_COL].astype(str).str.lower().str.strip()



In [6]:
# Map labels to spaCy-friendly format
def label_to_dict(label):
    if label in ["positive", "pos", "1", "good", "yes"]:
        return {"POSITIVE": 1.0, "NEGATIVE": 0.0}
    else:
        return {"POSITIVE": 0.0, "NEGATIVE": 1.0}

df["cats"] = df[LABEL_COL].apply(label_to_dict)



In [7]:
# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)



In [8]:
# Create a blank English model
nlp = spacy.blank("en")



In [9]:
# Add text categorizer
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")



1

In [10]:
# Initialize model
optimizer = nlp.initialize()



In [11]:
# Train
for epoch in range(5):  # increase to 10+ for better accuracy
    losses = {}
    for i, row in train_df.iterrows():
        doc = nlp.make_doc(str(row[TEXT_COL]))
        example = Example.from_dict(doc, {"cats": row["cats"]})
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {epoch+1} - Losses: {losses}")



Epoch 1 - Losses: {'textcat': 3.414907012794077}
Epoch 2 - Losses: {'textcat': 3.695010416740704e-09}
Epoch 3 - Losses: {'textcat': 6.317867708963915e-11}
Epoch 4 - Losses: {'textcat': 1.3339134645567429e-12}
Epoch 5 - Losses: {'textcat': 3.133023052287185e-14}


In [12]:
# Evaluate
correct = 0
total = 0
for i, row in test_df.iterrows():
    doc = nlp(str(row[TEXT_COL]))
    pred = "positive" if doc.cats["POSITIVE"] > doc.cats["NEGATIVE"] else "negative"
    true = "positive" if row["cats"]["POSITIVE"] == 1.0 else "negative"
    if pred == true:
        correct += 1
    total += 1

accuracy = correct / total
print(f"\n🎯 Test Accuracy: {accuracy:.2f}")




🎯 Test Accuracy: 1.00


In [13]:
# Test on new examples
examples = [
    "The movie was absolutely fantastic!",
    "It was boring and too long.",
    "Superb acting and great story.",
]
for text in examples:
    doc = nlp(text)
    print(f"\n{text} → {doc.cats}")# Initialize model
optimizer = nlp.initialize()




The movie was absolutely fantastic! → {'POSITIVE': 0.05906382203102112, 'NEGATIVE': 0.9409362077713013}

It was boring and too long. → {'POSITIVE': 3.392460357076743e-08, 'NEGATIVE': 1.0}

Superb acting and great story. → {'POSITIVE': 1.3859840919394628e-07, 'NEGATIVE': 0.9999998807907104}


In [14]:
# Train
for epoch in range(5):  # increase to 10+ for better accuracy
    losses = {}
    for i, row in train_df.iterrows():
        doc = nlp.make_doc(str(row[TEXT_COL]))
        example = Example.from_dict(doc, {"cats": row["cats"]})
        nlp.update([example], sgd=optimizer, losses=losses)
    print(f"Epoch {epoch+1} - Losses: {losses}")


Epoch 1 - Losses: {'textcat': 3.2344277052895682}
Epoch 2 - Losses: {'textcat': 2.7510674283382624e-09}
Epoch 3 - Losses: {'textcat': 5.0309864104032416e-11}
Epoch 4 - Losses: {'textcat': 1.153974144397634e-12}
Epoch 5 - Losses: {'textcat': 2.7087791463342175e-14}


In [15]:

# Evaluate
correct = 0
total = 0
for i, row in test_df.iterrows():
    doc = nlp(str(row[TEXT_COL]))
    pred = "positive" if doc.cats["POSITIVE"] > doc.cats["NEGATIVE"] else "negative"
    true = "positive" if row["cats"]["POSITIVE"] == 1.0 else "negative"
    if pred == true:
        correct += 1
    total += 1

accuracy = correct / total
print(f"\n🎯 Test Accuracy: {accuracy:.2f}")




🎯 Test Accuracy: 1.00


In [16]:
# Test on new examples
examples = [
    "The movie was absolutely fantastic!",
    "It was boring and too long.",
    "Superb acting and great story.",
]
for text in examples:
    doc = nlp(text)
    print(f"\n{text} → {doc.cats}")


The movie was absolutely fantastic! → {'POSITIVE': 0.0006541184266097844, 'NEGATIVE': 0.9993458390235901}

It was boring and too long. → {'POSITIVE': 3.0536415351889445e-07, 'NEGATIVE': 0.9999996423721313}

Superb acting and great story. → {'POSITIVE': 1.233375041920226e-05, 'NEGATIVE': 0.9999877214431763}
