In [1]:
pip install kaggle



In [2]:
# STEP 1 — Install Kaggle API
!pip install kaggle --quiet

# STEP 2 — Upload kaggle.json (API key from Kaggle)
from google.colab import files
files.upload()  # Select kaggle.json here

# STEP 3 — Move kaggle.json to the correct location
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# STEP 4 — Download Sentiment140 dataset from Kaggle into Colab
!kaggle datasets download -d kazanova/sentiment140

# STEP 5 — Unzip in Colab
!unzip -q sentiment140.zip


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.21GB/s]


In [3]:
import pandas as pd

df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    encoding="latin-1",
    names=["target", "id", "date", "flag", "user", "text"]
)

df.head()


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Convert target labels: 0 → negative, 4 → positive
df['target'] = df['target'].map({0: 'negative', 4: 'positive'})

In [5]:
# Cleaning function
def clean_tweet(text):
    text = text.lower()  # lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)  # remove @mentions
    text = re.sub(r'#', '', text)  # remove hashtag symbol
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'\s+', ' ', text)  # remove extra whitespace
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)  # reduce repeated characters
    return text.strip()

In [6]:
import re
import string

In [7]:
# Apply cleaning
df['clean_text'] = df['text'].apply(clean_tweet)

In [8]:
# Drop duplicates & empty rows
df = df.drop_duplicates(subset='clean_text')
df = df[df['clean_text'].str.strip() != ""]

# Show sample
df.head(10)

Unnamed: 0,target,id,date,flag,user,text,clean_text
0,negative,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",a thats a bummer you shoulda got david carr of...
1,negative,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...
2,negative,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...
3,negative,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,negative,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...
5,negative,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew,not the whole crew
6,negative,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug,need a hug
7,negative,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...,hey long time no see yes rains a bit only a bi...
8,negative,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it,nope they didnt have it
9,negative,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?,que me muera


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [10]:
print("DataFrame shape:", df.shape)
print("\nExample rows:")
display(df[['clean_text','target']].sample(5, random_state=42))

DataFrame shape: (1518008, 7)

Example rows:


Unnamed: 0,clean_text,target
872269,i loved sleeping in finally the weekend,positive
273359,heading into town to meet ryan and brian need ...,negative
950277,long long dayglad to be home,positive
363681,this prop thing has so much people in a bad mo...,negative
1318279,eating jollibees mango ice caze with ice cream...,positive


In [11]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['target'])   # negative -> 0, positive -> 1 (we'll print mapping)
label_mapping = {cls: int(code) for cls, code in zip(le.classes_, le.transform(le.classes_))}
print("\nLabel mapping:", label_mapping)


Label mapping: {'negative': 0, 'positive': 1}


In [12]:
# --- Check class distribution (important for choosing stratified splits) ---
counts = df['label'].value_counts()
percent = df['label'].value_counts(normalize=True) * 100
print("\nClass counts:\n", counts)
print("\nClass distribution (%):\n", percent.round(2))


Class counts:
 label
0    765819
1    752189
Name: count, dtype: int64

Class distribution (%):
 label
0    50.45
1    49.55
Name: proportion, dtype: float64


In [13]:
# --- Small sanity: drop any empty texts if present (defensive) ---
df = df[df['clean_text'].str.strip().astype(bool)].reset_index(drop=True)
print("\nAfter dropping empty cleaned tweets:", df.shape)


After dropping empty cleaned tweets: (1518008, 8)


In [17]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [18]:
# 1️⃣ Take a sample for faster execution
df_sample = df.sample(n=200_000, random_state=42).reset_index(drop=True)
X_text = df_sample['clean_text']
y = df_sample['label']

In [19]:
# 2️⃣ Define pipelines with optimized TF-IDF and fast SVM
pipelines = {
    "LogisticRegression": Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
        ('clf', LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, random_state=42))
    ]),
    "DecisionTree": Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
        ('clf', DecisionTreeClassifier(random_state=42))
    ]),
    "SVM":Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
    ('clf', CalibratedClassifierCV(
        estimator=LinearSVC(max_iter=2000, random_state=42),
        cv=3
    ))
])
}

In [20]:
# 3️⃣ Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [21]:
# 4️⃣ Evaluate models
results = {name: {"accuracy": [], "precision": [], "recall": [], "f1": [], "roc_auc": []}
           for name in pipelines}

for fold, (train_idx, val_idx) in enumerate(skf.split(X_text, y), 1):
    X_train, X_val = X_text.iloc[train_idx], X_text.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    print(f"\n--- Fold {fold} ---")
    for name, pipe in pipelines.items():
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_val)
        y_prob = pipe.predict_proba(X_val)[:,1]

        acc = accuracy_score(y_val, y_pred)
        prec = precision_score(y_val, y_pred)
        rec = recall_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        roc = roc_auc_score(y_val, y_prob)

        results[name]["accuracy"].append(acc)
        results[name]["precision"].append(prec)
        results[name]["recall"].append(rec)
        results[name]["f1"].append(f1)
        results[name]["roc_auc"].append(roc)

        print(f"{name} → acc={acc:.4f}, prec={prec:.4f}, rec={rec:.4f}, f1={f1:.4f}, roc_auc={roc:.4f}")



--- Fold 1 ---
LogisticRegression → acc=0.7931, prec=0.7883, rec=0.7984, f1=0.7933, roc_auc=0.8722
DecisionTree → acc=0.6895, prec=0.6864, rec=0.6919, f1=0.6891, roc_auc=0.6898
SVM → acc=0.7908, prec=0.7844, rec=0.7991, f1=0.7917, roc_auc=0.8696

--- Fold 2 ---
LogisticRegression → acc=0.7965, prec=0.7903, rec=0.8043, f1=0.7972, roc_auc=0.8757
DecisionTree → acc=0.6966, prec=0.6940, rec=0.6975, f1=0.6958, roc_auc=0.6972
SVM → acc=0.7929, prec=0.7848, rec=0.8041, f1=0.7943, roc_auc=0.8725

--- Fold 3 ---
LogisticRegression → acc=0.7912, prec=0.7835, rec=0.8017, f1=0.7925, roc_auc=0.8723
DecisionTree → acc=0.6945, prec=0.6913, rec=0.6971, f1=0.6942, roc_auc=0.6949
SVM → acc=0.7890, prec=0.7791, rec=0.8036, f1=0.7912, roc_auc=0.8696

--- Fold 4 ---
LogisticRegression → acc=0.7920, prec=0.7871, rec=0.7977, f1=0.7924, roc_auc=0.8718
DecisionTree → acc=0.6913, prec=0.6894, rec=0.6905, f1=0.6899, roc_auc=0.6916
SVM → acc=0.7870, prec=0.7802, rec=0.7962, f1=0.7881, roc_auc=0.8680

--- Fold 5 

In [22]:
# 5️⃣ Average results
summary = []
for name, metrics in results.items():
    summary.append({
        "model": name,
        "accuracy_mean": np.mean(metrics["accuracy"]),
        "precision_mean": np.mean(metrics["precision"]),
        "recall_mean": np.mean(metrics["recall"]),
        "f1_mean": np.mean(metrics["f1"]),
        "roc_auc_mean": np.mean(metrics["roc_auc"])
    })

summary_df = pd.DataFrame(summary)
print("\n=== Average Performance Across 5 Folds (Sampled Dataset) ===")
print(summary_df)


=== Average Performance Across 5 Folds (Sampled Dataset) ===
                model  accuracy_mean  precision_mean  recall_mean   f1_mean  \
0  LogisticRegression       0.793425        0.787394     0.801025  0.794148   
1        DecisionTree       0.692970        0.690781     0.693024  0.691896   
2                 SVM       0.790005        0.782308     0.800663  0.791376   

   roc_auc_mean  
0      0.872869  
1      0.693340  
2      0.869727  


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd

In [32]:
# Sampled dataset
X_text = df_sample['clean_text']
y = df_sample['label']

In [33]:
# 1️⃣ Fit TF-IDF once per fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {"accuracy": [], "precision": [], "recall": [], "f1": []}

In [34]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X_text, y), 1):
    X_train_text, X_val_text = X_text.iloc[train_idx], X_text.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit TF-IDF once
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
    X_train = tfidf.fit_transform(X_train_text)
    X_val = tfidf.transform(X_val_text)

    # Define models
    lr = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, random_state=42)
    svm = LinearSVC(max_iter=2000, random_state=42)

    # Voting ensemble (hard voting)
    voting_clf = VotingClassifier(
        estimators=[('lr', lr), ('svm', svm)],
        voting='hard'
    )

    voting_clf.fit(X_train, y_train)
    y_pred = voting_clf.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    results["accuracy"].append(acc)
    results["precision"].append(prec)
    results["recall"].append(rec)
    results["f1"].append(f1)

    print(f"--- Fold {fold} ---")
    print(f"Voting → acc={acc:.4f}, prec={prec:.4f}, rec={rec:.4f}, f1={f1:.4f}")


--- Fold 1 ---
Voting → acc=0.7869, prec=0.7879, rec=0.7821, f1=0.7850
--- Fold 2 ---
Voting → acc=0.7886, prec=0.7870, rec=0.7884, f1=0.7877
--- Fold 3 ---
Voting → acc=0.7863, prec=0.7840, rec=0.7873, f1=0.7856
--- Fold 4 ---
Voting → acc=0.7863, prec=0.7873, rec=0.7815, f1=0.7844
--- Fold 5 ---
Voting → acc=0.7884, prec=0.7868, rec=0.7883, f1=0.7875


In [35]:
# Average metrics
summary_df = pd.DataFrame({
    "metric": results.keys(),
    "mean": [np.mean(v) for v in results.values()]
})
print("\n=== Average Performance Across 5 Folds ===")
print(summary_df)


=== Average Performance Across 5 Folds ===
      metric      mean
0   accuracy  0.787285
1  precision  0.786588
2     recall  0.785516
3         f1  0.786047


In [36]:
import joblib

# Example
joblib.dump(voting_clf, "voting_sentiment_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [37]:
def predict_sentiment(tweet, model, vectorizer):
    X = vectorizer.transform([tweet])
    pred = model.predict(X)[0]
    return "positive" if pred == 1 else "negative"

# Example usage
tweet = "I love this new phone!"
predict_sentiment(tweet, voting_clf, tfidf)


'positive'

In [39]:
# 1️⃣ Train TF-IDF on full dataset
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_full = tfidf.fit_transform(df_sample['clean_text'])
y_full = df_sample['label']

In [40]:
# 2️⃣ Train ensemble on full data
lr = LogisticRegression(max_iter=2000, solver='saga', n_jobs=-1, random_state=42)
svm = LinearSVC(max_iter=2000, random_state=42)
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('svm', svm)],
    voting='hard'
)
voting_clf.fit(X_full, y_full)

In [41]:
# 3️⃣ Save model and TF-IDF
import joblib
joblib.dump(voting_clf, "voting_sentiment_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [46]:
# ===== 6️⃣ Ready-to-use prediction function =====
def predict_sentiment(tweet, model=voting_clf, vectorizer=tfidf):
    """
    Input: single tweet (string)
    Output: 'positive' or 'negative'
    """
    X = vectorizer.transform([tweet])
    pred = model.predict(X)[0]
    return "positive" if pred == 1 else "negative"

# ===== 7️⃣ Example usage =====
example_tweet = "i will beat you!"
print("Tweet:", example_tweet)
print("Predicted sentiment:", predict_sentiment(example_tweet))

Tweet: i will beat you!
Predicted sentiment: positive
