<a href="https://colab.research.google.com/github/divyaanshi1308-web/ML-01/blob/main/ML02_YouTube_Comment_Sentiment_%26_Spam_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================
# 1. Install & Import Libraries
# =============================

!pip install nltk scikit-learn matplotlib seaborn

import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Fetch the NLTK resources used further down, in the same order as before:
# tokenizer data (punkt / punkt_tab), the stop-word list and WordNet.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_nltk_resource)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# =============================
# 2. Load Dataset
# =============================

# Data source: a labelled tweet CSV hosted on GitHub. The URL points at a
# Twitter corpus (columns `tweet` / `label`), not at the "YouTube Spam
# Collection" dataset; the tweets are used here as stand-ins for
# YouTube-style comments.
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"

# This notebook interprets `label` as 0 = negative, 1 = positive.
# NOTE(review): the sample rows printed below (e.g. "i love u ...") all carry
# label 0, so the upstream labels may mean something other than sentiment
# polarity - confirm against the dataset's own documentation.

# Keep only the text and target columns and give them notebook-wide names
# (tweet -> comment, label -> sentiment) in one non-mutating expression.
df = (
    pd.read_csv(url)[['tweet', 'label']]
    .rename(columns={'tweet': 'comment', 'label': 'sentiment'})
)

print("Sample Data:")
print(df.head())

Sample Data:
                                             comment  sentiment
0   @user when a father is dysfunctional and is s...          0
1  @user @user thanks for #lyft credit i can't us...          0
2                                bihday your majesty          0
3  #model   i love u take with u all the time in ...          0
4             factsguide: society now    #motivation          0


In [None]:
# =============================
# 3. Data Preprocessing
# =============================

# Shared helpers read by clean_text: a lemmatizer instance and the English
# stop-word list converted to a set for membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize a raw comment into a space-joined string of lemmatized tokens.

    Steps: lowercase, drop ``http...`` links, drop every character that is
    not ``a-z`` or whitespace, tokenize with NLTK, remove English stop words
    and lemmatize the remaining tokens.

    Args:
        text: The raw comment. Non-string values (``None``, the float ``NaN``
            pandas uses for a missing CSV cell, numbers) are treated as an
            empty comment instead of raising.

    Returns:
        str: The cleaned comment, or ``""`` when nothing usable remains.
    """
    # Missing cells arrive as None / float NaN and have no .lower(); treat
    # them as empty rather than raising AttributeError in the middle of
    # a Series.apply().
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # remove links
    text = re.sub(r'[^a-z\s]', '', text)  # remove special chars
    if not text.strip():
        # Only links, digits, punctuation or emoji: nothing left to tokenize.
        return ""

    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Store the normalized text next to the raw comment.
df['cleaned'] = df['comment'].map(clean_text)

# Show the row labelled 0 before and after cleaning as a quick sanity check.
print("Before:", df.loc[0, 'comment'])
print("After:", df.loc[0, 'cleaned'])

Before:  @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run
After: user father dysfunctional selfish drag kid dysfunction run


In [None]:
# =============================
# 4. Sentiment Analysis Model
# =============================

X = df['cleaned']
y = df['sentiment']   # 0 = Negative, 1 = Positive (as this notebook reads the labels)

# Split the raw text BEFORE vectorizing so the held-out rows never influence
# the vocabulary or IDF weights (fitting on the full corpus leaks test data
# into the features). Stratify because the classes are heavily imbalanced.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorization - fit on the training split only, then reuse for the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Whole corpus under the train-fitted vocabulary; kept so any cell that still
# refers to X_vec keeps working.
X_vec = vectorizer.transform(X)

# Train model (Naive Bayes)
model_sentiment = MultinomialNB()
model_sentiment.fit(X_train, y_train)

# Predictions
y_pred = model_sentiment.predict(X_test)

# NOTE(review): with this class imbalance, accuracy alone is misleading -
# read the per-class recall in the report (class 1 recall was low in the
# previously recorded run).
print("🎯 Sentiment Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

🎯 Sentiment Model Accuracy: 0.9507273580478649

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.93      0.33      0.49       456

    accuracy                           0.95      6393
   macro avg       0.94      0.67      0.73      6393
weighted avg       0.95      0.95      0.94      6393



In [None]:
# =============================
# 5. Spam Detection (Fake Simulation)
# =============================

# Hand-written toy examples for the simulated spam detector.
# NOTE(review): five rows projected onto the tweet vocabulary is only a
# placeholder - the recorded test run flags every comment as spam.
spam_data = {
    "comment": [
        "Subscribe to my channel and win iPhone",
        "Click here to get free money",
        "Worst video ever",
        "I love this tutorial, very helpful!",
        "This is spam comment visit my website now",
    ],
    "spam": [1, 1, 0, 0, 1],  # 1 = Spam, 0 = Not Spam
}

# Build the frame and attach the cleaned text in one step.
spam_df = pd.DataFrame(spam_data).assign(
    cleaned=lambda frame: frame['comment'].map(clean_text)
)

# Reuse the vectorizer already fitted on the sentiment data (transform only,
# no refit) so both models consume the same feature space.
y_spam = spam_df['spam']
X_spam = vectorizer.transform(spam_df['cleaned'])

spam_model = LogisticRegression().fit(X_spam, y_spam)

print("Spam Model Trained on Sample Data ✅")

Spam Model Trained on Sample Data ✅


In [None]:
# =============================
# 6. Prediction Function
# =============================

def predict_comment(comment):
    """Print the predicted sentiment and spam status for one raw comment.

    The comment is cleaned with ``clean_text`` and vectorized once with the
    shared ``vectorizer``; that single feature row is then scored by both
    ``model_sentiment`` and ``spam_model``. Nothing is returned.

    Args:
        comment: The raw comment text to classify.
    """
    features = vectorizer.transform([clean_text(comment)])

    # Sentiment: class 1 is reported as positive, anything else as negative.
    if model_sentiment.predict(features)[0] == 1:
        sentiment = "Positive 😄"
    else:
        sentiment = "Negative 😞"

    # Spam: class 1 is reported as spam, anything else as not spam.
    is_spam = spam_model.predict(features)[0] == 1
    spam_status = "Spam 🚨" if is_spam else "Not Spam ✅"

    print(f"\nComment: {comment}")
    print(f"→ Sentiment: {sentiment}")
    print(f"→ Spam Check: {spam_status}")

In [None]:
# =============================
# 7. Test Predictions
# =============================

# Run the combined predictor over a few hand-picked example comments.
sample_comments = [
    "I really loved this video, awesome work!",
    "This video is waste of time, very boring.",
    "Subscribe to my channel for free gifts!!!",
    "Great explanation, I learned a lot.",
]
for sample_comment in sample_comments:
    predict_comment(sample_comment)


Comment: I really loved this video, awesome work!
→ Sentiment: Negative 😞
→ Spam Check: Spam 🚨

Comment: This video is waste of time, very boring.
→ Sentiment: Negative 😞
→ Spam Check: Spam 🚨

Comment: Subscribe to my channel for free gifts!!!
→ Sentiment: Negative 😞
→ Spam Check: Spam 🚨

Comment: Great explanation, I learned a lot.
→ Sentiment: Negative 😞
→ Spam Check: Spam 🚨
