In [None]:
from google.colab import files

# Ask the user to pick file(s) in the browser; the result maps each saved
# file name to that file's uploaded content.
uploaded = files.upload()

# Echo every uploaded file together with its size so the user can confirm it.
for fn, content in uploaded.items():
  size_in_bytes = len(content)
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=size_in_bytes))

Saving cyberbullying_tweets(ML).csv to cyberbullying_tweets(ML) (1).csv
User uploaded file "cyberbullying_tweets(ML) (1).csv" with length 7174545 bytes


In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from the
# mounted drive (the dataset is loaded from /content directly) — confirm
# whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# STEP 1: Import Libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# STEP 2: Load the Dataset
# Read the CSV, then discard rows containing missing values and rows that are
# exact duplicates of an earlier row.
df = (
    pd.read_csv("/content/cyberbullying_tweets(ML).csv")  # Adjust path if needed
    .dropna()
    .drop_duplicates()
)

# STEP 3: Preprocessing Function
# Shared, module-level helpers used by preprocess_text():
#   stemmer    - a single Porter stemmer instance reused for every token
#   stop_words - English stop words held in a set for fast membership tests
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of stemmed tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and '#' signs
    (the hashtag word itself is kept), drop every character that is not an
    ASCII letter or whitespace, split on whitespace, remove stop words, and
    stem what remains.

    Relies on the module-level ``stop_words`` collection and ``stemmer``
    object (anything exposing ``.stem(word)``).

    Args:
        text: Raw tweet text. Non-string values are coerced with ``str()``.

    Returns:
        The cleaned tweet as a single string; empty if nothing survives.
    """
    text = str(text).lower()
    # "http\S+" already covers "https...", so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove whole @mentions. The previous pattern r'\@w+' lacked the backslash
    # before "w", so it only matched a literal "@w"/"@ww..." and usernames
    # leaked into the token stream once the "@" was stripped below.
    text = re.sub(r'@\w+|#', '', text)
    # Keep only ASCII letters and whitespace; this also removes all
    # punctuation and digits, so no separate punctuation pass is required.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Clean every tweet once up front. Cleaning works row by row, so doing it
# before the split does not share information between train and test rows.
df['cleaned_tweet'] = df['tweet_text'].apply(preprocess_text)

# STEP 4: Feature and Label Selection
X = df['cleaned_tweet']
y = df['cyberbullying_type']

# STEP 5: Train/Test Split
# Split the cleaned text BEFORE fitting TF-IDF. Fitting the vectorizer on the
# full corpus would let test-set vocabulary and document frequencies influence
# the training features (data leakage) and inflate the reported scores.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# STEP 6: TF-IDF Vectorization (reduced feature size to speed up)
# Learn the vocabulary/IDF weights from the training split only, then apply
# the same fitted transform to the held-out split.
tfidf = TfidfVectorizer(max_features=1000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Full-corpus matrix kept under its original name in case later cells use it;
# it is produced with the train-fitted vectorizer and is not used for scoring.
X_vectorized = tfidf.transform(X)

# STEP 7: Models to Train
# Every estimator with a stochastic component gets a fixed random_state so
# that re-running the notebook reproduces the same scores (MultinomialNB is
# deterministic and takes no seed).
models = {
    "Naive Bayes": MultinomialNB(),
    # saga is a stochastic solver; warnings are silenced globally in this
    # notebook, so a failure to converge would go unnoticed. Allow more
    # iterations than the previous max_iter=100 to make convergence likely.
    "Logistic Regression": LogisticRegression(
        solver='saga', max_iter=1000, n_jobs=-1, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# STEP 8: Train and Evaluate
# Fit each candidate on the training split, predict the held-out split, and
# print accuracy, the confusion matrix and the per-class report.
for name, model in models.items():
    print(f"\n🔹 Model: {name}")

    # fit() returns the fitted estimator, so prediction can be chained on it.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", conf_mat)
    print("Classification Report:\n", report)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



🔹 Model: Naive Bayes
Accuracy: 0.7378304657994125
Confusion Matrix:
 [[1503    6    6   51   30    6]
 [  28 1469    9   19   65   46]
 [  22   32 1185  130  100   45]
 [ 233   36   95  628  389  243]
 [ 173  106  117  208  773  217]
 [   7    9   18   26   27 1475]]
Classification Report:
                      precision    recall  f1-score   support

                age       0.76      0.94      0.84      1602
          ethnicity       0.89      0.90      0.89      1636
             gender       0.83      0.78      0.81      1514
  not_cyberbullying       0.59      0.39      0.47      1624
other_cyberbullying       0.56      0.48      0.52      1594
           religion       0.73      0.94      0.82      1562

           accuracy                           0.74      9532
          macro avg       0.73      0.74      0.72      9532
       weighted avg       0.73      0.74      0.72      9532


🔹 Model: Logistic Regression
Accuracy: 0.8175618967687789
Confusion Matrix:
 [[1549    4    3