<a href="https://colab.research.google.com/github/gregthemitch/cyberbullying_detection_demo_models/blob/main/Cyberbullying_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download Kaggle Dataset

In [None]:
# Fetch only the tweets CSV from the Kaggle dataset into ./project/data
# (the Kaggle CLI delivers it as cyberbullying_tweets.csv.zip).
!kaggle datasets download -d andrewmvd/cyberbullying-classification -f cyberbullying_tweets.csv -p ./project/data
# Unzip csv from zip file.
# -o overwrites an existing csv without asking, so re-running this cell (or Run All)
# never stalls on unzip's interactive "replace?" prompt; -j drops archive directories.
!unzip -o -j "./project/data/cyberbullying_tweets.csv.zip" "*.csv" -d "./project/data"
# Remove zip file
!rm ./project/data/cyberbullying_tweets.csv.zip

Dataset URL: https://www.kaggle.com/datasets/andrewmvd/cyberbullying-classification
License(s): Attribution 4.0 International (CC BY 4.0)
Downloading cyberbullying_tweets.csv.zip to ./project/data
  0% 0.00/2.82M [00:00<?, ?B/s]
100% 2.82M/2.82M [00:00<00:00, 229MB/s]
Archive:  ./project/data/cyberbullying_tweets.csv.zip
replace ./project/data/cyberbullying_tweets.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./project/data/cyberbullying_tweets.csv  


Data Processing and Feature Selection

In [None]:
import numpy as np
import pandas as pd

import nltk
import gensim
import gensim.downloader

In [None]:
# Read the tweets CSV extracted by the download cell above into a DataFrame.
data = pd.read_csv("project/data/cyberbullying_tweets.csv")

In [None]:
# Load in GloVe vectors for twitter
# (fetched through gensim's downloader; later handed to SentenceEmbedder and preprocess)
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [None]:
# Download stopwords from nltk
# (needed by the nltk.corpus.stopwords.words('english') calls in CleanText and preprocess)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import sklearn
import scipy
import gensim
import nltk
import sys
import joblib

# Report the interpreter and library versions this notebook was executed with.
print("Python version: ", sys.version)

# One (label, module) pair per library; printed in the same order as before.
for _label, _module in (
    ("numpy version: ", np),
    ("pandas version: ", pd),
    ("nltk version: ", nltk),
    ("sklearn version: ", sklearn),
    ("scipy version: ", scipy),
    ("gensim version: ", gensim),
    ("joblib version: ", joblib),
):
    print(_label, _module.__version__)


Python version:  3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
numpy version:  1.26.4
pandas version:  2.2.2
nltk version:  3.9.1
sklearn version:  1.6.0
scipy version:  1.13.1
gensim version:  4.3.3
joblib version:  1.4.2


Build pipeline

In [None]:
import string
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from scipy import sparse
from scipy.sparse.linalg import svds
from sklearn.svm import SVC
from sklearn.metrics import hinge_loss, classification_report, confusion_matrix
from scipy.spatial.distance import cdist


# Custom transformer to clean data
class CleanText(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that normalizes raw tweet strings.

    Each string is lower-cased, stripped of ``@`` mentions and punctuation,
    tokenized on ``\\w+`` and re-joined with single spaces after English stop
    words are dropped. ``fit`` learns nothing; ``transform`` returns a 1-D
    object array with one cleaned string per input string, in input order.
    """

    # Compiled/built once for the class instead of once per tweet.
    # \w is for words
    _WORD = re.compile(r'\w+')
    # NOTE(review): only lowercase letters after "@" are matched, so the tail of
    # handles containing digits or underscores survives this step. The
    # `preprocess` filter uses the same pattern; change both together or neither.
    _USERNAME = re.compile(r"@[a-z]+")
    _STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

    def __init__(self):
        pass

    def _clean_text(self, text, stop_words=None):
      """
      Function to clean an individual string, removing stop words and punctuation.

      Args:
        text (str): The raw string.
        stop_words (set | None): Words to drop. When omitted, the English
          stop words are read from nltk (slow if done for every string, which
          is why `_clean_data` loads them once and passes them in).

      Returns (str): The cleaned string (may be empty).
      """
      if stop_words is None:
        # Get relevant english stop words from nltk
        stop_words = set(nltk.corpus.stopwords.words('english'))

      # Remove usernames (strings following @)
      clean_text = self._USERNAME.sub("", text.lower())
      # Remove punctuation (characters are deleted, not replaced by a space)
      clean_text = clean_text.translate(
          self._STRIP_PUNCTUATION
      ).replace("“", "").replace("”", "")

      words = self._WORD.findall(clean_text)
      return " ".join(word for word in words if word not in stop_words)

    def _clean_data(self, X):
      """
      Takes in an array of strings and cleans each string.

      Returns (np.array): A 1-D object array of cleaned strings.
      """
      # Load the stop words once for the whole batch.
      stop_words = set(nltk.corpus.stopwords.words('english'))

      # Collect in a list and convert once; growing an array with np.append
      # copies the whole array on every iteration.
      cleaned = [self._clean_text(sentence, stop_words) for sentence in X]

      return np.array(cleaned, dtype=object)

    def fit(self, X, y=None):
        """No-op: cleaning has no learned state. Returns self."""
        return self

    def transform(self, X):
        """Return the cleaned version of every string in X."""
        return self._clean_data(X)

# Transformer to numerically embed sentences
class SentenceEmbedder(BaseEstimator, TransformerMixin):
    """
    Turns cleaned sentences into feature rows made of two parts placed side
    by side: the mean word vector of the sentence, followed by its tf-idf
    representation (vocabulary learned in `fit`).
    """

    def __init__(self, word_embedding_model: gensim.models.KeyedVectors):
      self.word_embedding_model = word_embedding_model

    def _word_embedding(self, X):
      """
      Embed every sentence of X as the mean of its words' vectors.

      Words missing from the embedding model are ignored when averaging.

      Returns (np.array): A 2D array with one row per sentence.

      Raises:
        ValueError: if a sentence is empty, or if its mean vector has no
          non-zero entry.
      """
      rows = []
      for sentence in X:
          # Guard first: an empty sentence would otherwise drop a row and
          # desynchronize X from its labels.
          if not sentence:
              raise ValueError(
                  "Sentence is empty. X and Y will have mismatched dimensions"
              )

          tokens = sentence.split(" ")
          mean_vec = self.word_embedding_model.get_mean_vector(
              tokens, ignore_missing=True)
          # Shape (1, d) so the rows can be stacked along axis 0 below.
          mean_vec = mean_vec.reshape(1, -1)

          if not mean_vec.any():
              raise ValueError(
                  f"Could not create sentence embedding for sentence: {sentence}"
              )

          rows.append(mean_vec)

      return np.concatenate(rows, axis=0)

    def fit(self, X, y=None):
      """Learn the tf-idf vocabulary from X. Returns self."""
      self.tfidf = TfidfVectorizer().fit(X)
      return self

    def transform(self, X):
      """
      Returns a sparse matrix whose leading columns are the mean word vectors
      and whose remaining columns are the tf-idf features.
      """
      tfidf_part = self.tfidf.transform(X)
      dense_part = self._word_embedding(X)

      blocks = [sparse.csr_array(dense_part), tfidf_part]
      return sparse.hstack(blocks)

# Transformer for truncated SVD
# This is a holdover from the class project's manual implementation
class TruncatedSVD(BaseEstimator, TransformerMixin):
  """
  Dimensionality reduction by projecting onto the right singular vectors
  returned by scipy's sparse `svds`.
  """

  def __init__(self, n_components):
    self.n_components = n_components
    # Right singular vectors; filled in by fit().
    self.Vt = None

  def fit(self, X, y=None):
    """
    Compute `n_components` singular triplets of X and keep the right
    singular vectors for later projection. Returns self.
    """
    n_features = X.shape[1]
    assert self.n_components <= n_features, \
      "Number of components is greater than data's number of features"

    # Only the third element of svds' (U, S, Vt) result is needed.
    _, _, right_vectors = svds(X, k=self.n_components)
    self.Vt = right_vectors

    return self

  def transform(self, X):
    """Project X onto the stored right singular vectors."""
    projection = self.Vt.T
    return X @ projection


In [None]:
def report_leakage(X_train, X_test):
  """
  Find values that occur in both the training and the testing data.

  Args:
    X_train: Iterable of hashable rows (e.g. tweet strings).
    X_test: Iterable of hashable rows.

  Returns (dict): Maps each leaked row to ``[i, j]``, where ``i`` is the last
    position of that row in X_train and ``j`` its last position in X_test.
    Keys appear in the order the rows are first met in X_train.
  """
  # Index the test rows once so each training row is a single dict lookup
  # instead of a scan over the whole test set. Later duplicates overwrite
  # earlier ones, which keeps the "last position" semantics.
  last_test_position = {}
  for j, test_row in enumerate(X_test):
    last_test_position[test_row] = j

  leaks = {}
  for i, row in enumerate(X_train):
    if row in last_test_position:
      leaks[row] = [i, last_test_position[row]]

  return leaks

# NOTE(review): X_train and X_test are not assigned in any cell visible above this
# one, so this call presumably relies on variables left over from an earlier kernel
# session — confirm and define them explicitly before this cell.
leaks = report_leakage(X_train, X_test)
print(len(leaks))

In [None]:
def bullying_category(df):
    """
    Build one binary dataset per cyberbullying category.

    For every category, the rows labelled either 'not_cyberbullying' or that
    category are kept, and an "encoded_y" column is added: 1 for the category,
    -1 for 'not_cyberbullying'. Each dataset gets a fresh 0..n-1 index.

    Returns (dict): category name -> DataFrame.
    """
    categories = (
        'other_cyberbullying',
        'religion',
        'gender',
        'ethnicity',
        'age',
    )

    datasets = {}
    for category in categories:
        keep_rows = df["cyberbullying_type"].isin(['not_cyberbullying', category])
        subset = df[keep_rows].copy()

        # Boolean -> {0, 1} -> {-1, 1}
        is_category = subset["cyberbullying_type"] == category
        subset["encoded_y"] = is_category.astype(int) * 2 - 1

        datasets[category] = subset.reset_index(drop=True)

    return datasets


def preprocess(df, embedding_model):
  """
  Drop tweets that would produce no usable tokens for the pipeline.

  Each tweet in df["tweet_text"] is cleaned the same way CleanText does
  (lower-case, strip "@" mentions and punctuation, tokenize on \\w+). A row is
  kept only if at least one token is neither an English stop word nor missing
  from `embedding_model`.

  Args:
    df (pd.DataFrame): Must contain a "tweet_text" column of strings.
    embedding_model: Any object supporting ``word in embedding_model``.

  Returns (pd.DataFrame): The kept rows, in original order, re-indexed 0..n-1.
  """
  # Loop-invariant setup, done once instead of once per tweet.
  # Get relevant english stop words from nltk
  stop_words = set(nltk.corpus.stopwords.words('english'))
  # \w is for words
  word_re = re.compile(r'\w+')
  username_re = re.compile(r"@[a-z]+")
  strip_punctuation = str.maketrans('', '', string.punctuation)

  keep_positions = []
  for position, text in enumerate(df["tweet_text"]):
    # Remove usernames (strings following @)
    clean_text = username_re.sub("", text.lower())
    # Remove punctuation
    clean_text = clean_text.translate(
        strip_punctuation
    ).replace("“", "").replace("”", "")

    words = word_re.findall(clean_text)
    if any((word not in stop_words) and (word in embedding_model) for word in words):
      keep_positions.append(position)

  # enumerate() yields positions, not index labels, so select with iloc;
  # loc only gave the right rows when df happened to have a 0..n-1 index.
  return df.iloc[keep_positions, :].reset_index(drop=True)


def split_data(df, train_size, seed=None):
  """
  Randomly split df into a training and a testing frame.

  Args:
    df (pd.DataFrame): Data to split.
    train_size (float): Fraction of rows for the training frame; the row
      count is truncated with int().
    seed (int | None): When given, the shuffle uses its own generator seeded
      with this value, making the split reproducible. When None (default),
      NumPy's global RNG is used, as before.

  Returns (tuple): (train_df, test_df). Rows keep their original index labels.
  """
  positions = np.arange(df.shape[0])
  if seed is None:
    np.random.shuffle(positions)
  else:
    np.random.default_rng(seed).shuffle(positions)

  n_train = int(train_size * df.shape[0])

  # `positions` are row positions, so select with iloc; loc would treat them
  # as index labels and fail or pick wrong rows on a non-0..n-1 index.
  return df.iloc[positions[:n_train]], df.iloc[positions[n_train:]]


def get_data(df, train_size, embedding_model):
  """
  Produce a train/test split for every cyberbullying category.

  The frame is divided per category with `bullying_category`, each category
  frame is filtered with `preprocess`, then split with `split_data`.

  Args:
    df (pd.DataFrame): The full tweets frame.
    train_size (float): Fraction of each category's rows used for training.
    embedding_model: Forwarded to `preprocess`.

  Returns (dict): category name -> (train_df, test_df).
  """
  # Use the frame that was passed in (previously the global `data` was read
  # here, silently ignoring the `df` argument).
  data_by_cat = bullying_category(df)

  return {
    key: split_data(preprocess(value, embedding_model), train_size)
    for key, value in data_by_cat.items()
  }


In [None]:
from joblib import dump
from google.colab import files


def prepare_data_for_pipeline(data_dictionary, category):
  """
  Pull the model inputs and labels for one category out of the split data.

  `data_dictionary[category]` is indexed as [0] for the training frame and [1]
  for the testing frame; both need "tweet_text" and "encoded_y" columns.

  Returns (tuple): (X_train, X_test, y_train, y_test) as NumPy arrays.
  """
  train_frame = data_dictionary[category][0]
  test_frame = data_dictionary[category][1]

  # Column-major order yields X_train, X_test, y_train, y_test.
  columns = [
    frame[column]
    for column in ("tweet_text", "encoded_y")
    for frame in (train_frame, test_frame)
  ]

  return tuple(column.to_numpy() for column in columns)


def save_models(data_dictionary, embedding_model=None, n_components=500):
  """
  Fit one pipeline per cyberbullying category and write it to disk.

  Each pipeline is CleanText -> SentenceEmbedder -> TruncatedSVD ->
  SVC(kernel='poly'), trained on the category's training split only, and
  saved with joblib as "<category>.z" in the current working directory.
  Nothing is downloaded; the Colab download call below is left disabled.

  Args:
    data_dictionary (dict): category -> (train_df, test_df), as produced by
      `get_data`.
    embedding_model: Word vectors for SentenceEmbedder. Defaults to the
      notebook-level `glove_vectors`.
    n_components (int): Number of SVD components to keep (default 500).
  """
  if embedding_model is None:
    embedding_model = glove_vectors

  for category in data_dictionary:
    # Only the training split is needed for fitting.
    X_train, _, y_train, _ = prepare_data_for_pipeline(data_dictionary, category)

    pipe = Pipeline([
      ('cleaning', CleanText()),
      ('embedding', SentenceEmbedder(embedding_model)),
      ('svd', TruncatedSVD(n_components)),
      ('svc', SVC(kernel='poly'))
    ])

    pipe.fit(X_train, y_train)

    with open(f'{category}.z', "wb") as f:
      dump(pipe, f, protocol=5)

    # files.download(f'{category}.z')

# Using 80% of the data to train the model
# (split_data shuffles with NumPy's global RNG and no seed is set in this cell, so the
# train/test partition can differ between runs)
data_dict = get_data(data, .8, glove_vectors)
save_models(data_dict)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): `pipeline`, `X_train` and `y_train` are not assigned in any cell visible
# in this notebook (save_models only binds a local `pipe`) — presumably left over from an
# earlier session; confirm and define them before scoring.
pipeline.score(X_train, y_train)

Reducing dimensions to 10 using truncated SVD...


0.9304320203303685

In [None]:
# NOTE(review): `pipeline`, `X_test` and `y_test` are not assigned in any cell visible in
# this notebook — presumably left over from an earlier session; confirm.
pipeline.score(X_test, y_test)

Reducing dimensions to 10 using truncated SVD...


0.9307716735471578

In [None]:
# Single-string smoke test of the classifier (labels are 1 / -1, see bullying_category).
# NOTE(review): `pipeline` is not assigned in any cell visible in this notebook — confirm
# which category's model this is meant to be.
pipeline.predict(np.array(["motherfuckers"]))

Reducing dimensions to 500 using truncated SVD...


array([-1])

In [None]:
from joblib import load
# Reload the pipeline that save_models wrote for the 'ethnicity' category.
# NOTE(review): joblib.load unpickles the file, so only load .z files you produced
# yourself; the custom transformer classes presumably have to be defined in the session
# before loading — confirm.
with open("ethnicity.z", "rb") as f:
    test = load(f)

In [None]:
#Attempt to use ONNX
# !pip install skl2onnx
# !pip install onnxruntime

# from skl2onnx import to_onnx
# import onnxruntime as rt

# class Cyberbullying_Detector:
#   def __init__(self, category, path="") -> None:
#     self.category = category
#     self.Vt = None

#     self._trained = False
#     self._onnx_loaded = False

#   def train(self, df, category_name, n_components, train_size, valid_size, test_size, **kwargs):

#     self._trained = True
#     pass

#   def predict(self, text):
#     pass

#   def save(self, path):
#     # Convert into ONNX format.
#     onx = to_onnx(clr, X[:1])

#     with open(f"{path}cb_{self.category}.onnx", "wb") as f:
#         f.write(onx.SerializeToString())

#   def load(self, path)
#     sess = rt.InferenceSession("rf_iris.onnx", providers=["CPUExecutionProvider"])
#     input_name = sess.get_inputs()[0].name
#     label_name = sess.get_outputs()[0].name
#     pred_onx = sess.run([label_name], {input_name: X_test.astype(np.float32)})[0]

In [None]:
# !pip install skl2onnx
# !pip install onnxruntime

# import numpy as np
# from sklearn.datasets import load_iris
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier

# iris = load_iris()
# X, y = iris.data, iris.target
# X = X.astype(np.float32)
# X_train, X_test, y_train, y_test = train_test_split(X, y)
# clr = RandomForestClassifier()
# clr.fit(X_train, y_train)

# # Convert into ONNX format.
# from skl2onnx import to_onnx

# onx = to_onnx(clr, X[:1])
# with open("rf_iris.onnx", "wb") as f:
#     f.write(onx.SerializeToString())

# # Compute the prediction with onnxruntime.
# import onnxruntime as rt

# sess = rt.InferenceSession("rf_iris.onnx", providers=["CPUExecutionProvider"])
# input_name = sess.get_inputs()[0].name
# label_name = sess.get_outputs()[0].name
# pred_onx = sess.run([label_name], {input_name: X_test.astype(np.float32)})[0]

Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pack