## Importa librerías

In [None]:
# Install libraries
#!pip install boto3
#!pip install emoji
#!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.3.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.2 (from mlflow)
  Downloading mlflow_skinny-3.3.2-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.2 (from mlflow)
  Downloading mlflow_tracing-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.3.2->mlflow)
  Downloading databricks_sdk-0.65.0-py3-none-any.whl.metadata (39 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_relay-3.2.0-py3-none-any.whl

In [None]:
# Getting the data
import boto3
from google.colab import userdata

# General use
import os
import numpy as np # linear algebra
import pandas as pd # data processing

# Text pre processing
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
import emoji
import re

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE #class balance

# Model training and evaluation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    recall_score,
)
from sklearn.model_selection import cross_val_score

# Experiments workflow
import mlflow
import mlflow.sklearn

# Fetch the NLTK resources this notebook relies on (tokenizer data, both
# POS-tagger package names, and WordNet).
for nltk_resource in ('punkt_tab', 'averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)
# Kept as the bare last expression so the cell still displays the call's return value.
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# AWS credentials, fetched through google.colab's `userdata` (nothing is hard-coded here)
AWS_ACCESS_KEY_ID = userdata.get('AWS_ACCESS_KEY_ID_PROJECT')
AWS_SECRET_ACCESS_KEY = userdata.get('AWS_SECRET_ACCESS_KEY_PROJECT')
AWS_DEFAULT_REGION = 'us-east-2'

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_DEFAULT_REGION
)

# --- S3 configuration ---
s3_uri = "s3://s3-cyberbullying-classification-data/files/md5/c8/5e2d40bf87b27619f2a4c49fcb9cda"

# Split "s3://<bucket>/<key...>" into the bucket name and the object key
bucket_name = s3_uri.split('/')[2]
object_key = '/'.join(s3_uri.split('/')[3:])

print(f"Cargando datos desde el bucket: {bucket_name}")
print(f"Clave del objeto: {object_key}")

try:
    # Fetch the object from S3
    s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)

    # Parse the streamed body as CSV into a DataFrame.
    # If the file is .json, .txt, etc., swap pd.read_csv for the matching reader.
    data = pd.read_csv(s3_object['Body'])
except Exception as e:
    print(f"Error al cargar los datos: {e}")
    # Every later cell needs `data`; re-raise so the notebook stops here instead of
    # failing further down with a NameError (or silently reusing a stale `data`).
    raise
else:
    print("\n¡Datos cargados con éxito!")

Cargando datos desde el bucket: s3-cyberbullying-classification-data
Clave del objeto: files/md5/c8/5e2d40bf87b27619f2a4c49fcb9cda

¡Datos cargados con éxito!


## Preprocesamiento y Extracción de Características

In [None]:
data.head(10)  # First 10 rows of the dataset

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying
8,@stockputout everything but mostly my priest,not_cyberbullying
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying


In [None]:
# Remove exact duplicate rows, then renumber the index from 0
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
# Discard every tweet whose text appears with more than one distinct label
conflict_mask = data.groupby('tweet_text')['cyberbullying_type'].transform('nunique').gt(1)
data = data.loc[~conflict_mask].reset_index(drop=True)

In [None]:
# Preprocessing
# Module-level WordNet lemmatizer instance; lemmatizer_words below calls its
# .lemmatize() for every token.
lemmatizer = WordNetLemmatizer() # Creates lemmatizer

def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet part-of-speech constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

def lemmatizer_words(text):
    """Tokenize ``text``, lemmatize each token according to its POS tag, and rejoin.

    POS tagging runs on the tokens as produced by the tokenizer; each token is
    lower-cased only when it is handed to the lemmatizer. The lemmas are joined
    with single spaces and returned as one string.
    """
    tagged_tokens = pos_tag(word_tokenize(text))  # (token, POS tag) pairs
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token.lower(), get_wordnet_pos(tag)))
    return " ".join(lemmas)

# Cast the text column to str, then lemmatize every tweet
data['tweet_text'] = data['tweet_text'].astype(str).apply(lemmatizer_words)

# Domain-specific stop words, merged with scikit-learn's built-in English list
stopwords_especiales = [
    'rt', 'just', 'don', 'http', 'got', 'https', 'wa', 'amp', 've', 'mkr',
]
comb_stopwords = list(ENGLISH_STOP_WORDS.union(stopwords_especiales))

In [None]:
# Snapshot of the preprocessed frame; the train/test split and the label lookup
# in later cells read from this copy.
data_copy = data.copy()

In [None]:
# Hold out 20% of the tweets for testing before any vectorizer is fitted (avoids data leakage)
x_train, x_test, y_train, y_test = train_test_split(
    data_copy['tweet_text'],
    data_copy['cyberbullying_type'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Label distribution of the training split
y_train.value_counts()

Unnamed: 0_level_0,count
cyberbullying_type,Unnamed: 1_level_1
age,6427
religion,6393
ethnicity,6334
gender,6249
not_cyberbullying,5090
other_cyberbullying,5009


In [None]:
# Cleans raw documents and fits a TF-IDF vectorizer on them
def vectoriza(data):
    """Strip emojis, punctuation and digits from each document, then fit TF-IDF.

    ``data`` is any iterable of strings. Returns a ``(vectorizer, matrix)`` pair:
    the fitted ``TfidfVectorizer`` and the result of ``fit_transform`` on the
    cleaned documents.
    """
    def _clean(doc):
        # Order matters only for readability: emojis first, then punctuation, then digits.
        without_emojis = emoji.replace_emoji(doc, replace="")
        without_punct = re.sub(r'[^\w\s]', '', without_emojis)
        return re.sub(r'\d+', '', without_punct)

    cleaned_docs = [_clean(doc) for doc in data]

    tfidf_vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words=comb_stopwords,
        token_pattern=r'(?u)\b\w\w\w+\b'  # tokens of at least three word characters
    )
    return tfidf_vectorizer, tfidf_vectorizer.fit_transform(cleaned_docs)

# Fit the vectorizer on the training texts only; the fitted `vectorizer` is
# reused later to transform the test split.
vectorizer, x_train_vectorizado = vectoriza(x_train)
print(f"Número de documentos de entrenamiento: {x_train_vectorizado.shape[0]}")

Número de documentos de entrenamiento: 35502


In [None]:
y_train.value_counts()  # Checks class balance before resampling

Unnamed: 0_level_0,count
cyberbullying_type,Unnamed: 1_level_1
age,6427
religion,6393
ethnicity,6334
gender,6249
not_cyberbullying,5090
other_cyberbullying,5009


In [None]:
# Class balance with SMOTE
# 'not majority' resamples every class except the largest one.
smote = SMOTE(sampling_strategy='not majority', random_state=42)
# NOTE(review): this rebinds x_train (raw text -> resampled TF-IDF matrix) and
# y_train, so the cell is not idempotent; re-run the split cell before re-running it.
x_train, y_train = smote.fit_resample(x_train_vectorizado, y_train)
y_train.value_counts()

Unnamed: 0_level_0,count
cyberbullying_type,Unnamed: 1_level_1
ethnicity,6427
not_cyberbullying,6427
other_cyberbullying,6427
gender,6427
religion,6427
age,6427


## Entrenamiento y evaluación del Modelo

In [None]:
# Defines mlflow server
# NOTE(review): presumably relies on Databricks credentials configured outside
# this notebook — confirm before running on a fresh kernel.
mlflow.set_tracking_uri('databricks')
# Keep the experiment handle; its experiment_id is passed to mlflow.start_run below.
experiment = mlflow.set_experiment('/cyberbullying-classification')

In [None]:
# Baseline KNN (k=2), tracked in its own MLflow run.
# The model used to be logged with no run context; wrapping it in start_run
# (same pattern as the next cell) ties the model to a run that is closed on
# exit and records the hyperparameter alongside it.
with mlflow.start_run(experiment_id=experiment.experiment_id):
    n_neighbors = 2
    mlflow.log_param("n_neighbors", n_neighbors)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x_train, y_train)
    mlflow.sklearn.log_model(knn, "knn_model")

In [None]:
# Train, evaluate and track the KNN model in a single MLflow run

def transforma(data, vectorizer):
    """Clean ``data`` the same way as ``vectoriza`` and transform it with a fitted vectorizer.

    Emojis, punctuation and digits are stripped from every document; the
    already-fitted ``vectorizer`` is only applied (``transform``), never refitted.
    """
    cleaned_docs = []
    for doc in data:
        sin_emojis = emoji.replace_emoji(doc, replace="")
        no_punct = re.sub(r'[^\w\s]', '', sin_emojis)
        no_digits = re.sub(r'\d+', '', no_punct)
        cleaned_docs.append(no_digits)

    return vectorizer.transform(cleaned_docs)


with mlflow.start_run(experiment_id=experiment.experiment_id):
    # Log parameters
    n_neighbors = 5
    # `weights` was referenced without ever being defined (NameError);
    # 'uniform' is scikit-learn's default for KNeighborsClassifier.
    weights = 'uniform'
    mlflow.log_param("n_neighbors", n_neighbors)
    mlflow.log_param("weights", weights)

    # Train model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    knn.fit(x_train, y_train)

    # Test data preprocessing (uses the vectorizer fitted on the training split)
    x_test_vectorizado = transforma(x_test, vectorizer)
    print(f"Forma de matriz de características de prueba: {x_test_vectorizado.shape}")

    # Make predictions with trained model
    knn_test_pred = knn.predict(x_test_vectorizado)
    # Log model
    mlflow.sklearn.log_model(knn, "knn_model")

    # Evaluate KNN model
    print(classification_report(y_test, knn_test_pred))

    # Use the classifier's own label order for both the matrix rows/columns and
    # the tick labels. `.unique()` on the label column returns first-appearance
    # order, which does not match confusion_matrix's sorted order and would
    # mislabel the axes.
    original_labels = knn.classes_
    cm_knn = confusion_matrix(y_test, knn_test_pred, labels=original_labels)
    ConfusionMatrixDisplay(cm_knn, display_labels=original_labels).plot(
        cmap='Blues', xticks_rotation='vertical'
    )

    # Log metrics
    accuracy = accuracy_score(y_test, knn_test_pred)
    # NOTE(review): support-weighted recall is numerically equal to accuracy;
    # consider average='macro' if a distinct metric is wanted.
    recall = recall_score(y_test, knn_test_pred, average='weighted')
    print(f"\nAccuracy: {accuracy}, \nRecall: {recall}")

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("recall", recall)



In [None]:
# Cross validation score
# NOTE(review): x_train / y_train here are the SMOTE-resampled arrays, so the
# validation folds contain synthetic samples; this score is not directly
# comparable to the held-out test metrics.
cv_scores_knn = cross_val_score(knn, x_train, y_train, cv=5)

print('Cross validation score (K Nearest Neighbor):', cv_scores_knn.mean())

Cross validation score (K Nearest Neighbor): 0.4536848892557857
