In [2]:
import pandas as pd
import os

In [5]:
# Input files: the training set (has `text` and `label` columns used below)
# and the set whose `text` column is featurised for prediction.
train_csv_path = 'BullyingMultiClase.csv'
predict_csv_path = 'BullyingPredict.csv'

train_df = pd.read_csv(train_csv_path)
predict_df = pd.read_csv(predict_csv_path)

text     0
label    0
dtype: int64

# Feature extraction

In [None]:
# Root folder for every extracted feature set.
# `exist_ok=True` keeps the cell safe to re-run and avoids the race between a
# separate exists() check and the create call.
os.makedirs('features', exist_ok=True)

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Single source of truth for where TF-IDF artefacts are written; the folder is
# created from this variable so the path cannot drift from the one used when saving.
tfidf_folder = "features/tfidf"

# Creates intermediate folders as needed and is a no-op when it already exists.
os.makedirs(tfidf_folder, exist_ok=True)

In [None]:
# Word uni-/bi-gram TF-IDF with English stop words removed, terms seen in fewer
# than 5 documents dropped, and the vocabulary capped at 10,000 entries.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
)

# Learn the vocabulary and IDF weights from the training text only.
features_train = tfidf.fit_transform(train_df.text).toarray()
labels_train = train_df.label

# Project the prediction text onto the vocabulary learned above.
# This must be `transform`, not `fit_transform`: refitting would build a
# different vocabulary (so the columns would no longer line up with
# `features_train`) and would overwrite the fitted state of `tfidf`.
features_predict = tfidf.transform(predict_df.text).toarray()

In [None]:
import os
from joblib import dump

# Guarantee the output directory is present before anything is written.
os.makedirs(tfidf_folder, exist_ok=True)


def _tfidf_path(filename):
    """Return the location of `filename` inside the TF-IDF output folder."""
    return os.path.join(tfidf_folder, filename)


# Persist the fitted vectorizer, then the feature matrices and labels built with it.
dump(tfidf, _tfidf_path("tfidf_vectorizer.joblib"))
dump(features_train, _tfidf_path("features_train.joblib"))
dump(labels_train, _tfidf_path("labels_train.joblib"))
dump(features_predict, _tfidf_path("features_predict.joblib"))


# BERT embeddings (XLM-RoBERTa)

In [None]:
# Folder that receives the transformer embeddings saved with torch.save below.
bert_folder = "features/bert"

# Create the BERT folder itself (the previous code re-created the TF-IDF folder
# here, leaving this one missing). No-op when the folder already exists.
os.makedirs(bert_folder, exist_ok=True)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from tqdm import tqdm  # optional progress bar

# 1. Run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# 2. Tokenizer and bare encoder from the same checkpoint (no classification head)
checkpoint_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)
model = model.to(device)
model.eval()  # evaluation mode: dropout and similar layers are disabled


# 3. Mask-aware mean pooling over the sequence dimension
def mean_pool(last_hidden_state, attention_mask):
    """Average token vectors along dim 1, counting only unmasked positions.

    Args:
        last_hidden_state: Tensor indexed as [batch, sequence, hidden].
        attention_mask: Tensor indexed as [batch, sequence]; positions with
            value 0 contribute nothing to the average.

    Returns:
        Tensor of shape [batch, hidden]. A row whose mask is all zeros comes
        back as zeros because the divisor is clamped to a tiny positive value.
    """
    # Broadcast the mask across the hidden dimension so it can weight each token.
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    token_total = (last_hidden_state * weights).sum(dim=1)
    # Clamp so a fully masked row divides by 1e-9 instead of 0.
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return token_total / token_count


# 4. Function to extract features from a list of texts
from tqdm import tqdm


def extract_features(texts, batch_size=64):
    """Embed `texts` with the module-level `tokenizer` and `model`.

    Each batch is tokenized (padded, truncated to 128 tokens), run through the
    model without gradient tracking, mean-pooled over unmasked tokens, and
    moved to the CPU.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        CPU tensor with one pooled embedding row per input text, in input
        order. For empty input, an empty tensor of shape [0, hidden_size].
    """
    # Materialise once so any iterable (list, tuple, Series, generator) is
    # indexed positionally by the DataLoader.
    texts = list(texts)
    if not texts:
        # torch.cat on an empty list raises, so return an explicit empty matrix.
        return torch.empty((0, model.config.hidden_size))

    all_embeddings = []
    dataloader = DataLoader(texts, batch_size=batch_size)
    for batch in tqdm(dataloader, desc="Extracting features"):
        # Tokenize a batch of texts
        encoded = tokenizer(batch, padding=True, truncation=True,
                            return_tensors="pt", max_length=128)
        # Move every tokenizer output onto the same device as the model.
        encoded = {k: v.to(device) for k, v in encoded.items()}

        with torch.no_grad():
            output = model(**encoded)
            embeddings = mean_pool(output.last_hidden_state, encoded["attention_mask"])
            # Collect on the CPU so accelerator memory is not held across batches.
            all_embeddings.append(embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)


# Make sure the output folder exists before the expensive extraction starts, so
# the run cannot fail at save time because of a missing directory.
os.makedirs(bert_folder, exist_ok=True)

# 5. Extract features for train data
x_train = extract_features(train_df["text"].tolist(), batch_size=256)
y_train = train_df["label"]

# 6. Extract features for predict data
x_predict = extract_features(predict_df["text"].tolist(), batch_size=256)

# 7. Save the features and labels
# NOTE(review): y_train is a pandas Series, not a tensor, so torch.save stores
# it as a pickled Python object. Presumably the loader must allow full
# unpickling (e.g. torch.load(..., weights_only=False)) - confirm downstream.
torch.save(x_train, os.path.join(bert_folder, "x_train.pt"))
torch.save(y_train, os.path.join(bert_folder, "y_train.pt"))
torch.save(x_predict, os.path.join(bert_folder, "x_predict.pt"))
