## DistilBERT Toxicity Classifier: Implementation Modified to Run on Apple Silicon

George Cotea, 2024

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm.notebook import tqdm
import pandas
import torch
import numpy as np

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch


# Load the pretrained toxicity checkpoint together with its matching tokenizer
model_name = "citizenlab/distilbert-base-multilingual-cased-toxicity"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Prefer the MPS backend when PyTorch reports it as available; otherwise use the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Last expression of the cell: moves the model to the device and displays its repr
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [18]:
# Read the messages to classify from their CSV file into a DataFrame
import pandas as pd
df = pd.read_csv(
    filepath_or_buffer="/Users/george/Desktop/Uni/2023-2024/Thesis/Final Thesis Collection/testCSV/nomoderationjoined.csv"
)

In [19]:
# Adjusted functions for manual inference
def _toxic_label_index(model, default=0):
    """Return the logit column labelled 'toxic' in ``model.config.label2id``, else ``default``."""
    label2id = getattr(getattr(model, 'config', None), 'label2id', None)
    if not isinstance(label2id, dict):
        return default
    for label, index in label2id.items():
        if str(label).lower() == 'toxic':
            return int(index)
    return default


def classify_text(text, tokenizer, model, device, threshold=2/3):
    """Classify ``text`` as ``'toxic'`` or ``'not_toxic'`` with a sequence-classification model.

    Parameters
    ----------
    text : str or list of str
        Message(s) to score. If several texts are given, their toxic
        probabilities are averaged into a single score.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors='pt', padding=True,
        truncation=True, max_length=512)``; must return a mapping holding
        ``'input_ids'`` and ``'attention_mask'`` tensors.
    model : callable
        Called as ``model(input_ids, attention_mask=...)``; its result must
        expose ``.logits`` with one row per text and one column per label.
    device : torch.device
        Device the input tensors are moved to before inference.
    threshold : float, default 2/3
        Minimum mean toxic probability for the ``'toxic'`` label.

    Returns
    -------
    tuple
        ``(classification, score)`` where ``score`` is the mean toxic
        probability (a NumPy scalar) and ``classification`` is ``'toxic'``
        when ``score >= threshold``, otherwise ``'not_toxic'``.
    """
    # Tokenization (inputs longer than 512 tokens are truncated)
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)

    # Model inference without gradient tracking
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = softmax(outputs.logits, dim=1)

    # Move to CPU and transform to numpy array
    predictions = predictions.cpu().numpy()

    # Take the toxic column from the model's own label mapping rather than
    # assuming it is column 0; column 0 remains the fallback when the model
    # exposes no 'toxic' label.
    toxic_scores = predictions[:, _toxic_label_index(model)]

    # Determine classification based on threshold
    score = toxic_scores.mean()
    classification = 'toxic' if score >= threshold else 'not_toxic'

    return classification, score

# Apply classification across DataFrame with progress bar
from tqdm.auto import tqdm
tqdm.pandas(desc="Classifying messages")  # enables Series.progress_apply

# Ensure all messages are strings
# NOTE(review): missing values are presumably stringified too (e.g. NaN -> 'nan')
# and then scored as ordinary text — confirm this is intended.
df['message'] = df['message'].astype(str)

# Apply the classification function with progress_apply;
# each element of `results` is a (classification, score) tuple
results = df['message'].progress_apply(lambda x: classify_text(x, tokenizer, model, device))

# Build the two columns with comprehensions instead of unpacking zip(*results):
# that unpacking raises ValueError when the frame has no rows.
df['classification'] = [label for label, _ in results]
df['score'] = [score for _, score in results]

Classifying messages:   0%|          | 0/50606 [00:00<?, ?it/s]

In [20]:
# Write the classified DataFrame to disk as CSV
df.to_csv(
    path_or_buf="/Users/george/Desktop/Uni/2023-2024/Thesis/Final Thesis Collection/distilClassified/nomoderationjoinedtoxicity.csv"
)