In [None]:

!pip install pandas numpy scikit-learn matplotlib transformers


In [None]:

import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from numpy.linalg import norm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
import torch

# Read the pre-embedded training set, then parse each "embedding" cell from its
# text form back into a Python literal (presumably a list of floats — confirm
# against the CSV).
df = pd.read_csv("train_embedded.csv")
df["embedding"] = df["embedding"].map(ast.literal_eval)


In [None]:

def get_center(df, mask):
    """Return the centroid (element-wise mean) of the embeddings selected by ``mask``.

    Args:
        df: DataFrame with an ``"embedding"`` column whose cells are
            equal-length numeric sequences.
        mask: Boolean row selector for ``df``.

    Returns:
        numpy.ndarray: mean of the selected embeddings along axis 0.

    Raises:
        ValueError: if ``mask`` selects no rows. Averaging an empty selection
            would otherwise yield NaN and silently corrupt every vector
            derived from this centroid.
    """
    # .loc selects rows and column in one step instead of chained indexing.
    selected = df.loc[mask, "embedding"].to_list()
    if not selected:
        raise ValueError("get_center: mask selects no rows; cannot compute a centroid")
    return np.mean(selected, axis=0)

# Label columns, in the order used for the per-category centroids below.
rotulos = ["identity_hate", "insult", "obscene", "threat", "toxic"]

# Neutral centroid: rows where every one of the five labels is 0.
sem_rotulo = (df[rotulos] == 0).all(axis=1)
v_neutro = get_center(df, sem_rotulo)

# One centroid per label, built from the rows where that label is 1.
v_racismo, v_insulto, v_obsceno, v_ameaca, v_toxico = (
    get_center(df, df[rotulo] == 1) for rotulo in rotulos
)


In [None]:

# Correction vectors: each one is the neutral centroid minus a category centroid,
# i.e. the displacement from that category's centroid to the neutral one.
# Order: identity_hate, insult, obscene, threat, toxic.
vetores_corrigir = [
    v_neutro - centro
    for centro in (v_racismo, v_insulto, v_obsceno, v_ameaca, v_toxico)
]

# Keep the individually named vectors available as well.
(
    v_corrigir_racismo,
    v_corrigir_insulto,
    v_corrigir_obsceno,
    v_corrigir_ameaca,
    v_corrigir_toxico,
) = vetores_corrigir


In [None]:

# Feature matrix: one row per comment, stacked from the per-row embeddings.
colunas_alvo = ["identity_hate", "insult", "obscene", "threat", "toxic"]
X = np.stack(df["embedding"].to_numpy())
# Target matrix: one column per label, same order as `colunas_alvo`.
Y = df[colunas_alvo].to_numpy()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# One independent logistic regression per label column.
regressao_base = LogisticRegression(max_iter=1000)
clf = MultiOutputClassifier(regressao_base)
clf.fit(X_train, y_train)


In [None]:

def detox_embedding(embedding, probs, vetores_corrigir):
    """Shift an embedding by a probability-weighted mix of correction vectors.

    Args:
        embedding: 1-D array-like embedding to correct.
        probs: 1-D array-like of non-negative scores, one per correction vector.
        vetores_corrigir: sequence of correction vectors, one per entry of
            ``probs``, each with the same shape as ``embedding``.

    Returns:
        numpy.ndarray: ``embedding`` plus the weighted sum of the correction
        vectors, or an unchanged copy of ``embedding`` when ``probs`` sums to 0.

    Raises:
        ValueError: if ``probs`` is not 1-D or its length differs from the
            number of correction vectors.

    Note:
        The weights are ``probs`` renormalised to sum to 1, so the size of the
        shift depends on the relative scores only, not on their magnitude.
    """
    # Accept lists/tuples as well as arrays.
    embedding = np.asarray(embedding)
    probs = np.asarray(probs, dtype=float)
    vetores = np.asarray(vetores_corrigir)

    # zip() would silently drop unmatched entries; fail loudly instead.
    if probs.ndim != 1 or len(probs) != len(vetores):
        raise ValueError(
            "detox_embedding: probs must be 1-D with one entry per correction vector "
            f"(got probs shape {probs.shape} and {len(vetores)} vectors)"
        )

    total = probs.sum()
    if total == 0:
        # Nothing to correct; return a copy so callers never alias the input.
        return embedding.copy()

    weights = probs / total
    # (k,) @ (k, d) -> (d,): weighted sum of the correction vectors.
    return embedding + weights @ vetores


In [None]:

# Checkpoint identifier passed to both loaders below.
# NOTE(review): confirm this id resolves through AutoTokenizer/AutoModel and is
# the same encoder that produced the vectors in train_embedded.csv — TODO confirm.
model_name = "facebook/sonar-medium"
# Tokenizer and encoder are module-level objects used by sonar_embedding().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def sonar_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state over tokens.

    Args:
        text: a string, or a list of strings, passed straight to the
            module-level ``tokenizer``.

    Returns:
        numpy.ndarray: the pooled embedding. For a single string the batch
        dimension is dropped; for a batch of several strings one row per input
        is returned.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain mean over the token axis.
        pooled = hidden.mean(dim=1)
    else:
        # Average over real tokens only, so padding added for batched inputs
        # does not drag the mean toward the pad-token states. For a single,
        # unpadded input this equals the plain mean.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    # Drop only the batch axis (a no-op when the batch has more than one row).
    return pooled.squeeze(0).cpu().numpy()


In [None]:

# Sample comment to run through the embed -> classify -> correct pipeline.
comentario = "You don't belong here. Go back to your country."
embedding_original = sonar_embedding(comentario)

# predict_proba yields one array per label; feed the embedding as a one-row batch.
probs_raw = clf.predict_proba(embedding_original.reshape(1, -1))
# For each label take row 0 (the only sample), column 1 (second class,
# presumably the positive label "1").
probs = np.array([por_rotulo[0, 1] for por_rotulo in probs_raw])

embedding_corrigido = detox_embedding(embedding_original, probs, vetores_corrigir)


In [None]:

def cosine_distance(a, b):
    """Return the cosine distance between ``a`` and ``b``.

    Computed as one minus the dot product of the two vectors divided by the
    product of their norms.
    """
    produto_normas = norm(a) * norm(b)
    similaridade = np.dot(a, b) / produto_normas
    return 1 - similaridade

# Label columns, their display names, and the matching category centroids
# (all three lists share the same order).
categorias = ["identity_hate", "insult", "obscene", "threat", "toxic"]
nomes = ["Racismo", "Insulto", "Obsceno", "Ameaca", "Toxico"]
vetores_toxicos = [v_racismo, v_insulto, v_obsceno, v_ameaca, v_toxico]

# Report the cosine distance to each category centroid before and after the correction.
for indice, nome in enumerate(nomes):
    vetor = vetores_toxicos[indice]
    d_orig = cosine_distance(embedding_original, vetor)
    d_detox = cosine_distance(embedding_corrigido, vetor)
    linha = f"{nome.upper()} - Original → {d_orig:.4f} | Detox → {d_detox:.4f}"
    print(linha)


In [None]:

# Points to project: the sample embedding before/after correction, the neutral
# centroid, and the five category centroids. A dedicated name is used so the
# training feature matrix `X` is not overwritten by this 8-row array.
pontos_pca = np.stack([
    embedding_original, embedding_corrigido, v_neutro,
    v_racismo, v_insulto, v_obsceno, v_ameaca, v_toxico,
])

# Display name and colour for each row of `pontos_pca`, in the same order.
labels = ["Original", "Detox", "Neutro",
          "Racismo", "Insulto", "Obsceno", "Ameaca", "Toxico"]

colors = ["blue", "green", "black", "red", "orange", "purple", "brown", "crimson"]

# Project onto the first two principal components of these eight points only.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(pontos_pca)
var_pc1, var_pc2 = pca.explained_variance_ratio_

fig, ax = plt.subplots(figsize=(10, 6))
for (x, y), label, color in zip(X_pca, labels, colors):
    ax.scatter(x, y, color=color)
    # Offset in points keeps the label readable whatever the axis scale.
    ax.annotate(label, (x, y), xytext=(4, 4), textcoords="offset points", fontsize=9)
ax.set_title("Embeddings em relação às categorias tóxicas")
ax.set_xlabel(f"PC1 ({var_pc1:.1%} da variância)")
ax.set_ylabel(f"PC2 ({var_pc2:.1%} da variância)")
ax.grid()
plt.show()
