In [None]:
!pip install pandas numpy scikit-learn matplotlib

In [None]:

import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from numpy.linalg import norm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the dataset. The "embedding" column is read from the CSV as text, so each
# cell is parsed back into a Python object with ast.literal_eval.
df = pd.read_csv("train_embedded.csv").assign(
    embedding=lambda frame: frame["embedding"].map(ast.literal_eval)
)


In [None]:

def get_center(df, mask):
    """Return the centroid (element-wise mean) of the embeddings selected by ``mask``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``"embedding"`` column. Each cell is assumed to hold a
        numeric sequence of the same length (required for the mean over axis 0).
    mask : boolean Series or array
        Row selector aligned with ``df``; rows where it is True are averaged.

    Returns
    -------
    numpy.ndarray
        Mean of the selected embeddings, taken along axis 0.

    Raises
    ------
    ValueError
        If ``mask`` selects no rows. Averaging an empty selection would
        otherwise yield NaN and silently corrupt every vector derived from it.
    """
    selecionados = df.loc[mask, "embedding"]
    if len(selecionados) == 0:
        raise ValueError("get_center: mask selects no rows; cannot compute a centroid")
    return np.mean(selecionados.to_list(), axis=0)

# Neutral vector: centroid of the comments carrying none of the five labels
v_neutro = get_center(
    df,
    (df[["identity_hate", "insult", "obscene", "threat", "toxic"]] == 0).all(axis=1),
)

# Toxic vectors: one centroid per label, over the comments flagged with that label
v_racismo, v_insulto, v_obsceno, v_ameaca, v_toxico = (
    get_center(df, df[rotulo] == 1)
    for rotulo in ("identity_hate", "insult", "obscene", "threat", "toxic")
)


In [None]:

# Correction directions: from each toxic centroid towards the neutral centroid.
# The order matches the label order used for the classifier targets.
vetores_corrigir = [
    v_neutro - v_classe
    for v_classe in (v_racismo, v_insulto, v_obsceno, v_ameaca, v_toxico)
]

# Keep the individually named vectors available as well (same array objects).
(
    v_corrigir_racismo,
    v_corrigir_insulto,
    v_corrigir_obsceno,
    v_corrigir_ameaca,
    v_corrigir_toxico,
) = vetores_corrigir


In [None]:

# Feature matrix (one row per embedding) and the five binary label columns
X = np.stack(df["embedding"].to_numpy())
Y = df[["identity_hate", "insult", "obscene", "threat", "toxic"]].to_numpy()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# One logistic regression per label column
clf = MultiOutputClassifier(LogisticRegression(max_iter=1000)).fit(X_train, y_train)


In [None]:

def detox_embedding(embedding, probs, vetores_corrigir):
    """Shift ``embedding`` along a probability-weighted mix of correction vectors.

    The probabilities are normalised to sum to 1, so the result is
    ``embedding + sum_i (probs[i] / probs.sum()) * vetores_corrigir[i]``.

    Parameters
    ----------
    embedding : array-like
        The vector to correct.
    probs : array-like, 1-D
        One weight per correction vector (e.g. per-label probabilities).
    vetores_corrigir : sequence of array-like
        Correction vectors, in the same order as ``probs``.

    Returns
    -------
    numpy.ndarray
        The corrected embedding. When all weights are zero, an unmodified copy
        of ``embedding`` is returned.

    Raises
    ------
    ValueError
        If ``probs`` is not 1-D or its length differs from the number of
        correction vectors. Pairing them with ``zip`` would otherwise drop the
        extras silently, applying weights that no longer sum to 1.
    """
    embedding = np.asarray(embedding)
    probs = np.asarray(probs)
    if probs.ndim != 1 or len(probs) != len(vetores_corrigir):
        raise ValueError(
            "probs must be 1-D with one entry per correction vector "
            f"(got shape {probs.shape} for {len(vetores_corrigir)} vectors)"
        )

    total = probs.sum()
    if total == 0:
        # Nothing to correct; still return a new array so callers can mutate it.
        return embedding.copy()

    weights = probs / total
    # (n,) @ (n, dim) -> (dim,): the weighted sum of the correction vectors.
    v_corrigir = weights @ np.stack(vetores_corrigir)
    return embedding + v_corrigir


In [None]:

# Use the first comment of the dataset as the worked example
row = df.iloc[0]
embedding_original = np.asarray(row["embedding"])

# Predict the probabilities for the comment: predict_proba is iterated per label,
# and from each result we keep row 0, column 1 (the probability of class 1).
probs_raw = clf.predict_proba([embedding_original])
probs = np.array([por_rotulo[0, 1] for por_rotulo in probs_raw])

# Apply the detox shift
embedding_corrigido = detox_embedding(
    embedding_original, probs, vetores_corrigir
)


In [None]:

def cosine_distance(a, b):
    """Return the cosine distance between ``a`` and ``b``.

    Computed as ``1 - (a . b) / (||a|| * ||b||)``: 0 for vectors pointing the
    same way, 1 for orthogonal vectors, 2 for opposite vectors.
    """
    escala = norm(a) * norm(b)
    similaridade = np.dot(a, b) / escala
    return 1 - similaridade

# Distance to the toxic centroid before and after the detox shift
dist_original = cosine_distance(embedding_original, v_toxico)
dist_detox = cosine_distance(embedding_corrigido, v_toxico)
print("↔️ original → tóxico:", dist_original)
print("↔️ detox    → tóxico:", dist_detox)


In [None]:

# Project the four vectors of interest onto their first two principal components.
# The result is stored under its own name: reusing `X` would overwrite the
# feature matrix and break a re-run of the training cell.
pca = PCA(n_components=2)
pontos_2d = pca.fit_transform(
    [embedding_original, embedding_corrigido, v_toxico, v_neutro]
)

# One scatter call per point so that every point gets its own legend entry
# (a single scatter call yields a single legend handle).
fig, ax = plt.subplots()
for ponto, cor, rotulo in zip(
    pontos_2d,
    ["blue", "green", "red", "black"],
    ["Original", "Detox", "Tóxico", "Neutro"],
):
    ax.scatter(ponto[0], ponto[1], color=cor, label=rotulo)
ax.legend()
ax.set_title("Visualização dos embeddings")
ax.grid(True)
plt.show()


In [None]:

# Instalar e carregar SONAR
!pip install -q transformers

from transformers import AutoTokenizer, AutoModel
import torch

model_name = "facebook/sonar-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def sonar_embedding(text):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    The mean is weighted by the tokenizer's attention mask, so padding
    positions (the tokenizer is called with ``padding=True``) do not dilute the
    embedding when several texts of different lengths are encoded together.
    For a single string no padding is added and the result is the plain mean
    over tokens.

    Parameters
    ----------
    text : str or list of str
        Text(s) to embed; longer inputs are truncated by the tokenizer.

    Returns
    -------
    numpy.ndarray
        The pooled embedding with singleton dimensions squeezed out
        (1-D for a single text).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state  # assumed (batch, tokens, hidden) — TODO confirm
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against division by zero for an all-padding row.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()


In [None]:

# Enter a comment by hand and generate its detoxified embedding
comentario = "Seu lugar não é aqui. Volta pra sua terra."  # Replace with any text you like
embedding_comentario = sonar_embedding(comentario)

# From each per-label result keep row 0, column 1 (the probability of class 1)
probs_raw = clf.predict_proba([embedding_comentario])
probs = np.array([por_rotulo[0, 1] for por_rotulo in probs_raw])

embedding_detox = detox_embedding(
    embedding_comentario, probs, vetores_corrigir
)

# Distance to the toxic centroid before and after the shift
print(
    "Distância original → tóxico:",
    cosine_distance(embedding_comentario, v_toxico),
)
print(
    "Distância detox    → tóxico:",
    cosine_distance(embedding_detox, v_toxico),
)
