In [9]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV
import pandas as pd
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer

### Prepare Data

In [10]:
train_en_path = "./data_sources/train/train_en.csv"
test_en_path = "./data_sources/test/test_en.csv"

train_it_path = "./data_sources/train/train_it.csv"
test_it_path = "./data_sources/test/test_it.csv"

train_es_path = "./data_sources/train/train_es.csv"
test_es_path = "./data_sources/test/test_es.csv"

train_df = pd.read_csv(train_it_path)
test_df = pd.read_csv(test_it_path)

# convert to lists
X_train_texts = train_df['text'].tolist()
y_train = train_df['label'].tolist()


# test set
X_test_texts = test_df['text'].tolist()
y_test = test_df['label'].tolist()

### Encode Texts

In [27]:
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
embedder = SentenceTransformer(model_name)

# Compute embeddings
X_train = embedder.encode(X_train_texts, convert_to_numpy=True)
X_test  = embedder.encode(X_test_texts, convert_to_numpy=True)

In [11]:
# try with tdifd
vectorizer = TfidfVectorizer(max_features=6000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train_texts)
X_test_tfidf = vectorizer.transform(X_test_texts)

In [4]:
wandb.init(project="reappropriation-logreg", name="logreg_pipeline")

[34m[1mwandb[0m: Currently logged in as: [33msravisconti[0m ([33msravisconti-projects[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Hyperparameter Tuning

In [28]:
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42),
    param_grid,
    scoring="f1_macro", # optimize for macro F1 score
    cv=5 # cross validation folds
)
grid.fit(X_train, y_train)

best_clf = grid.best_estimator_

# Log best hyperparameters
# wandb.config.update(grid.best_params_)

### Train final model and evaluate

In [30]:
# train final model
best_clf.fit(X_train, y_train)

# evaluate on test set
y_pred = best_clf.predict(X_test)
# convert to numpy arrays
y_pred = np.array(y_pred)
                  
y_true = np.array(y_test)

f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)
cm = confusion_matrix(y_test, y_pred)

class_names = ["offensive", "reappropriative"]

cm_df = pd.DataFrame(
    cm,
    index=[f"True: {c}" for c in class_names],
    columns=[f"Pred: {c}" for c in class_names]
)

# Log metrics
# wandb.log({
#     "f1_macro": f1,
#     "accuracy": accuracy,
#     "classification_report": report,
#     "confusion_matrix": cm
# })

print(f"Test F1 Macro: {f1}")
print(f"Test Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(cm_df)

Test F1 Macro: 0.8721407624633432
Test Accuracy: 0.9128440366972477
Classification Report:
              precision    recall  f1-score   support

           0     0.9758    0.9148    0.9443       176
           1     0.7170    0.9048    0.8000        42

    accuracy                         0.9128       218
   macro avg     0.8464    0.9098    0.8721       218
weighted avg     0.9259    0.9128    0.9165       218

Confusion Matrix:
                       Pred: offensive  Pred: reappropriative
True: offensive                    161                     15
True: reappropriative                4                     38


In [4]:
# tfidf version
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42),
    param_grid,
    scoring="f1_macro", # optimize for macro F1 score
    cv=5 # cross validation folds
)
grid.fit(X_train_tfidf, y_train)

best_clf = grid.best_estimator_

# train final model
best_clf.fit(X_train_tfidf, y_train)

# evaluate on test set
y_pred = best_clf.predict(X_test_tfidf)
# # convert to numpy arrays
# y_pred = np.array(y_pred)
                  
# y_true = np.array(y_test)

f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)
cm = confusion_matrix(y_test, y_pred)

class_names = ["offensive", "reappropriative"]

cm_df = pd.DataFrame(
    cm,
    index=[f"True: {c}" for c in class_names],
    columns=[f"Pred: {c}" for c in class_names]
)

print(f"Test F1 Macro: {f1}")
print(f"Test Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(cm_df)

Test F1 Macro: 0.9196246019775431
Test Accuracy: 0.9495412844036697
Classification Report:
              precision    recall  f1-score   support

           0     0.9714    0.9659    0.9687       176
           1     0.8605    0.8810    0.8706        42

    accuracy                         0.9495       218
   macro avg     0.9159    0.9234    0.9196       218
weighted avg     0.9501    0.9495    0.9498       218

Confusion Matrix:
                       Pred: offensive  Pred: reappropriative
True: offensive                    170                      6
True: reappropriative                5                     37


### Error Analysis

In [12]:
# Indices of false positives : predicted offensive but actually reappropriate
fn_indices = np.where((y_true == 1) & (y_pred == 0))[0]

fn_sentences = [X_test_texts[i] for i in fn_indices]

print("Predicted offensive but actually reappropriate:\n")
for i, sentence in enumerate(fn_sentences):
    print(i)
    print(sentence)
    print("\n")

# save to CSV with space between them
pd.DataFrame(fn_sentences, columns=["sentence"]).to_csv("error_analysis/logistic_regression/false_negatives_it.csv", index=True, sep='\t')

Predicted offensive but actually reappropriate:

0
@USER @USER Per qualcuno sono gay,per altri transgender, per latri ancora LGBT. Per so froci e rottinculo! Bloccatemi vigliacchi di Twitter


1
@USER non sono fiduciosa negli spazi di aggregazione online. Letteralmente ci sono i queer frocialisti eccetera eccetera e poi i circoletti di gay che postano nudes ogni giorno hanno onlyfans e sono convinti di vivere in un porno 24/7

√® un incubo


2
credo semplicemente che per fare autoironia possiamo semplicemente chiamarci froci noi della lgbt, se me lo viene a dire un etero sinceramente gli spacco la faccia, cos√¨ come io non mi rivolgerei mai ad un'altra persona chiamandola negra


3
Sono frocio ma - a differenza sua che guarda solo il suo orticello - supporto i diritti delle donne sex workers che vogliono aprirsi un account onlyfans.

E supporto ovviamente anche le donne che non vogliono aprirsi un account onlyfans.




In [13]:
# Indices of false positives : predicted reappropriate but actually offensive
fp_indices = np.where((y_true == 0) & (y_pred == 1))[0]

fp_sentences = [X_test_texts[i] for i in fp_indices]

print("Predicted reappropriate but actually offensive:\n")
for i, sentence in enumerate(fp_sentences):
    print(i)
    print(sentence)
    print("\n")

# save to CSV with space between them
pd.DataFrame(fp_sentences, columns=["sentence"]).to_csv("error_analysis/logistic_regression/false_positives_it.csv", index=True, sep='\t')

Predicted reappropriate but actually offensive:

0
Tutti i vostri "frocio", mi aprivano un varco verso le camere a gas. Mi hanno ucciso ma IO HO POTUTO LOTTARE, loro no. Anche per questo serve una legge contro l'omofobia

#omocausto #GiornataDellaMemoria2021 @USER


1
Che poi √® l‚Äôetero medio come Pio e Amedeo che va a trans la sera a farsi spaccare i cocchi! #pioeamedeo #LGBTQ #DDLZan #DDLZanLeggeControOmofobia #right  #HumanRights


2
Lo ripeto: i froci fanno vomitare e i loro pride di merda altrettanto visto che la domenica vorrei dormire e mi svegliano con Lady Gaga a palla


3
Ieri sono uscito con una tuta arcobaleno e un tipo mi fa "frocio se vuoi ti faccio conoscere qualcuno che te lo mette al culo" e all'inizio non l'ho presa benissimo


4
Io mi sono rotto i coglioni di tutte queste porcate. La Madonna √® la Madonna, Dio √® Dio, trans lgbt lesbiche gay sono quello che vogliono essere. A tutto c‚Äô√® un limite!


5
Lesbiche e finocchi perfettamente distanziati fra loro e con t

### Output probabilities

In [12]:
y_prob = best_clf.predict_proba(X_test_tfidf)

In [None]:
threshold = 0.6

def low_confidence(probs, threshold=0.6):
    max_probs = probs.max(axis=1)
    return max_probs < threshold

In [None]:
low_conf_idx = low_confidence(y_prob, threshold=0.6)

for i in low_conf_idx.nonzero()[0]:
    text = X_test_texts[i]
    prompt = f"Classify the following text as 'offensive' or 'reappropriative':\n\n{text}"
    llm_label = call_llm(prompt)  # your function to query an LLM
    y_pred_final[i] = llm_label


In [13]:
y_prob

array([[0.81839939, 0.18160061],
       [0.83681347, 0.16318653],
       [0.60086947, 0.39913053],
       [0.40972923, 0.59027077],
       [0.35779106, 0.64220894],
       [0.8661298 , 0.1338702 ],
       [0.11722643, 0.88277357],
       [0.38574331, 0.61425669],
       [0.3485175 , 0.6514825 ],
       [0.66814016, 0.33185984],
       [0.66117165, 0.33882835],
       [0.45987733, 0.54012267],
       [0.72621264, 0.27378736],
       [0.86710536, 0.13289464],
       [0.87106177, 0.12893823],
       [0.46777453, 0.53222547],
       [0.76609739, 0.23390261],
       [0.6674139 , 0.3325861 ],
       [0.66726902, 0.33273098],
       [0.90125417, 0.09874583],
       [0.63535403, 0.36464597],
       [0.85581392, 0.14418608],
       [0.79255221, 0.20744779],
       [0.67521394, 0.32478606],
       [0.78886134, 0.21113866],
       [0.76272596, 0.23727404],
       [0.34134585, 0.65865415],
       [0.79885035, 0.20114965],
       [0.84344447, 0.15655553],
       [0.82260433, 0.17739567],
       [0.

## NON FUNZIONA

### Data Augmentation

In [None]:
# swap and delete augmentation
import nlpaug.augmenter.word as naw

del_aug = naw.RandomWordAug(action="delete", aug_p=0.1)
swap_aug = naw.RandomWordAug(action="swap", aug_p=0.1)

def augment_text(text, n_aug=3):
    """Generate n_aug augmented versions of text."""
    augmented = []
    for _ in range(n_aug):
        choice = np.random.choice(["del", "swap"])
        if choice == "del":
            out = del_aug.augment(text)
        else:
            out = swap_aug.augment(text)
        if isinstance(out, list):
            out = out[0]
    
        augmented.append(out)
        
    return augmented

# Augment only the minority class (reappropriative: label 1)
df = train_df.copy() 

minority_label = 1  # reappropriative
n_aug = 2           # number of augmentations per sample

aug_rows = []

for _, row in df[df["label"] == minority_label].iterrows():
    augmented_versions = augment_text(row["text"], n_aug=n_aug)
    for i, aug_text in enumerate(augmented_versions):
        aug_rows.append({"id": f"{row['id']}_aug_{i}", "text": aug_text, "bio" : row["bio"], "label": minority_label, "lang" : row["lang"]})

df_aug = pd.concat([df, pd.DataFrame(aug_rows)], ignore_index=True)
# --> very bad results, do not use


In [None]:
# Back-translation augmentation
from deep_translator import GoogleTranslator

def back_translate(text):
    print(text)
    try:
        es = GoogleTranslator(source='it', target='en').translate(text)
        it_back = GoogleTranslator(source='en', target='it').translate(es)
        print(it_back)
        return it_back
    except:
        return text  # fallback
    
aug_texts = [back_translate(t) for t in train_df[train_df.label==1].text]
aug_labels = [1 for _ in aug_texts]

# --> very bad results, do not use

madonna che urto i ricchioni di elite URL
madonna che colpisce l'URL dei ragazzi ricchi d'√©lite
#ottoemezzo #ddlzan #LGBTQ 
Tutti i...#froci di destra, 'calorosamente' ringraziano
per i 'calorosi' applausi dei senatori che oggi hanno affossato il #ddlzan !
#ottoemezzo #ddlzan #LGBTQ 
Tutti i... #froci a destra, grazie 'di cuore'
per gli applausi 'calorosi' dei senatori che oggi hanno distrutto il #ddlzan!
sto valutando di venire a Milano per due notti dal 15 feb
√® ancora molto in forse ma qualcuno di voi forci potrebbe ospitarmi? ü•∫
Sto pensando di venire a Milano per due notti dal 15 febbraio
E' ancora molto incerto ma qualcuno di voi magari potrebbe ospitarmi? ü•∫
e anche la terza dose √® andata ho finalmente il 5g e sono ancora + forcio di prima we have to stan URL
e anche la terza dose √® andata, finalmente ho 5 g e sono ancora pi√π forte di prima, dobbiamo stan URL
A volte mi ricordo di quanto sia bello essere lella e non poterlo dire con fierezza a nessuno
A volte ricordo qu

In [None]:
# synonym replacement using masked language model
from transformers import pipeline
import random

mlm = pipeline("fill-mask", model="Musixmatch/umberto-commoncrawl-cased-v1")

def mlm_augment(text, prob=0.15, top_k=5):
    words = text.split()
    new_words = []
    for w in words:
        if random.random() < prob and len(w) > 3:
            masked = text.replace(w, "<mask>", 1)
            preds = mlm(masked, top_k=top_k)
            candidates = [p["token_str"].strip() for p in preds]
            new_words.append(random.choice(candidates))
        else:
            new_words.append(w)
    print(" ".join(new_words))
    return " ".join(new_words)

aug_texts = [mlm_augment(t) for t in train_df[train_df.label==1].text]
aug_labels = [1 for _ in aug_texts]
# --> very bad results, do not use

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at Musixmatch/umberto-commoncrawl-cased-v1 were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from 

</s>NOTUSED che urto i ricchioni di elite URL
#ottoemezzo : #LGBTQ Tutti i...#froci di destra, 'calorosamente' ringraziano per i 'calorosi' applausi dei senatori che oggi hanno affossato il - !
sto valutando di venire a Milano per due notti dal 15 feb √® ancora molto in forse ma qualcuno di voi forci potrebbe aiutare ü•∫
e anche la terza dose √® andata ho finalmente il 5g e sono ancora + forcio di prima we have to go URL
A volte mi ricordo di quanto sia bello sentirsi tristi e non poterla negare con fierezza a nessuno
Chiss√† cosa dice quella forcia di noi safficone che adoriamo i suoi kg di phessa da mattino a sera
no vabb√® appena iniziato american love stories e c‚Äô√® gi√† la coppia forcia e la coppia scopa per ora NESSUNA coppia eterosessuale in vista we love to see it
Certo che noi mamme siamo un circo!!! Quello che su Instagram si lamentava di Grindr e del sesso occasionale ha postato un selfie con Holy Mary üò≥
@USER Un po‚Äô come #omofobia √® colpa mia che so frocio o forse 

### Train on augmented Dataset

In [None]:
X_train_aug_texts = df_aug['text'].tolist()
y_train_aug = df_aug['label'].tolist()

vectorizer = TfidfVectorizer(max_features=6000, ngram_range=(1,2))
X_train_aug = vectorizer.fit_transform(X_train_aug_texts)

# tfidf augmented version
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(
    LogisticRegression(max_iter=2000, class_weight="balanced", random_state=42),
    param_grid,
    scoring="f1_macro", # optimize for macro F1 score
    cv=5 # cross validation folds
)
grid.fit(X_train_aug, y_train_aug)

best_clf = grid.best_estimator_

# train final model
best_clf.fit(X_train_aug, y_train_aug)

# evaluate on test set
y_pred = best_clf.predict(X_test_tfidf)


f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, digits=4)
cm = confusion_matrix(y_test, y_pred)

class_names = ["offensive", "reappropriative"]

cm_df = pd.DataFrame(
    cm,
    index=[f"True: {c}" for c in class_names],
    columns=[f"Pred: {c}" for c in class_names]
)

print(f"Test F1 Macro: {f1}")
print(f"Test Accuracy: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(cm_df)

AttributeError: 'list' object has no attribute 'shuffle'