In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
!pip install pysentimiento



In [1]:
# %%

from pathlib import Path

import numpy as np
import pandas as pd
from datasets import concatenate_datasets, load_dataset
from sklearn.metrics import f1_score
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments,
                          pipeline)
from pysentimiento.preprocessing import preprocess_tweet

# Resolve every data / output location for both Kaggle and local runs.
# Names defined here: input_path, data_path, output_path, tmp_path, models_path.
if Path("/kaggle").exists():
    input_path = Path("/kaggle") / "input" / "racism"
    # The loading code reads the label files from `data_path`; it used to be
    # defined only in the local branch, which raised NameError on Kaggle.
    # Assumes the label files sit at the root of the Kaggle dataset - TODO confirm.
    data_path = input_path
    output_path = Path("/kaggle") / "working"
    tmp_path = output_path / "tmp"
    models_path = output_path / "artifacts"
else:
    data_path = Path("data")
    input_path = data_path / "split"
    tmp_path = data_path / "tmp"
    models_path = Path("models") / "artifacts"
    # Locally, models and other outputs share one directory.
    output_path = models_path

# Create the writable directories in both environments so later writes
# (intermediate CSVs, saved models, exported scores) cannot fail on a missing folder.
for directory in (tmp_path, models_path):
    directory.mkdir(exist_ok=True, parents=True)

def replace_quotes(x: pd.Series) -> pd.Series:
    """
    Replace every double-quoted span (quotes included) with the word "cita".

    This keeps a marker that something was quoted while dropping the quoted
    text itself.

    Parameters
    ----------
    x : pd.Series
        Series of strings.

    Returns
    -------
    pd.Series
        A new Series in which each `"..."` span is replaced by `cita`.
    """
    quoted_span = r'"([^"]*)"'
    # regex=True must be explicit: from pandas 2.0 the default is a literal
    # match, which would silently leave the text unchanged.
    return x.str.replace(quoted_span, "cita", regex=True)

def process_text(df):
    """
    Normalise the "message" column with pysentimiento's `preprocess_tweet`.

    The column is overwritten on the frame that is passed in, and that same
    frame is returned. Quote replacement (`replace_quotes`) is currently
    disabled.
    """
    cleaned_messages = df["message"].apply(preprocess_tweet)
    df["message"] = cleaned_messages
    return df

# Load the regression targets and the class labels (both pipe-delimited).
df = pd.read_csv(data_path / "labels_racism_regression_train.txt", sep="|")
df2 = pd.read_csv(data_path / "labels_racism.csv", sep="|")

# NOTE(review): the class label is attached by row index, so this presumably
# relies on both files listing the same rows in the same order - confirm.
df["clas_label"] = df2["label"]

# Split by unique message so the same text never lands in both splits:
# 60% of the distinct messages go to train, the rest to test.
messages = df["message"].unique()
np.random.seed(42)
n_train_messages = int(len(messages) * 0.6)
messages_train = np.random.choice(messages, size=n_train_messages, replace=False)

in_train = df["message"].isin(messages_train)
df_train = df.loc[in_train].reset_index(drop=True)
df_test = df.loc[~in_train].reset_index(drop=True)


# %% Preprocess data a bit
# Clean the text of each split, rename "message" to "text" (the column the
# tokenisation step reads), then write both splits to tmp_path as CSV.
text_column = {'message': 'text'}

train_df = process_text(df_train).rename(columns=text_column)
test_df = process_text(df_test).rename(columns=text_column)

train_df.to_csv(tmp_path / "labels_racism_train.csv", index=False)
test_df.to_csv(tmp_path / "labels_racism_test.csv", index=False)


In [5]:
# Sanity check: number of rows loaded from the regression label file.
len(df)

5565

In [None]:
def sigmoid(x):
    """
    Logistic function 1 / (1 + exp(-x)), evaluated elementwise.

    Computed as exp(-log(1 + exp(-x))) via `np.logaddexp`, so large negative
    inputs saturate to 0.0 instead of overflowing inside `np.exp`.

    Parameters
    ----------
    x : float or np.ndarray
        Input value(s).

    Returns
    -------
    float or np.ndarray
        Values in [0, 1] with the same shape as `x`.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0, -x))


# Checkpoint to fine-tune. Alternatives that were tried:
#   "pysentimiento/robertuito-sentiment-analysis"
#   "PlanTL-GOB-ES/roberta-large-bne"
MODEL_NAME = "PlanTL-GOB-ES/roberta-base-bne"


# %% Load tokenizer and train
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The collator pads each batch when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# A single output unit (num_labels=1); mismatched head sizes in the
# checkpoint are ignored rather than raising.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=1,
    ignore_mismatched_sizes=True,
)


def preprocess_function(examples):
    """
    Tokenise the "text" field of a batch of examples.

    Sequences are truncated and padded by the module-level `tokenizer`; the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding=True)


# %% Load ready for hf
# Map each split name to the CSV written to tmp_path earlier.
csv_splits = {
    'train': 'labels_racism_train.csv',
    'validation': 'labels_racism_test.csv',
}
dataset = load_dataset(path=str(tmp_path), data_files=csv_splits)


# %% Preprocess data
# Tokenise both splits in batches.
tokenized_data = dataset.map(preprocess_function, batched=True)


# %% Train model
# Fine-tuning hyper-parameters; Trainer output goes to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

train_split = tokenized_data["train"]
validation_split = tokenized_data["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

trainer.train()

# %% Predict on validation
preds = trainer.predict(validation_split)

In [None]:
# Score the training split as well; these predictions are used for threshold tuning.
train_split = tokenized_data["train"]
train_preds = trainer.predict(train_split)

In [None]:
def threshold_optimisation(preds, y, n_thresholds=200):
    """
    Find the decision threshold in [0, 1] that maximises the F1 score.

    `n_thresholds` evenly spaced cut-offs between 0 and 1 are tried; a sample
    is predicted positive when its score is strictly greater than the cut-off.

    Parameters
    ----------
    preds : array-like
        Predicted scores, one per sample.
    y : array-like
        Ground-truth labels, one per sample.
    n_thresholds : int, default 200
        Number of candidate thresholds to evaluate.

    Returns
    -------
    float
        The threshold with the highest F1; the lowest one on ties.

    Raises
    ------
    ValueError
        If `n_thresholds` is smaller than 1.
    """
    if n_thresholds < 1:
        raise ValueError("n_thresholds must be at least 1")

    scores = np.asarray(preds)
    y_true = np.asarray(y)
    thresholds = np.linspace(0, 1, n_thresholds)

    # Collect the scores in a list: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic anyway.
    f1_by_threshold = [f1_score(y_true, scores > cutoff) for cutoff in thresholds]

    best = int(np.argmax(f1_by_threshold))
    return float(thresholds[best])

In [None]:
# Tune the decision threshold on the training split.
y = train_df.clas_label

train_outputs = train_preds.predictions
if train_outputs.shape[-1] == 1:
    # The model is built with num_labels=1, so there is a single output
    # column; indexing column 1 would raise IndexError. Use the raw output.
    # Assumes the regression target is on a 0-1 scale, so the [0, 1] threshold
    # grid is meaningful - TODO confirm.
    y_preds = train_outputs.reshape(-1)
else:
    # Two-logit classifier: sigmoid of the logit difference is P(class 1).
    y_preds = sigmoid(train_outputs[:, 1] - train_outputs[:, 0])

opt_threshold = threshold_optimisation(y_preds, y)
opt_threshold

In [None]:
y_valid = preds.label_ids
y_valid_preds = sigmoid(preds.predictions[:, 1] - preds.predictions[:, 0])
opt_threshold_valid = threshold_optimisation(y_valid_preds, y_valid)
opt_threshold_valid

In [None]:
# Validation F1 at the threshold tuned on the training split.
valid_f1 = f1_score(preds.label_ids, y_valid_preds > opt_threshold)
print(valid_f1)

In [None]:
# Training F1 at the same threshold, for comparison with validation.
train_f1 = f1_score(train_preds.label_ids, y_preds > opt_threshold)
print(train_f1)

In [None]:
# Persist the fine-tuned model and its tokenizer in the artifacts directory.
artifacts_dir = models_path
trainer.save_model(artifacts_dir)
tokenizer.save_pretrained(artifacts_dir)


In [None]:
# Also write the model and tokenizer to the output directory.
export_dir = output_path
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
# Attach the validation scores to the labelled test rows and export them.
# NOTE(review): `assign` needs y_valid_preds to have one value per remaining
# row - confirm this file matches the validation split that was scored.
test_df_out = pd.read_csv(input_path / "labels_racism_test.txt", sep="|")
test_df_out = test_df_out.query("label != 'unknown'")
scored_validation = test_df_out.assign(racist_score=y_valid_preds)
scored_validation.to_csv(output_path / "hf_v1_validation.csv", index=False)

In [None]:
# %% Create pipeline
# Build an inference pipeline from the model and tokenizer saved in models_path.
model_dir = str(models_path)
p = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# %% Save pipeline
p.save_pretrained(models_path / "pipe")

In [None]:
# Retrain on all labelled data. Only the "train" and "validation" splits are
# loaded, so tokenized_data["full"] would raise KeyError; build the full set
# by concatenating the two splits unless a "full" split really exists.
if "full" in tokenized_data:
    full_train_dataset = tokenized_data["full"]
else:
    full_train_dataset = concatenate_datasets(
        [tokenized_data["train"], tokenized_data["validation"]]
    )

# NOTE(review): `model` has already been fine-tuned by the first Trainer, so
# this continues training instead of starting from the base checkpoint.
# The validation rows are now part of the training data, so evaluation on
# them no longer measures held-out performance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Write the retrained model and tokenizer to the output directory.
final_dir = output_path
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)
