In [None]:
%pip install argilla datasets scikit-learn cleanlab -qqq

In [None]:
import argilla as rg
import pandas as pd

In [None]:
# Connection settings are read from the environment so that no URL or API key is
# saved inside the notebook.
#   ARGILLA_API_URL: URL of the Argilla server (your HF Spaces URL if using Spaces)
#   ARGILLA_API_KEY: the API key, if you configured a custom one
# When a variable is unset, an empty string is passed.
import os

rg.init(
    api_url=os.environ.get("ARGILLA_API_URL", ""),
    api_key=os.environ.get("ARGILLA_API_KEY", ""),
)

In [None]:
from datasets import load_dataset,DatasetDict,Dataset

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from argilla.labeling.text_classification import find_label_errors


In [None]:
# Read the annotation spreadsheet, discard the two columns that are not used below,
# and expose the "new_demand_text" column under the name "label".
data = (
    pd.read_excel("../../../../nfs/aishare/Data/Intent/raw/Label-Arbeiten/Tina-Teil-1-kürzen.xlsx")
    .drop(columns=["NEU?", "demand_text"])
    .rename(columns={"new_demand_text": "label"})
)
data.head()

In [None]:
# keep only labels with more than 10 samples (labels with 10 or fewer rows are dropped)
# NOTE(review): the counts are taken on the raw label strings, i.e. before the
# lower-casing and "?" stripping done in a later cell — confirm this order is intended.
data = data.groupby("label").filter(lambda x: len(x) > 10)
data.head()

In [None]:
# Normalise the label strings: lower-case them and strip question marks.
# regex=False makes "?" a literal character. On its own "?" is not a valid regular
# expression, and the default value of `regex` differs between pandas versions.
data["label"] = data["label"].str.lower().str.replace("?", "", regex=False)

In [None]:
# Summarise the label inventory: rows per label and the distinct label values.
label_count = data["label"].value_counts()
unique_labels = data["label"].unique()
print("Unique labels: ", unique_labels)
print("Label count: ", label_count)


In [None]:
# show the current frame as the cell output
data

In [None]:
# keep only labels that occur more than once, using the counts in `label_count`
# NOTE(review): an earlier cell already requires more than 10 rows per label, so this
# filter presumably removes nothing — confirm whether the threshold should be 10 here.
data = data[data["label"].isin(label_count[label_count>1].index)]
data

In [None]:
# Keep the label string as "label_name" and store an integer id for it in "label".
# rename() returns a new frame, so the column assignment below does not write into a
# slice of an earlier DataFrame (which triggers SettingWithCopyWarning).
data = data.rename(columns={"label": "label_name"})
# Build the mapping from the labels that are actually present in `data`, in order of
# first appearance, so the ids are contiguous (0..n-1) even if rows were filtered out
# after `unique_labels` was computed.
label2id = {label: idx for idx, label in enumerate(data["label_name"].unique())}
data["label"] = data["label_name"].map(label2id)
data

In [None]:
# Repair garbled German characters in the text column so umlauts display correctly.
# Each key is the garbled sequence found in the data, each value its replacement.
umlaut_fixes = {
    "Ã¼": "ü",
    "Ã¤": "ä",
    "Ã¶": "ö",
    "ÃŸ": "ß",
    "Ã©": "é",
}
for garbled, fixed in umlaut_fixes.items():
    data["text"] = data["text"].str.replace(garbled, fixed)

data


In [None]:
# write the cleaned frame to data.csv in the working directory, without the index column
data.to_csv("data.csv", index=False)

In [None]:
# turn pandas df to huggingface dataset
# The frame was filtered, so its index is no longer the default range index;
# preserve_index=False keeps that index from being stored as an extra
# "__index_level_0__" column in the dataset.
dataset = Dataset.from_pandas(data, preserve_index=False)
dataset

In [None]:
# split dataset into train and test set (70/30, matching test_size=0.3);
# the fixed seed makes the split reproducible across kernel restarts
ds = dataset.train_test_split(test_size=0.3, seed=42)
ds_train = ds["train"]
ds_test = ds["test"]




In [None]:
# upload the train/test split to the Hugging Face Hub
# NOTE(review): a later cell loads "fathyshalab/reklambox" (no "-1" suffix) — confirm
# which repository is the intended one.
ds.push_to_hub("fathyshalab/reklambox-1")

In [None]:
from transformers import set_seed
from datasets import load_dataset
from sentence_transformers.losses import (
    CosineSimilarityLoss,
    ContrastiveLoss,
    BatchAllTripletLoss,
    BatchHardTripletLoss,
)
from setfit import SetFitModel, SetFitTrainer, DistillationSetFitTrainer, sample_dataset
import os
from sklearn.metrics import f1_score, roc_auc_score
import json
from tqdm import tqdm
from sklearn.metrics import balanced_accuracy_score, f1_score, accuracy_score, roc_auc_score
import plotly.express as px


In [None]:
# fetch the dataset from the Hub and bind its two splits
ds = load_dataset("fathyshalab/reklambox")
ds_train, ds_test = ds["train"], ds["test"]


In [None]:
# add a feature called sentence_length: the number of whitespace-separated words in "text"
def _sentence_length(example):
    """Return the new column value for one example."""
    return {"sentence_length": len(example["text"].split())}


ds_train = ds_train.map(_sentence_length)
ds_test = ds_test.map(_sentence_length)

In [None]:
# box plot of the sentence-length distribution per label: train split first, then test
for _split in (ds_train, ds_test):
    fig = px.box(_split, x="label_name", y="sentence_length", color="label_name")
    fig.show()

In [None]:
# filter the dataset to keep only samples with fewer than 256 whitespace-separated words
# (this counts words produced by str.split(), not model tokens)
ds_train = ds_train.filter(lambda x: len(x["text"].split()) < 256)
ds_test = ds_test.filter(lambda x: len(x["text"].split()) < 256)

In [None]:
# show the train split summary after filtering
ds_train


In [None]:
def compute_metrics(y_pred, y_true):
    """Score predictions against gold labels.

    Args:
        y_pred: predicted labels.
        y_true: gold labels.

    Returns:
        A dict with the keys "f1" (macro-averaged F1), "balanced_accuracy"
        and "accuracy".
    """
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    balanced = balanced_accuracy_score(y_true, y_pred)
    plain = accuracy_score(y_true, y_pred)
    return {"f1": macro_f1, "balanced_accuracy": balanced, "accuracy": plain}

In [None]:
# For each split (train first, then test): convert to a dataframe, draw a histogram of
# words per text, sort the rows by label id and show the label distribution as a
# plotly histogram. Only the train figure carries a title.
for _split, _title in ((ds_train, "Label distribution in train set"), (ds_test, None)):
    df = _split.to_pandas()
    df["text"].str.split().apply(len).hist(bins=100)
    df.sort_values(by="label", inplace=True)
    fig = px.histogram(df, x="label_name", color="label_name", title=_title)
    fig.show()


# add sentence length to the dataset
ds_train = ds_train.map(lambda example: {"sentence_length": len(example["text"].split())})
ds_test = ds_test.map(lambda example: {"sentence_length": len(example["text"].split())})






In [None]:
# show the train split summary, now including the sentence_length column
ds_train

In [None]:
# plot the sentence length distribution per label in a box plot (train split,
# after the length filter applied above)
fig = px.box(ds_train, x="label_name", y="sentence_length", color="label_name")
fig.show()

In [None]:
# same box plot for the test split
fig = px.box(ds_test, x="label_name", y="sentence_length", color="label_name")
fig.show()

In [None]:
# put the filtered splits back into the dataset dict and upload it under a separate name
ds["train"] = ds_train
ds["test"] = ds_test
ds.push_to_hub("fathyshalab/reklambox-filtered")

In [None]:
# restrict CUDA to the device with index 1
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# few-shot training set: 8 examples per label, evaluated on the full test split
train_ds = sample_dataset(ds_train, label_column="label", num_samples=8)
test_ds = ds["test"]
# Load SetFit model from Hub
model = SetFitModel.from_pretrained("sentence-transformers/all-roberta-large-v1")
# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    loss_class=ContrastiveLoss,
    metric=compute_metrics,
    batch_size=16,
    num_epochs=10,
    use_amp=True,
    warmup_proportion=0.0,
)
trainer.train()
metrics = trainer.evaluate()
print(metrics)

In [None]:
# baseline classifier: bag-of-words token counts fed into a multinomial naive Bayes model
classifier = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("clf", MultinomialNB()),
    ]
)

# train it on the train split
classifier.fit(ds_train["text"], ds_train["label"])


In [None]:
# score the fitted classifier on the test split
classifier.score(ds_test["text"], ds_test["label"])

In [None]:
# get predicted probabilities for all labels, one row per example of the TEST split
probabilities = classifier.predict_proba(ds_test["text"])

In [None]:
# label ids known to the classifier; the columns of `probabilities` follow this order
classifier.classes_

In [None]:
# show the test split summary
ds_test

In [None]:
# Label names ordered by label id, so that unique_labels[i] is the name of id i.
# A plain .unique() returns the names in order of first appearance in the split,
# which does not line up with the integer ids this array is indexed with later on.
# NOTE(review): indexing by id assumes the ids are contiguous from 0 and that every
# id occurs in the train split — confirm.
unique_labels = (
    ds_train.to_pandas()[["label", "label_name"]]
    .drop_duplicates("label")
    .sort_values("label")["label_name"]
    .to_numpy()
)

unique_labels


In [None]:
# create records for the test set
# `probabilities` was computed on ds_test, so each probability row has to be paired
# with the ds_test example it belongs to (not with ds_train).
# The probability columns follow classifier.classes_, so the label names are looked
# up per class id instead of relying on the order of a separately built array.
id2name = dict(zip(ds_train["label"], ds_train["label_name"]))
class_names = [id2name[class_id] for class_id in classifier.classes_]

records = [
    rg.TextClassificationRecord(
        text=example["text"],
        prediction=list(zip(class_names, (float(p) for p in probs))),
        # the gold label name is stored on the example itself
        annotation=example["label_name"],
        metadata={"split": "test"},
    )
    for example, probs in zip(ds_test, probabilities)
]


In [None]:
# get records with potential label errors (uses each record's prediction and annotation)
records_with_label_error = find_label_errors(records)

In [None]:
# uncover label errors in the Argilla web app: log the flagged records to the
# "label_errors" dataset
rg.log(records_with_label_error, "label_errors")


In [None]:

# function to compute the loss example-wise
# NOTE(review): relies on `torch`, `tokenizer`, `model` and `device` being defined by
# cells that are not visible in this notebook — confirm they exist before running.
def loss_per_example(batch):
    """Add per-example predictions, class probabilities and loss to a batch.

    Args:
        batch: dict of columns with at least "text" (list of strings) and
            "label" (list of integer class ids).

    Returns:
        The same dict with three added columns, all as plain Python lists:
        "predicted_label" (argmax class id), "predicted_probas" (softmax over
        the classes, one list per example) and "loss" (unreduced cross-entropy).
    """
    encoded_input = tokenizer(
        batch["text"],
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    ).to(device)
    labels = torch.tensor(batch["label"], device=device)

    with torch.no_grad():
        logits = model(**encoded_input).logits
        batch["predicted_label"] = torch.argmax(logits, dim=1)
        # probabilities for logging into Argilla: normalise over the class axis
        # (the last one) so that each example's probabilities sum to 1
        batch["predicted_probas"] = torch.nn.functional.softmax(logits, dim=-1)
        # don't reduce the loss (return the loss for each example)
        batch["loss"] = torch.nn.functional.cross_entropy(logits, labels, reduction="none")

    # datasets complains about tensors / numpy dtypes, so hand back Python lists
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            batch[key] = value.cpu().tolist()

    return batch
# run the per-example loss over the encoded dataset in batches of 32
losses_ds = ds_enc.map(loss_per_example, batched=True, batch_size=32)

# show full cell contents when displaying dataframes
pd.set_option("display.max_colwidth", None)

# view the dataset as pandas and keep the columns needed for the error analysis
losses_ds.set_format("pandas")
loss_columns = ["label", "predicted_label", "loss", "predicted_probas"]
losses_df = losses_ds[:][loss_columns]

# re-attach the text column, then list the ten examples with the highest loss
losses_df["text"] = ds_enc["text"]
losses_df.sort_values("loss", ascending=False).head(10)


In [None]:


# NOTE(review): this rebinds `ds`, which earlier cells used for the reklambox dataset dict
ds = load_dataset("fathyshalab/mdsci", split="test")  # only for getting the label names


In [None]:
# distinct label names of the loaded split, in order of first appearance
# NOTE(review): make_record below reads `unique_labels`, not `label_names` — confirm
# which lookup is intended.
label_names = ds.to_pandas()["label_name"].unique()
label_names

In [None]:
# creates a Text classification record for logging into Argilla
def make_record(row):
    """Build one Argilla text-classification record from a row of the loss table.

    Args:
        row: object with the attributes text, label, predicted_label,
            predicted_probas and loss.

    Returns:
        An rg.TextClassificationRecord holding the gold label as annotation, the
        predicted label with its probability as prediction, and the loss as metadata.
    """
    # both class ids are resolved to names through the module-level `unique_labels`
    gold_name = unique_labels[row.label]
    predicted_name = unique_labels[row.predicted_label]
    predicted_proba = row.predicted_probas[row.predicted_label]

    return rg.TextClassificationRecord(
        text=row.text,
        # the "gold" label of the original dataset
        annotation=[gold_name],
        # the prediction together with its probability
        prediction=[(predicted_name, predicted_proba)],
        # metadata fields can be used for sorting and filtering; here it is the loss
        metadata={"loss": row.loss},
        # who makes the prediction
        prediction_agent="fathyshalab/autotrain-reklambox-3527295358",
        # source of the gold label
        annotation_agent="fathyshalab/reklambox",
    )

# keep the 499 highest-loss rows; drop the .head() call to log the full dataset
top_losses = losses_df.sort_values("loss", ascending=False).head(499)

# one Argilla record per row
records = top_losses.apply(make_record, axis=1)

rg.log(records, name="reklambox_error_analysis")

In [None]:
# load the records annotated by the "argilla" user as a dataframe
# NOTE(review): the records above were logged to "reklambox_error_analysis", while this
# loads "mdcsi" — confirm the dataset name.
dataset = rg.load("mdcsi", query="annotated_by:argilla").to_pandas()

# let's do some transformations before uploading the dataset
# pull the loss out of each record's metadata dict into its own column
dataset["loss"] = dataset.metadata.transform(lambda r: r["loss"])
dataset = dataset.rename(columns={"annotation": "corrected_label"})

dataset.head()

In [None]:
# let's add the original dataset labels to share them together with the corrected ones
# we sort our corrected dataset by descending loss, the same order as `top_losses`
dataset = dataset.sort_values("loss", ascending=False)

# we add original labels in string form
# `unique_labels` is the id -> name lookup that make_record used when the records were
# logged, so it reproduces exactly the label names that were sent to Argilla. (The
# distinct corrected labels in order of appearance are not an id -> name mapping.)
id2label = list(unique_labels)
# NOTE(review): assumes `dataset` holds exactly the 50 highest-loss records, in the same
# order as top_losses — confirm, otherwise the assignment below fails or misaligns.
original_labels = [id2label[i] for i in top_losses[0:50].label.values]
dataset["original_label"] = original_labels

In [None]:
# these feature types are not imported anywhere else in the notebook
from datasets import ClassLabel, Features, Value

ds = dataset[["text", "corrected_label", "original_label"]].to_dict(orient="list")

# Both label columns share one class list. It starts with the corrected labels in
# order of appearance and is extended with any original label that never occurs as a
# corrected label, so that every value in both columns can be encoded.
class_label_names = list(dataset.corrected_label.unique())
class_label_names += [
    name for name in dataset.original_label.unique() if name not in class_label_names
]

hf_ds = Dataset.from_dict(
    ds,
    features=Features(
        {
            "text": Value("string"),
            "corrected_label": ClassLabel(names=class_label_names),
            "original_label": ClassLabel(names=class_label_names),
        }
    ),
)