**Sign in to Hugging Face**

In [None]:
from huggingface_hub import notebook_login

# Sign in to Hugging Face from inside the notebook. This is needed because the
# training cell sets push_to_hub=True and the final cell calls
# trainer.push_to_hub().
notebook_login()

In [25]:
import pandas as pd 

# Load dataset

In [24]:
# Load the cleaned two-label dataset into a DataFrame.
# NOTE(review): the path is relative to the notebook's working directory.
# Later cells read a "text" column (tokenization) and a "label" column
# (num_labels) from this frame.
df = pd.read_csv("../data/clean/feedback_prize_2_labels.csv")

# Two-way mapping between integer class ids and label names; both dicts are
# handed to the model when it is loaded. The reverse map is derived from the
# forward one so the two cannot drift apart.
id2label = dict(enumerate(("CLAIM", "PREMISE")))
label2id = {name: idx for idx, name in id2label.items()}

In [9]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False: after the shuffled split the pandas index is no longer
# a plain 0..n-1 range, and from_pandas would otherwise store it as an extra
# "__index_level_0__" column in each dataset.
tds = Dataset.from_pandas(train, preserve_index=False)
vds = Dataset.from_pandas(test, preserve_index=False)

# Bundle both splits under the keys the training cell looks up.
ds = DatasetDict({"train": tds, "test": vds})

# Preprocess

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for "distilbert-base-uncased" — the same checkpoint name
# the model cell passes to AutoModelForSequenceClassification.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """
    Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    Truncation is switched on, so sequences longer than the tokenizer's
    maximum input length are cut down to fit.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

To apply the preprocessing function over the entire dataset, use the 🤗 Datasets `map` function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once.

In [None]:
# Run preprocess_function over every split in `ds`; with batched=True the
# function receives multiple examples per call instead of one at a time.
tokenized_ds = ds.map(preprocess_function, batched=True)

Now create a batch of examples using `DataCollatorWithPadding`. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch at collation time using the tokenizer, rather
# than padding the whole dataset up front; passed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on
# the predicted class ids and the reference labels.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the notebook-level accuracy metric.

    `eval_pred` unpacks into (predictions, labels). The index of the largest
    value along axis 1 of the predictions is taken as the predicted class and
    compared against the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Train

In [None]:
# Number of distinct values in the "label" column; passed to the model as
# num_labels in the next cell.
# NOTE(review): this is expected to equal len(id2label) (2) — confirm the CSV
# contains no other label values.
num_labels = len(df.label.unique())

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load "distilbert-base-uncased" for sequence classification, passing the
# number of labels and the id <-> name mappings defined earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels, id2label=id2label, label2id=label2id
)

In [None]:
# Hyperparameters and checkpointing policy for fine-tuning.
training_args = TrainingArguments(
    output_dir="text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluation and checkpoint saving are both set to once per epoch, used
    # together with load_best_model_at_end=True.
    # NOTE(review): newer transformers releases rename evaluation_strategy to
    # eval_strategy — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Requires being signed in to Hugging Face (see the sign-in cell at the
    # top of the notebook).
    push_to_hub=True,
)

# Wire the model, the tokenized train/test splits, the padding collator and
# the accuracy function into a Trainer.
# NOTE(review): metric_for_best_model is not set, so the "best" checkpoint for
# load_best_model_at_end is chosen by the library default (presumably eval
# loss rather than accuracy) — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Push the trainer's results to the Hugging Face Hub; requires the sign-in
# cell at the top of the notebook to have been run.
trainer.push_to_hub()