In [None]:
!pip install transformers sentencepiece datasets

In [None]:
from datasets import load_dataset, ClassLabel

# Load the CSV; the "csv" builder exposes the whole file as the "train" split.
ds = load_dataset("csv", data_files="tos.csv")["train"]

# Distinct values of the 'label' column; their position in this list is the
# integer class id used for training and for decoding predictions later.
# NOTE(review): the previous comment called these "subreddits", which looks
# like a leftover from another notebook -- confirm what the labels are.
label_names = ds.unique('label')
labels = ClassLabel(
    num_classes=len(label_names),
    names=label_names,
)

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# (and therefore the reported metrics) is the same on every fresh run.
ds = ds.train_test_split(test_size=0.2, shuffle=True, seed=42)
ds

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; reused below so the tokenizer is loaded from the
# same name that is stored in `model_name`.
model_name = "distilbert-base-uncased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tok_func(x):
    """Tokenize a batch of rows and attach integer class ids.

    Args:
        x: a batch from ``ds.map(..., batched=True)``; the code reads its
           ``"text"`` and ``"label"`` columns.

    Returns:
        The tokenizer output for ``x["text"]`` with an added ``'label'`` entry
        holding ``labels.str2int(x['label'])``.
    """
    # No padding here: padding inside a batched map pads every example to the
    # longest text of the whole map batch and stores that padding in the
    # dataset. Padding is left to the data collator at batch-assembly time.
    # NOTE(review): this relies on the Trainer being given the tokenizer (as
    # the training cell does) so that it pads each batch dynamically.
    tok_x = tokenizer(x["text"], truncation=True)
    # Convert the label strings to the integer ids defined by the ClassLabel.
    tok_x['label'] = labels.str2int(x['label'])
    return tok_x


tok_ds = ds.map(tok_func, batched=True)

In [None]:
from datasets import load_metric
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Training hyperparameters: per-device batch size, number of epochs and
# learning rate.
bs, epochs, lr = 32, 10, 1e-5

# Accuracy metric object; its .compute(...) is called from compute_metrics.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: a pair that unpacks into ``(logits, reference labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

import torch  # local import: only needed to probe for a CUDA device below

# Mixed-precision (fp16) training was previously hard-coded to True, which
# makes TrainingArguments fail on a machine without a GPU. Enable it only when
# CUDA is actually available; behaviour on a CUDA machine is unchanged.
use_fp16 = torch.cuda.is_available()

args = TrainingArguments(
    'model',  # first positional argument, passed through unchanged
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=use_fp16,
    evaluation_strategy="epoch",  # run evaluation at the end of every epoch
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,  # evaluation uses twice the train batch size
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

# Classification model built from the same checkpoint name as the tokenizer,
# with one output per distinct label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label_names)
)

trainer = Trainer(
    model,
    args,
    train_dataset=tok_ds['train'],
    eval_dataset=tok_ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Trailing semicolon suppresses the notebook's display of the return value.
trainer.train();

In [None]:
from transformers import TextClassificationPipeline

import torch  # local import: only needed to choose the pipeline device

# device=0 (first GPU) was previously hard-coded, which fails on a CPU-only
# machine. Keep GPU 0 when CUDA is available, otherwise use -1 (CPU).
pipe_device = 0 if torch.cuda.is_available() else -1
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=pipe_device)

text = "We reserve the right to make changes to our site, policies, Service Terms, and these Conditions of Use at any time"

# Take the first result for the single input text.
top_prediction = pipe(text)[0]
# The class index is parsed from the part of the label after the last '_'.
# NOTE(review): presumably the labels look like "LABEL_<index>" because the
# model was created without explicit label names -- confirm.
class_index = int(top_prediction['label'].split('_')[-1])
# Map the index back to the original label string (displayed as cell output).
label_names[class_index]