In [None]:
!pip install setfit
import torch
import random
import numpy as np
from sentence_transformers.losses import CosineSimilarityLoss
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from setfit import SetFitTrainer,SetFitModel
import pyarrow as pa
import pandas as pd
from datasets import Dataset
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, StratifiedKFold
from statistics import mean 
from sklearn.metrics import classification_report
from sklearn import metrics

# Seed every random number generator the notebook relies on so that repeated
# runs draw the same samples.
np.random.seed(0)
torch.manual_seed(0)
random.seed(0)

# Run on the first CUDA device when one is visible, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [74]:
device

'cuda:0'

## DATASET

In [139]:
# Read the raw corpus: a header-less, semicolon-separated file whose two
# fields are named `sentence` and `label` here.
df = pd.read_csv("train.txt", delimiter=';', header=None, names=['sentence','label'])
# Keep a label-stratified 5% sample of the rows; the remaining 95% is discarded.
# `random_state` pins the sample so the split is repeatable.
df, _ = train_test_split(df, train_size=0.05,stratify=df['label'],random_state=20)

In [140]:
from sklearn.preprocessing import LabelEncoder
# Fit an encoder on the label column and store the resulting integer ids in a
# new `label_enc` column next to the original values.
labelencoder = LabelEncoder()
df['label_enc'] = labelencoder.fit_transform(df['label'])

In [142]:
# Swap the column roles in a single pass: the original label values move to
# `label_desc`, and the integer ids take over the `label` name.
df.rename(columns={'label': 'label_desc', 'label_enc': 'label'}, inplace=True)

In [143]:
df

Unnamed: 0,sentence,label_desc,label
10232,i get the feeling that the rest of yall are a ...,anger,0
4247,i am feeling like painting tonight and simply ...,joy,2
9184,i feel i need to be punished,sadness,4
8602,i worked very hard on holding my technique whe...,joy,2
8286,i feel it is important to support young people...,joy,2
...,...,...,...
2270,i really feel that im the least talented perso...,joy,2
2256,i plodded through this taking far too long but...,joy,2
5426,i feel so peaceful so i know i made the right ...,joy,2
15892,i feel lethargic i just feel blah but when i m...,sadness,4


In [144]:
# 60% of the sampled rows become the training set; the other 40% is held out
# under the name `eval_df`. Both parts keep the label proportions of `df`.
train_df , eval_df = train_test_split(df, train_size=0.6,stratify=df['label'],random_state=20)

In [146]:
# Split the held-out rows 60/40 into the final evaluation set and the test set
# (i.e. 24% and 16% of the sampled data), again stratified by label.
eval_df , test_df = train_test_split(eval_df, train_size=0.6,stratify=eval_df['label'],random_state=20)

## MODEL

In [150]:
def model_init(params):
    """Create a SetFit model from the paraphrase-mpnet-base-v2 checkpoint.

    Args:
        params: Keyword arguments forwarded to ``SetFitModel.from_pretrained``.
            Any falsy value (``None``, ``{}``) means "no extra arguments".

    Returns:
        The object returned by ``SetFitModel.from_pretrained``.
    """
    extra_kwargs = params if params else {}
    return SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-mpnet-base-v2",
        **extra_kwargs,
    )

def model_init_roberta(params):
    """Create a SetFit model from the all-roberta-large-v1 checkpoint.

    The checkpoint is referenced by its fully qualified Hub id, matching
    ``model_init``. The bare name ``"all-roberta-large-v1"`` only resolves
    through sentence-transformers' implicit organisation prefixing and makes
    Hub file lookups target a repository that does not exist.

    Args:
        params: Keyword arguments forwarded to ``SetFitModel.from_pretrained``.
            Any falsy value (``None``, ``{}``) means "no extra arguments".

    Returns:
        The object returned by ``SetFitModel.from_pretrained``.
    """
    params = params or {}
    return SetFitModel.from_pretrained("sentence-transformers/all-roberta-large-v1", **params)


def compute_metrics(labels,pred):
    """Summarise classification quality with accuracy and macro-averaged scores.

    Args:
        labels: Ground-truth class labels.
        pred: Predicted class labels, aligned with ``labels``.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``; the last three use ``average='macro'``.
    """
    scores = {"accuracy": accuracy_score(y_true=labels, y_pred=pred)}
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in macro_scorers:
        scores[metric_name] = scorer(y_true=labels, y_pred=pred, average='macro')
    return scores



# Hyperparameter Search (optional)

In [None]:
# Grid search over (num_iterations, batch_size) using 5-fold stratified
# cross-validation on the training split only. For every combination the
# per-fold metrics are averaged and stored, together with the combination
# itself, in `hyperparameter_search_results`.
skf = StratifiedKFold(n_splits=5)
iterations_test = [10,15,20,25]
batch_test = [4,8,16]

hyperparameter_search_results = []
for num_iterations in iterations_test:
    for batch_size in batch_test:
        list_of_results = []
        # The frame's columns are `sentence` (text) and `label` (integer id).
        for train_index, test_index in skf.split(train_df['sentence'], train_df['label']):

            fold_train_dataset = Dataset.from_pandas(train_df.iloc[train_index])
            fold_eval_df = train_df.iloc[test_index]
            trainer = SetFitTrainer(
                model_init=model_init,
                train_dataset=fold_train_dataset,
                loss_class=CosineSimilarityLoss,
                batch_size=batch_size,
                num_iterations=num_iterations, # The number of text pairs to generate for contrastive learning
                num_epochs=1, # The number of epochs to use for contrastive learning
                column_mapping={"sentence": "text", "label": "label"},
                seed=123,
            )
            trainer.train()

            preds = trainer.model.predict(list(fold_eval_df['sentence']))
            truth = list(fold_eval_df['label'])
            results = compute_metrics(truth,preds)
            list_of_results.append(results)

        # Record which combination these averages belong to; without it the
        # entries of the result list cannot be told apart.
        avg_results = {"num_iterations": num_iterations, "batch_size": batch_size}
        for key in list_of_results[0]:
            avg_results[key] = mean([d[key] for d in list_of_results])
        hyperparameter_search_results.append(avg_results)

# Test

In [151]:
# Convert the pandas splits into Hugging Face datasets through the documented
# constructor rather than the low-level Arrow-table one.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# If you ran the hyperparameter search, plug the best values in here.
trainer = SetFitTrainer(
    model_init=model_init_roberta,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=25, # The number of text pairs to generate for contrastive learning
    num_epochs=1, # The number of epochs to use for contrastive learning
    # Map the frame's column names onto the names passed to the trainer.
    column_mapping={"sentence": "text", "label": "label"},
    seed=123,
)
trainer.train()


config.json not found in HuggingFace Hub
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 24000
  Num epochs = 1
  Total optimization steps = 1500
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1500 [00:00<?, ?it/s]

In [152]:
# Score the fine-tuned model on the test split, which was not passed to the trainer.
truth = list(test_df['label'])
preds = trainer.model.predict(test_df['sentence'].tolist())
results = compute_metrics(truth, preds)
print(results)

{'accuracy': 0.796875, 'precision': 0.7117313610413739, 'recall': 0.7373554984533097, 'f1': 0.7166925920350576}


In [153]:
# `classification_report` is already imported in the first cell; the re-import
# is redundant but harmless.
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the test-set predictions computed above.
print(classification_report(truth,preds))

              precision    recall  f1-score   support

           0       0.85      0.65      0.73        17
           1       0.74      0.88      0.80        16
           2       0.88      0.84      0.86        43
           3       0.55      0.60      0.57        10
           4       0.89      0.86      0.88        37
           5       0.38      0.60      0.46         5

    accuracy                           0.80       128
   macro avg       0.71      0.74      0.72       128
weighted avg       0.81      0.80      0.80       128

