##### Import Libs

In [54]:
import pandas as pd
import swifter
from transformers import pipeline, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

##### Load the Dataset

In [55]:
# Load the CSV dataset
# Can't specify split because the dataset has not been divided to train and test yet
# So everything is loaded to "train"
dataset = load_dataset('csv', data_files='./datasets/train-balanced-sarcasm.csv')

# Split the dataset into train-test
dict_train_test_split = dataset["train"].train_test_split(test_size=0.2)

# Access the train and test sets
dict_train = dict_train_test_split["train"]
dict_test = dict_train_test_split["test"]

##### Initialize the Tokenizer and the Model

In [40]:
# Can also do this
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('martin-ha/toxic-comment-model')
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Adjust num_labels as needed

In [41]:
# Tokenize the dataset
# Here is where you specify the input column
def tokenize_function(input_string):
    return tokenizer(input_string['Text'], padding='max_length', truncation=True)

# Apply the tokenization function to your dataset
dict_train_tokenized = dict_train.map(tokenize_function, batched=True)

# Format dataset for PyTorch
# Here is where you specify the label/target column
dict_train_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'target'])



Map: 100%|██████████| 800/800 [00:00<00:00, 2687.23 examples/s]


In [42]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',              # Directory to save results
    num_train_epochs=3,                  # Number of training epochs
    per_device_train_batch_size=16,      # Batch size per device during training
    per_device_eval_batch_size=64,       # Batch size per device during evaluation
    warmup_steps=500,                   # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                  # Strength of weight decay
    logging_dir='./logs',               # Directory for storing logs
    logging_steps=10,                   # Frequency of logging steps
)

In [43]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                          # The model to train
    args=training_args,                   # Training arguments
    train_dataset=dict_train_tokenized,        # The dataset to train on
)

In [44]:
# Start training
trainer.train()





ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.

#### Load the Dataset

In [167]:
df = pd.read_csv("datasets\mtsamples.csv", index_col=0)

In [168]:
# Filtering to 12 classes for simplicity
list_med_sp = [
    " Allergy / Immunology",
    " Bariatrics",
    " Cardiovascular / Pulmonary",
    " Urology",
    " Dentistry",
    " Rheumatology",
    " Radiology",
    " Psychiatry / Psychology",
    " Podiatry",
    " Orthopedic",
    " Opthalmology",
    " Neurology"
]
# Filter the DataFrame
df = df[df['medical_specialty'].isin(list_med_sp)]

# df_stratified, _ = train_test_split(df, train_size=0.03, stratify=df['medical_specialty'])

#### Specify the candidate labels for the ZSC

In [169]:
candidate_labels = df["medical_specialty"].unique().tolist()

#### Create the Classifier

In [170]:
classifier = (
    pipeline(task="zero-shot-classification",
             model="tasksource/deberta-small-long-nli"
    )
)

In [171]:
# Function to classify a single description
def classify_description(description):
    result = classifier(description, candidate_labels)
    # Find the index of the maximum score
    max_index = result['scores'].index(max(result['scores']))
    # Return the label with the highest score
    return result['labels'][max_index], result['scores'][max_index]

#### Zero-shot

In [172]:
df[['zero_shot_class', 'score']] = (
    df['description']
    .swifter.progress_bar(enable=True)
    .apply(classify_description)  # Apply function directly
    .apply(pd.Series)  # Convert tuple/list output into DataFrame
)

Pandas Apply: 100%|██████████| 1543/1543 [49:52<00:00,  1.94s/it]


In [174]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Assuming 'y_true' is the actual label and 'y_pred' is the predicted label
y_true = df['medical_specialty']
y_pred = df['zero_shot_class']

# Get accuracy score
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Get classification report
print("Classification Report:")
print(classification_report(y_true, y_pred))

# Get confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))


Accuracy: 0.53
Classification Report:
                             precision    recall  f1-score   support

       Allergy / Immunology       0.16      1.00      0.27         7
                 Bariatrics       0.75      0.83      0.79        18
 Cardiovascular / Pulmonary       0.81      0.23      0.36       372
                  Dentistry       0.76      0.81      0.79        27
                  Neurology       0.71      0.31      0.44       223
                 Orthopedic       0.71      0.74      0.72       355
                   Podiatry       0.55      0.38      0.45        47
    Psychiatry / Psychology       0.78      0.74      0.76        53
                  Radiology       0.31      0.76      0.44       273
               Rheumatology       0.18      0.40      0.25        10
                    Urology       0.93      0.59      0.73       158

                   accuracy                           0.53      1543
                  macro avg       0.60      0.62      0.54     