##### Import Libs

In [1]:
import pandas as pd
from transformers import pipeline, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from datasets import load_dataset
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


##### Load the Dataset

In [2]:
# Load the CSV dataset
# Can't specify split because the dataset has not been divided to train and test yet
# So everything is loaded to "train"
dataset = load_dataset('csv', data_files='./datasets/youtoxic_english_1000.csv')

# Define a function to do some preprocessing - rename target and map values
# I'll rename the target column "IsToxic" to "labels".
def preprocess_data(data):
    # Rename columns    
    data['labels'] = data.pop('IsToxic')
    data['text'] = data.pop('Text')
    # Map true/false to 1/0
    data['labels'] = 1 if data['labels'] == True else 0
    return data

# Run the preprocessing
dataset = dataset.map(preprocess_data)

# Split the dataset into train-test
dict_train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=823)

# Access the train and test sets
dict_train = dict_train_test_split["train"]
dict_test = dict_train_test_split["test"]

Generating train split: 1000 examples [00:00, 17972.15 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3617.90 examples/s]


##### (1) Use the Pretrained Network directly

##### Initialize the Tokenizer and the Model

In [3]:
# Can also do this
# Initialize tokenizer and model
model = "martin-ha/toxic-comment-model"
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model)  # Adjust num_labels as needed
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

# Use pipeline(task="text-classification") for simplicity and ease of use, especially for standard tasks where you don't need much customization.
# Use TextClassificationPipeline when you need more control over the pipeline components or when you're working on a more complex or customized task.

  return self.fget.__get__(instance, owner)()


In [21]:
inputs_test = dict_test['text']
actual_labels = dict_test["labels"]
# Make predictions
predictions = classifier(inputs_test)
# Predicted labels are "non-toxic" and "toxic", so we still need to map that
label_mapping = {"non-toxic": 0, "toxic": 1}
predicted_labels = [label_mapping[pred['label']] for pred in predictions]

In [23]:
# Print classification report
print(classification_report(actual_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.71      0.85      0.78       118
           1       0.70      0.51      0.59        82

    accuracy                           0.71       200
   macro avg       0.71      0.68      0.68       200
weighted avg       0.71      0.71      0.70       200




##### (2) Perform Finetuning using the dataset

In [191]:
# Tokenize the dataset
# Here is where you specify the input column
def tokenize_function(input_string):
    return tokenizer(input_string['text'], padding='max_length', truncation=True, truncation_strategy='longest_first')

# Apply the tokenization function to your dataset
dict_train_tokenized = dict_train.map(tokenize_function, batched=True)
dict_test_tokenized = dict_test.map(tokenize_function, batched=True)

# Format dataset for PyTorch
# Here is where you specify the label/target column
dict_train_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dict_test_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map: 100%|██████████| 200/200 [00:00<00:00, 2347.95 examples/s]


In [192]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',              # Directory to save results
    num_train_epochs=5,                  # Number of training epochs
    per_device_train_batch_size=16,      # Batch size per device during training
    per_device_eval_batch_size=32,       # Batch size per device during evaluation
    warmup_steps=500,                   # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                  # Strength of weight decay
    logging_dir='./logs',               # Directory for storing logs
    logging_steps=10,                   # Frequency of logging steps
)

In [193]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                          # The model to train
    args=training_args,                   # Training arguments
    train_dataset=dict_train_tokenized,        # The dataset to train on
)

In [194]:
# Start training
trainer.train()

Could not read Jupyter Notebook: No module named 'nbconvert'


ClearML Task: overwriting (reusing) task id=9e614809235d475da9b542269ae68b6b
ClearML results page: https://app.clear.ml/projects/a280739eaca04ea6b7be4b98665e026b/experiments/9e614809235d475da9b542269ae68b6b/output/log


Parameters must be of builtin type (Transformers/accelerator_config[AcceleratorConfig])
  4%|▍         | 10/250 [00:50<20:30,  5.13s/it]

{'loss': 0.9165, 'grad_norm': 6.543761253356934, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.2}


  8%|▊         | 20/250 [01:41<19:31,  5.09s/it]

{'loss': 1.0359, 'grad_norm': 90.1664047241211, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.4}


 12%|█▏        | 30/250 [02:33<18:42,  5.10s/it]

{'loss': 0.7416, 'grad_norm': 26.681711196899414, 'learning_rate': 3e-06, 'epoch': 0.6}


 16%|█▌        | 40/250 [03:20<16:33,  4.73s/it]

{'loss': 0.8805, 'grad_norm': 14.477645874023438, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}


 20%|██        | 50/250 [04:08<15:44,  4.72s/it]

{'loss': 0.8904, 'grad_norm': 12.889693260192871, 'learning_rate': 5e-06, 'epoch': 1.0}


 24%|██▍       | 60/250 [05:08<20:18,  6.41s/it]

{'loss': 0.7475, 'grad_norm': 4.488498210906982, 'learning_rate': 6e-06, 'epoch': 1.2}


 28%|██▊       | 70/250 [06:10<17:23,  5.80s/it]

{'loss': 0.6863, 'grad_norm': 12.406845092773438, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.4}


 32%|███▏      | 80/250 [07:11<15:19,  5.41s/it]

{'loss': 0.6392, 'grad_norm': 15.060066223144531, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.6}


 36%|███▌      | 90/250 [08:13<15:48,  5.93s/it]

{'loss': 0.5373, 'grad_norm': 34.87422561645508, 'learning_rate': 9e-06, 'epoch': 1.8}


 40%|████      | 100/250 [09:03<12:16,  4.91s/it]

{'loss': 0.7311, 'grad_norm': 140.96536254882812, 'learning_rate': 1e-05, 'epoch': 2.0}


 44%|████▍     | 110/250 [09:53<11:21,  4.87s/it]

{'loss': 0.6792, 'grad_norm': 40.50693130493164, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.2}


 48%|████▊     | 120/250 [10:42<10:32,  4.86s/it]

{'loss': 0.6006, 'grad_norm': 83.99149322509766, 'learning_rate': 1.2e-05, 'epoch': 2.4}


 52%|█████▏    | 130/250 [11:32<09:44,  4.87s/it]

{'loss': 0.5875, 'grad_norm': 4.254546165466309, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.6}


 56%|█████▌    | 140/250 [12:23<09:42,  5.29s/it]

{'loss': 0.7433, 'grad_norm': 1.8301067352294922, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.8}


 60%|██████    | 150/250 [13:23<08:53,  5.33s/it]

{'loss': 0.7007, 'grad_norm': 120.06061553955078, 'learning_rate': 1.5e-05, 'epoch': 3.0}


 64%|██████▍   | 160/250 [14:13<07:18,  4.87s/it]

{'loss': 0.6134, 'grad_norm': 24.287200927734375, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.2}


 68%|██████▊   | 170/250 [15:02<06:29,  4.87s/it]

{'loss': 0.5933, 'grad_norm': 16.255552291870117, 'learning_rate': 1.7000000000000003e-05, 'epoch': 3.4}


 72%|███████▏  | 180/250 [15:52<05:41,  4.87s/it]

{'loss': 0.6495, 'grad_norm': 186.526123046875, 'learning_rate': 1.8e-05, 'epoch': 3.6}


 76%|███████▌  | 190/250 [16:42<04:52,  4.87s/it]

{'loss': 0.6692, 'grad_norm': 25.667909622192383, 'learning_rate': 1.9e-05, 'epoch': 3.8}


 80%|████████  | 200/250 [17:32<04:03,  4.87s/it]

{'loss': 0.6484, 'grad_norm': 24.13962173461914, 'learning_rate': 2e-05, 'epoch': 4.0}


 84%|████████▍ | 210/250 [18:21<03:14,  4.86s/it]

{'loss': 0.6153, 'grad_norm': 8.585556983947754, 'learning_rate': 2.1e-05, 'epoch': 4.2}


 88%|████████▊ | 220/250 [19:11<02:25,  4.87s/it]

{'loss': 0.6443, 'grad_norm': 19.688703536987305, 'learning_rate': 2.2000000000000003e-05, 'epoch': 4.4}


 92%|█████████▏| 230/250 [20:01<01:37,  4.87s/it]

{'loss': 0.5958, 'grad_norm': 5.147881031036377, 'learning_rate': 2.3000000000000003e-05, 'epoch': 4.6}


 96%|█████████▌| 240/250 [20:50<00:48,  4.86s/it]

{'loss': 0.6791, 'grad_norm': 26.59487533569336, 'learning_rate': 2.4e-05, 'epoch': 4.8}


100%|██████████| 250/250 [21:40<00:00,  4.87s/it]

{'loss': 0.5958, 'grad_norm': 6.926318645477295, 'learning_rate': 2.5e-05, 'epoch': 5.0}
{'train_runtime': 1328.4035, 'train_samples_per_second': 3.011, 'train_steps_per_second': 0.188, 'train_loss': 0.6968718776702881, 'epoch': 5.0}


100%|██████████| 250/250 [21:45<00:00,  5.22s/it]


TrainOutput(global_step=250, training_loss=0.6968718776702881, metrics={'train_runtime': 1328.4035, 'train_samples_per_second': 3.011, 'train_steps_per_second': 0.188, 'total_flos': 529869594624000.0, 'train_loss': 0.6968718776702881, 'epoch': 5.0})

In [195]:
# Evaluate the model on the test set
eval_results = trainer.evaluate(eval_dataset=dict_test_tokenized)
print(f"Evaluation results: {eval_results}")

100%|██████████| 7/7 [00:26<00:00,  3.78s/it]

Evaluation results: {'eval_loss': 0.6941024661064148, 'eval_runtime': 26.4832, 'eval_samples_per_second': 7.552, 'eval_steps_per_second': 0.264, 'epoch': 5.0}





In [187]:
# Make predictions on the test set
predictions, labels, _ = trainer.predict(test_dataset=dict_test_tokenized)
# Convert predictions to labels (optional, depending on your task)
predicted_labels = predictions.argmax(axis=-1)

100%|██████████| 4/4 [00:50<00:00, 12.68s/it]


In [196]:
from sklearn.metrics import classification_report
print(classification_report(labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.67      0.80      0.73       118
           1       0.59      0.43      0.50        82

    accuracy                           0.65       200
   macro avg       0.63      0.61      0.61       200
weighted avg       0.64      0.65      0.63       200



In [197]:
predicted_labels

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0], dtype=int64)

#### Load the Dataset

In [167]:
df = pd.read_csv("datasets\mtsamples.csv", index_col=0)

In [168]:
# Filtering to 12 classes for simplicity
list_med_sp = [
    " Allergy / Immunology",
    " Bariatrics",
    " Cardiovascular / Pulmonary",
    " Urology",
    " Dentistry",
    " Rheumatology",
    " Radiology",
    " Psychiatry / Psychology",
    " Podiatry",
    " Orthopedic",
    " Opthalmology",
    " Neurology"
]
# Filter the DataFrame
df = df[df['medical_specialty'].isin(list_med_sp)]

# df_stratified, _ = train_test_split(df, train_size=0.03, stratify=df['medical_specialty'])

#### Specify the candidate labels for the ZSC

In [169]:
candidate_labels = df["medical_specialty"].unique().tolist()

#### Create the Classifier

In [170]:
classifier = (
    pipeline(task="zero-shot-classification",
             model="tasksource/deberta-small-long-nli"
    )
)

In [171]:
# Function to classify a single description
def classify_description(description):
    result = classifier(description, candidate_labels)
    # Find the index of the maximum score
    max_index = result['scores'].index(max(result['scores']))
    # Return the label with the highest score
    return result['labels'][max_index], result['scores'][max_index]

#### Zero-shot

In [172]:
df[['zero_shot_class', 'score']] = (
    df['description']
    .swifter.progress_bar(enable=True)
    .apply(classify_description)  # Apply function directly
    .apply(pd.Series)  # Convert tuple/list output into DataFrame
)

Pandas Apply: 100%|██████████| 1543/1543 [49:52<00:00,  1.94s/it]


In [174]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Assuming 'y_true' is the actual label and 'y_pred' is the predicted label
y_true = df['medical_specialty']
y_pred = df['zero_shot_class']

# Get accuracy score
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Get classification report
print("Classification Report:")
print(classification_report(y_true, y_pred))

# Get confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))


Accuracy: 0.53
Classification Report:
                             precision    recall  f1-score   support

       Allergy / Immunology       0.16      1.00      0.27         7
                 Bariatrics       0.75      0.83      0.79        18
 Cardiovascular / Pulmonary       0.81      0.23      0.36       372
                  Dentistry       0.76      0.81      0.79        27
                  Neurology       0.71      0.31      0.44       223
                 Orthopedic       0.71      0.74      0.72       355
                   Podiatry       0.55      0.38      0.45        47
    Psychiatry / Psychology       0.78      0.74      0.76        53
                  Radiology       0.31      0.76      0.44       273
               Rheumatology       0.18      0.40      0.25        10
                    Urology       0.93      0.59      0.73       158

                   accuracy                           0.53      1543
                  macro avg       0.60      0.62      0.54     