##### Import Libs

In [1]:
import pandas as pd
from transformers import pipeline, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from datasets import load_dataset
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


##### Load the Dataset

In [2]:
# Load the CSV dataset
# Can't specify split because the dataset has not been divided to train and test yet
# So everything is loaded to "train"
dataset = load_dataset('csv', data_files='./datasets/youtoxic_english_1000.csv')

# Define a function to do some preprocessing - rename target and map values
# I'll rename the target column "IsToxic" to "labels".
def preprocess_data(data):
    # Rename columns    
    data['labels'] = data.pop('IsToxic')
    data['text'] = data.pop('Text')
    # Map true/false to 1/0
    data['labels'] = 1 if data['labels'] == True else 0
    return data

# Run the preprocessing
dataset = dataset.map(preprocess_data)

# Split the dataset into train-test
dict_train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=823)

# Access the train and test sets
dict_train = dict_train_test_split["train"]
dict_test = dict_train_test_split["test"]

##### (1) Use the Pretrained Network directly

##### Initialize the Tokenizer and the Model

In [3]:
# Can also do this
# Initialize tokenizer and model
model = "martin-ha/toxic-comment-model"
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model)  # Adjust num_labels as needed
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)

# Use pipeline(task="text-classification") for simplicity and ease of use, especially for standard tasks where you don't need much customization.
# Use TextClassificationPipeline when you need more control over the pipeline components or when you're working on a more complex or customized task.

  return self.fget.__get__(instance, owner)()


In [21]:
inputs_test = dict_test['text']
actual_labels = dict_test["labels"]
# Make predictions
predictions = classifier(inputs_test)
# Predicted labels are "non-toxic" and "toxic", so we still need to map that
label_mapping = {"non-toxic": 0, "toxic": 1}
predicted_labels = [label_mapping[pred['label']] for pred in predictions]

In [23]:
# Print classification report
print(classification_report(actual_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.71      0.85      0.78       118
           1       0.70      0.51      0.59        82

    accuracy                           0.71       200
   macro avg       0.71      0.68      0.68       200
weighted avg       0.71      0.71      0.70       200




##### (2) Perform Finetuning using the dataset

In [6]:
# Tokenize the dataset
# Here is where you specify the input column
def tokenize_function(input_string):
    return tokenizer(input_string['text'], padding='max_length', truncation=True, truncation_strategy='longest_first')

# Apply the tokenization function to your dataset
dict_train_tokenized = dict_train.map(tokenize_function, batched=True)
dict_test_tokenized = dict_test.map(tokenize_function, batched=True)

# Format dataset for PyTorch
# Here is where you specify the label/target column
dict_train_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
dict_test_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [7]:
# Hyperparams + other config
num_train_epochs = 10
per_device_train_batch_size = 16
per_device_eval_batch_size = 32
weight_decay = 0.01
learning_rate = 2e-5
logging_steps = 10
warmup_steps = 50 # Calculated using total_train_steps = len(dict_train_tokenized) // per_device_train_batch_size*num_train_epochs

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',                                    # Directory to save results
    num_train_epochs=num_train_epochs,                         # Number of training epochs
    per_device_train_batch_size=per_device_train_batch_size,   # Batch size per device during training
    per_device_eval_batch_size=per_device_eval_batch_size,     # Batch size per device during evaluation
    warmup_steps=warmup_steps,                                 # Number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                                 # Strength of weight decay
    learning_rate=learning_rate,                               # Learning Rate
    logging_dir='./logs',                                      # Directory for storing logs
    logging_steps=logging_steps,                               # Frequency of logging steps
    evaluation_strategy="epoch",
    save_strategy="epoch",                                  
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [8]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                          # The model to train
    args=training_args,                   # Training arguments
    train_dataset=dict_train_tokenized,   # The dataset to train on
)

In [9]:
# Start training
trainer.train()

ClearML Task: overwriting (reusing) task id=9e614809235d475da9b542269ae68b6b
2024-08-20 00:03:30,488 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/a280739eaca04ea6b7be4b98665e026b/experiments/9e614809235d475da9b542269ae68b6b/output/log


  2%|▏         | 10/500 [00:31<26:29,  3.24s/it]

{'loss': 0.9185, 'grad_norm': 6.162717819213867, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.2}


  4%|▍         | 20/500 [01:04<25:55,  3.24s/it]

{'loss': 1.048, 'grad_norm': 81.02702331542969, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.4}


  6%|▌         | 30/500 [01:37<25:49,  3.30s/it]

{'loss': 0.7712, 'grad_norm': 16.85659408569336, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.6}


  8%|▊         | 40/500 [02:10<24:46,  3.23s/it]

{'loss': 0.967, 'grad_norm': 11.461485862731934, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.8}


 10%|█         | 50/500 [02:43<24:17,  3.24s/it]

{'loss': 1.0366, 'grad_norm': 11.549699783325195, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.0}


 12%|█▏        | 60/500 [03:16<23:57,  3.27s/it]

{'loss': 0.8333, 'grad_norm': 6.004233360290527, 'learning_rate': 2.4000000000000003e-06, 'epoch': 1.2}


 14%|█▍        | 70/500 [03:49<23:19,  3.26s/it]

{'loss': 0.8081, 'grad_norm': 4.98615026473999, 'learning_rate': 2.8000000000000003e-06, 'epoch': 1.4}


 16%|█▌        | 80/500 [04:22<23:17,  3.33s/it]

{'loss': 0.7315, 'grad_norm': 4.339579105377197, 'learning_rate': 3.2000000000000003e-06, 'epoch': 1.6}


 18%|█▊        | 90/500 [05:15<37:18,  5.46s/it]

{'loss': 0.6243, 'grad_norm': 6.921136379241943, 'learning_rate': 3.6000000000000003e-06, 'epoch': 1.8}


 20%|██        | 100/500 [06:20<38:36,  5.79s/it]

{'loss': 0.8834, 'grad_norm': 6.177294731140137, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.0}


 22%|██▏       | 110/500 [07:16<31:49,  4.90s/it]

{'loss': 0.7849, 'grad_norm': 24.161746978759766, 'learning_rate': 4.4e-06, 'epoch': 2.2}


 24%|██▍       | 120/500 [08:13<33:53,  5.35s/it]

{'loss': 0.6429, 'grad_norm': 12.405302047729492, 'learning_rate': 4.800000000000001e-06, 'epoch': 2.4}


 26%|██▌       | 130/500 [09:01<27:19,  4.43s/it]

{'loss': 0.5605, 'grad_norm': 8.486457824707031, 'learning_rate': 5.2e-06, 'epoch': 2.6}


 28%|██▊       | 140/500 [09:45<28:09,  4.69s/it]

{'loss': 0.9328, 'grad_norm': 3.4891419410705566, 'learning_rate': 5.600000000000001e-06, 'epoch': 2.8}


 30%|███       | 150/500 [10:39<30:50,  5.29s/it]

{'loss': 0.6456, 'grad_norm': 28.495981216430664, 'learning_rate': 6e-06, 'epoch': 3.0}


 32%|███▏      | 160/500 [11:22<24:21,  4.30s/it]

{'loss': 0.6024, 'grad_norm': 4.794069766998291, 'learning_rate': 6.4000000000000006e-06, 'epoch': 3.2}


 34%|███▍      | 170/500 [12:05<23:49,  4.33s/it]

{'loss': 0.5967, 'grad_norm': 12.013569831848145, 'learning_rate': 6.800000000000001e-06, 'epoch': 3.4}


 36%|███▌      | 180/500 [12:50<24:10,  4.53s/it]

{'loss': 0.7282, 'grad_norm': 3.8159830570220947, 'learning_rate': 7.2000000000000005e-06, 'epoch': 3.6}


 38%|███▊      | 190/500 [13:38<27:25,  5.31s/it]

{'loss': 0.6477, 'grad_norm': 10.386427879333496, 'learning_rate': 7.600000000000001e-06, 'epoch': 3.8}


 40%|████      | 200/500 [14:48<32:33,  6.51s/it]

{'loss': 0.7304, 'grad_norm': 3.244467258453369, 'learning_rate': 8.000000000000001e-06, 'epoch': 4.0}


 42%|████▏     | 210/500 [15:59<25:10,  5.21s/it]

{'loss': 0.6677, 'grad_norm': 23.67951202392578, 'learning_rate': 8.400000000000001e-06, 'epoch': 4.2}


 44%|████▍     | 220/500 [16:46<20:34,  4.41s/it]

{'loss': 0.6494, 'grad_norm': 7.906527042388916, 'learning_rate': 8.8e-06, 'epoch': 4.4}


 46%|████▌     | 230/500 [17:33<19:31,  4.34s/it]

{'loss': 0.6297, 'grad_norm': 21.177961349487305, 'learning_rate': 9.200000000000002e-06, 'epoch': 4.6}


 48%|████▊     | 240/500 [18:20<18:54,  4.37s/it]

{'loss': 0.6983, 'grad_norm': 8.328411102294922, 'learning_rate': 9.600000000000001e-06, 'epoch': 4.8}


 50%|█████     | 250/500 [19:17<25:24,  6.10s/it]

{'loss': 0.6184, 'grad_norm': 8.201739311218262, 'learning_rate': 1e-05, 'epoch': 5.0}


 52%|█████▏    | 260/500 [20:17<18:29,  4.62s/it]

{'loss': 0.6683, 'grad_norm': 2.641996383666992, 'learning_rate': 1.04e-05, 'epoch': 5.2}


 54%|█████▍    | 270/500 [21:04<16:52,  4.40s/it]

{'loss': 0.6495, 'grad_norm': 2.9299604892730713, 'learning_rate': 1.0800000000000002e-05, 'epoch': 5.4}


 56%|█████▌    | 280/500 [21:52<16:05,  4.39s/it]

{'loss': 0.6314, 'grad_norm': 4.33413028717041, 'learning_rate': 1.1200000000000001e-05, 'epoch': 5.6}


 58%|█████▊    | 290/500 [22:39<15:35,  4.45s/it]

{'loss': 0.6417, 'grad_norm': 3.2344563007354736, 'learning_rate': 1.16e-05, 'epoch': 5.8}


 60%|██████    | 300/500 [23:27<14:39,  4.40s/it]

{'loss': 0.6755, 'grad_norm': 4.049530506134033, 'learning_rate': 1.2e-05, 'epoch': 6.0}


 62%|██████▏   | 310/500 [24:14<13:53,  4.39s/it]

{'loss': 0.6717, 'grad_norm': 2.085095167160034, 'learning_rate': 1.2400000000000002e-05, 'epoch': 6.2}


 64%|██████▍   | 320/500 [25:02<13:11,  4.40s/it]

{'loss': 0.6732, 'grad_norm': 2.6234018802642822, 'learning_rate': 1.2800000000000001e-05, 'epoch': 6.4}


 66%|██████▌   | 330/500 [25:49<12:30,  4.41s/it]

{'loss': 0.5952, 'grad_norm': 6.524759292602539, 'learning_rate': 1.3200000000000002e-05, 'epoch': 6.6}


 68%|██████▊   | 340/500 [26:37<11:54,  4.47s/it]

{'loss': 0.6101, 'grad_norm': 2.907574415206909, 'learning_rate': 1.3600000000000002e-05, 'epoch': 6.8}


 70%|███████   | 350/500 [27:24<11:05,  4.44s/it]

{'loss': 0.5713, 'grad_norm': 5.159304618835449, 'learning_rate': 1.4e-05, 'epoch': 7.0}


 72%|███████▏  | 360/500 [28:12<10:14,  4.39s/it]

{'loss': 0.566, 'grad_norm': 2.900050640106201, 'learning_rate': 1.4400000000000001e-05, 'epoch': 7.2}


 74%|███████▍  | 370/500 [28:59<09:31,  4.39s/it]

{'loss': 0.5808, 'grad_norm': 291.24755859375, 'learning_rate': 1.48e-05, 'epoch': 7.4}


 76%|███████▌  | 380/500 [29:47<08:49,  4.42s/it]

{'loss': 0.6867, 'grad_norm': 4.5637006759643555, 'learning_rate': 1.5200000000000002e-05, 'epoch': 7.6}


 78%|███████▊  | 390/500 [30:35<08:05,  4.41s/it]

{'loss': 0.5817, 'grad_norm': 44.851383209228516, 'learning_rate': 1.5600000000000003e-05, 'epoch': 7.8}


 80%|████████  | 400/500 [31:22<07:21,  4.41s/it]

{'loss': 0.5786, 'grad_norm': 4.8144941329956055, 'learning_rate': 1.6000000000000003e-05, 'epoch': 8.0}


 82%|████████▏ | 410/500 [32:09<06:35,  4.39s/it]

{'loss': 0.5396, 'grad_norm': 4.329478740692139, 'learning_rate': 1.64e-05, 'epoch': 8.2}


 84%|████████▍ | 420/500 [32:57<05:57,  4.47s/it]

{'loss': 0.4975, 'grad_norm': 4.1913323402404785, 'learning_rate': 1.6800000000000002e-05, 'epoch': 8.4}


 86%|████████▌ | 430/500 [33:44<05:08,  4.41s/it]

{'loss': 0.5407, 'grad_norm': 5.048719882965088, 'learning_rate': 1.72e-05, 'epoch': 8.6}


 88%|████████▊ | 440/500 [34:32<04:25,  4.43s/it]

{'loss': 0.5839, 'grad_norm': 4.498689651489258, 'learning_rate': 1.76e-05, 'epoch': 8.8}


 90%|█████████ | 450/500 [35:19<03:41,  4.42s/it]

{'loss': 0.4895, 'grad_norm': 4.1937785148620605, 'learning_rate': 1.8e-05, 'epoch': 9.0}


 92%|█████████▏| 460/500 [36:07<02:58,  4.47s/it]

{'loss': 0.46, 'grad_norm': 4.754560470581055, 'learning_rate': 1.8400000000000003e-05, 'epoch': 9.2}


 94%|█████████▍| 470/500 [36:55<02:14,  4.47s/it]

{'loss': 0.51, 'grad_norm': 2.0144052505493164, 'learning_rate': 1.88e-05, 'epoch': 9.4}


 96%|█████████▌| 480/500 [37:42<01:28,  4.43s/it]

{'loss': 0.4435, 'grad_norm': 10.013379096984863, 'learning_rate': 1.9200000000000003e-05, 'epoch': 9.6}


 98%|█████████▊| 490/500 [38:30<00:43,  4.38s/it]

{'loss': 0.5035, 'grad_norm': 4.9212188720703125, 'learning_rate': 1.9600000000000002e-05, 'epoch': 9.8}


100%|██████████| 500/500 [39:17<00:00,  4.39s/it]

{'loss': 0.4915, 'grad_norm': 4.11275577545166, 'learning_rate': 0.0, 'epoch': 10.0}
2024-08-20 00:43:33,099 - clearml.storage - INFO - Starting upload: C:\Users\djbac\AppData\Local\Temp\model_package.mcwtobc7.zip => https://files.clear.ml/HuggingFace Transformers/Trainer.9e614809235d475da9b542269ae68b6b/models/checkpoint-500.zip


100%|██████████| 500/500 [39:39<00:00,  4.39s/it]

{'train_runtime': 2417.5998, 'train_samples_per_second': 3.309, 'train_steps_per_second': 0.207, 'train_loss': 0.6639747095108032, 'epoch': 10.0}


100%|██████████| 500/500 [1:24:20<00:00, 10.12s/it]


TrainOutput(global_step=500, training_loss=0.6639747095108032, metrics={'train_runtime': 2417.5998, 'train_samples_per_second': 3.309, 'train_steps_per_second': 0.207, 'total_flos': 1059739189248000.0, 'train_loss': 0.6639747095108032, 'epoch': 10.0})

In [10]:
# Evaluate the model on the test set
eval_results = trainer.evaluate(eval_dataset=dict_test_tokenized)
print(f"Evaluation results: {eval_results}")

100%|██████████| 7/7 [00:38<00:00,  5.54s/it]

Evaluation results: {'eval_loss': 0.6198015809059143, 'eval_runtime': 47.3297, 'eval_samples_per_second': 4.226, 'eval_steps_per_second': 0.148, 'epoch': 10.0}





In [11]:
# Make predictions on the test set
predictions, labels, _ = trainer.predict(test_dataset=dict_test_tokenized)
# Convert predictions to labels
predicted_labels = predictions.argmax(axis=-1)

100%|██████████| 7/7 [00:38<00:00,  5.48s/it]


In [12]:
from sklearn.metrics import classification_report
print(classification_report(labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.84      0.66      0.74       118
           1       0.63      0.82      0.71        82

    accuracy                           0.72       200
   macro avg       0.73      0.74      0.72       200
weighted avg       0.75      0.72      0.73       200



In [197]:
predicted_labels

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0], dtype=int64)

#### Load the Dataset

In [167]:
df = pd.read_csv("datasets\mtsamples.csv", index_col=0)

In [168]:
# Filtering to 12 classes for simplicity
list_med_sp = [
    " Allergy / Immunology",
    " Bariatrics",
    " Cardiovascular / Pulmonary",
    " Urology",
    " Dentistry",
    " Rheumatology",
    " Radiology",
    " Psychiatry / Psychology",
    " Podiatry",
    " Orthopedic",
    " Opthalmology",
    " Neurology"
]
# Filter the DataFrame
df = df[df['medical_specialty'].isin(list_med_sp)]

# df_stratified, _ = train_test_split(df, train_size=0.03, stratify=df['medical_specialty'])

#### Specify the candidate labels for the ZSC

In [169]:
candidate_labels = df["medical_specialty"].unique().tolist()

#### Create the Classifier

In [170]:
classifier = (
    pipeline(task="zero-shot-classification",
             model="tasksource/deberta-small-long-nli"
    )
)

In [171]:
# Function to classify a single description
def classify_description(description):
    result = classifier(description, candidate_labels)
    # Find the index of the maximum score
    max_index = result['scores'].index(max(result['scores']))
    # Return the label with the highest score
    return result['labels'][max_index], result['scores'][max_index]

#### Zero-shot

In [172]:
df[['zero_shot_class', 'score']] = (
    df['description']
    .swifter.progress_bar(enable=True)
    .apply(classify_description)  # Apply function directly
    .apply(pd.Series)  # Convert tuple/list output into DataFrame
)

Pandas Apply: 100%|██████████| 1543/1543 [49:52<00:00,  1.94s/it]


In [174]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Assuming 'y_true' is the actual label and 'y_pred' is the predicted label
y_true = df['medical_specialty']
y_pred = df['zero_shot_class']

# Get accuracy score
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Get classification report
print("Classification Report:")
print(classification_report(y_true, y_pred))

# Get confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))


Accuracy: 0.53
Classification Report:
                             precision    recall  f1-score   support

       Allergy / Immunology       0.16      1.00      0.27         7
                 Bariatrics       0.75      0.83      0.79        18
 Cardiovascular / Pulmonary       0.81      0.23      0.36       372
                  Dentistry       0.76      0.81      0.79        27
                  Neurology       0.71      0.31      0.44       223
                 Orthopedic       0.71      0.74      0.72       355
                   Podiatry       0.55      0.38      0.45        47
    Psychiatry / Psychology       0.78      0.74      0.76        53
                  Radiology       0.31      0.76      0.44       273
               Rheumatology       0.18      0.40      0.25        10
                    Urology       0.93      0.59      0.73       158

                   accuracy                           0.53      1543
                  macro avg       0.60      0.62      0.54     