In [2]:
# !pip install transformers datasets

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report


In [5]:
# Read the raw table once, then pull out the free-text symptom column (model
# input) and the disease column (classification target) for the cells below.
data = pd.read_csv("dataset.csv")
symptoms, diseases = data['symptoms'], data['disease']

In [6]:
# Turn disease names into integer class ids; the fitted encoder is kept so the
# ids can be mapped back to names when reporting results.
label_encoder = LabelEncoder()
data['disease_encoded'] = label_encoder.fit_transform(diseases)

# First hold out 20% of all rows as the test split ...
temp_texts, test_texts, temp_labels, test_labels = train_test_split(
    symptoms,
    data['disease_encoded'],
    random_state=42,
    test_size=0.2,
)

# ... then carve 25% of the remainder off as the validation split
# (0.25 * 0.8 = 20% of the full data, leaving 60% for training).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    temp_texts,
    temp_labels,
    random_state=42,
    test_size=0.25,
)


In [7]:
def train_and_evaluate(model_name, train_texts, val_texts, train_labels, val_labels):
    """Fine-tune a sequence classifier on the training split and validate it.

    The number of output classes is taken from the module-level
    ``label_encoder``. ``"describeai/gemini"`` is loaded through the ``Auto*``
    classes; every other ``model_name`` is loaded with the GPT-2 tokenizer and
    classification head, with the EOS token reused as the padding token.

    Args:
        model_name: Checkpoint identifier; also used to build the results,
            logs and export directory names.
        train_texts: Iterable of training input strings.
        val_texts: Iterable of validation input strings.
        train_labels: Iterable of integer class ids aligned with ``train_texts``.
        val_labels: Iterable of integer class ids aligned with ``val_texts``.

    Returns:
        A ``(model, tokenizer)`` tuple. Both are also written to
        ``./{model_name}-disease-prediction``.
    """
    num_classes = len(label_encoder.classes_)

    if model_name != "describeai/gemini":
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # Reuse EOS as the padding token, on the tokenizer and on the model config.
        tokenizer.pad_token = tokenizer.eos_token
        model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
        model.config.pad_token_id = tokenizer.eos_token_id
    else:
        tokenizer = AutoTokenizer.from_pretrained("describeai/gemini")
        model = AutoModelForSequenceClassification.from_pretrained("describeai/gemini", num_labels=num_classes)

    def build_split(texts, labels):
        """Tokenize ``texts`` and bundle them with ``labels`` into a Dataset."""
        encoded = tokenizer(list(texts), truncation=True, padding=True, max_length=128)
        return Dataset.from_dict({
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': list(labels),
        })

    train_dataset = build_split(train_texts, train_labels)
    val_dataset = build_split(val_texts, val_labels)

    # Evaluate and checkpoint once per epoch; at the end, reload the
    # checkpoint with the lowest validation loss.
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        logging_dir=f'./logs/{model_name}',
        logging_steps=10,
        num_train_epochs=5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=100,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Evaluation Results for {model_name}:", eval_results)

    # Export model and tokenizer side by side in the same directory.
    export_dir = f'./{model_name}-disease-prediction'
    model.save_pretrained(export_dir)
    tokenizer.save_pretrained(export_dir)

    return model, tokenizer


In [8]:
def evaluate_model(model, tokenizer, model_name):
    """Score a fine-tuned classifier on the held-out test split and print metrics.

    Reads the module-level ``test_texts``, ``test_labels`` and
    ``label_encoder``. Prints the Trainer evaluation dict, weighted
    accuracy/precision/recall/F1, and a per-class classification report.

    Args:
        model: Sequence-classification model handed to ``Trainer``.
        tokenizer: Tokenizer used to encode ``test_texts``.
        model_name: Label used in printed messages and in the evaluation
            ``output_dir``.

    Returns:
        None. All results are printed.
    """
    test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)
    test_dataset = Dataset.from_dict({
        'input_ids': test_encodings['input_ids'],
        'attention_mask': test_encodings['attention_mask'],
        'labels': list(test_labels),
    })

    # Initialize Trainer for evaluation
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir=f'./results/{model_name}/eval',
            per_device_eval_batch_size=64
        )
    )

    # Evaluate the model
    eval_results = trainer.evaluate(eval_dataset=test_dataset)
    print(f"Evaluation Results for {model_name}:", eval_results)

    # Generate predictions
    raw_predictions = trainer.predict(test_dataset)
    logits = raw_predictions.predictions
    # When a model emits more than just logits, Trainer returns a tuple with
    # the logits first. Seq2seq classifiers are expected to do this, e.g. the
    # T5 head this notebook loads for "describeai/gemini". Unwrap it so argmax
    # always runs on the (n_samples, n_classes) logits array.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    predictions = np.argmax(logits, axis=1)

    true_labels = np.asarray(test_labels)

    # Custom metrics
    # zero_division=0 yields the same values as the default, but without an
    # UndefinedMetricWarning for every class that is never predicted or never
    # present in this split.
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average="weighted", zero_division=0
    )
    print(f"Metrics for {model_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

    # Classification report
    # Restrict the report to classes that occur in the labels or predictions,
    # so target_names lines up with the rows actually reported.
    unique_labels = np.unique(np.concatenate((true_labels, predictions)))
    class_report = classification_report(
        true_labels,
        predictions,
        labels=unique_labels,
        target_names=label_encoder.classes_[unique_labels],
        zero_division=0,
    )
    print(f"Classification Report for {model_name}:\n", class_report)

In [8]:
# Fine-tune GPT-2 on the training split, validating on the validation split.
gpt2_model, gpt2_tokenizer = train_and_evaluate(
    model_name="gpt2",
    train_texts=train_texts,
    val_texts=val_texts,
    train_labels=train_labels,
    val_labels=val_labels,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mghostmaga[0m ([33mghostmaga-nazarbayev-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,7.34577
2,No log,6.982092
3,7.281100,6.583929
4,7.281100,6.646929
5,5.909600,7.01349


Evaluation Results for gpt2: {'eval_loss': 6.58392858505249, 'eval_runtime': 2.9188, 'eval_samples_per_second': 6.852, 'eval_steps_per_second': 0.343, 'epoch': 5.0}


In [None]:
# Fine-tune the describeai/gemini checkpoint on the same splits.
gemini_model, gemini_tokenizer = train_and_evaluate(
    model_name="describeai/gemini",
    train_texts=train_texts,
    val_texts=val_texts,
    train_labels=train_labels,
    val_labels=val_labels,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at describeai/gemini and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mghostmaga[0m ([3

In [None]:
# Report test-split metrics for both fine-tuned models.
evaluate_model(model=gpt2_model, tokenizer=gpt2_tokenizer, model_name="gpt2")
evaluate_model(model=gemini_model, tokenizer=gemini_tokenizer, model_name="describeai/gemini")