In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModelForSeq2SeqLM
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

In [3]:
data = pd.read_csv('dataset.csv')  # Make sure this CSV file has 'disease' and 'symptoms' columns
data = data[['disease', 'symptoms']]
data.head()

Unnamed: 0,disease,symptoms
0,flu,"fever,cough,sore throat,runny or stuffy nose,m..."
1,bronchitis,"cough,mucus production,shortness of breath,che..."
2,pneumonia,"fever,cough,shortness of breath,chest pain,fat..."
3,heart attack,"chest pain,shortness of breath,nausea,vomiting..."
4,stroke,"sudden weakness,numbness on one side of the bo..."


In [None]:
data.columns = ["disease", "symptoms"]  # Ensure correct column names if needed
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['disease'])  # Labels from 0 to 99

: 

In [None]:
tokenizer = AutoTokenizer.from_pretrained("describeai/gemini")
model = AutoModelForSeq2SeqLM.from_pretrained("describeai/gemini")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
print(tokenizer.tokenize(data['symptoms'][0]))
data['symptoms'][0]

['▁fever', '▁cough', '▁so', 're', '▁throat', '▁run', 'n', 'y', '▁or', '▁stuff', 'y', '▁nose', '▁muscle', '▁', 'aches', '▁headache', '▁fatigue']


'fever cough sore throat runny or stuffy nose muscle aches headache fatigue'

In [None]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
def tokenize_texts(texts):
    return tokenizer(
        texts.tolist(),
        padding='max_length',  # Pad to max length to ensure consistent input size
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Prepare encodings for both train and validation sets
train_encodings = tokenize_texts(train_data['symptoms'])
val_encodings = tokenize_texts(val_data['symptoms'])

# Convert labels to tensors
train_labels = torch.tensor(train_data['label'].values)
val_labels = torch.tensor(val_data['label'].values)

In [None]:
class DiseaseSymptomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
train_dataset = DiseaseSymptomDataset(train_encodings, train_labels)
val_dataset = DiseaseSymptomDataset(val_encodings, val_labels)

# Fine-tuning Setup
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define a compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

# Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
model.save_pretrained("fine_tuned_gemini_disease_model")
tokenizer.save_pretrained("fine_tuned_gemini_disease_model")

In [None]:
preds_output = trainer.predict(val_dataset)
accuracy = accuracy_score(val_labels, preds_output.predictions.argmax(-1))
f1 = f1_score(val_labels, preds_output.predictions.argmax(-1), average="weighted")

print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Save original model for comparison if needed
model.save_pretrained("original_gemini_model")