In [4]:
# T5Tokenizer needs the SentencePiece library.
# Use the %pip magic rather than !pip: %pip installs into the environment of the
# running kernel, whereas !pip runs whichever `pip` is first on the shell PATH.
# Restart the kernel after installing so the new package is picked up.
%pip install sentencepiece




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Step 1: Load Data
# 'dataset.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('dataset.csv')
print(df.head(5))  # preview the first five rows

# Step 2: Preprocess Data
# Wrap the DataFrame in a Hugging Face Dataset so it can be tokenized with .map().
dataset = Dataset.from_pandas(df)

# GPT-2 tokenizer setup: reuse the end-of-sequence token as the padding token,
# because padded batches need one.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

def gpt2_tokenize_function(examples):
    """Tokenize a batch of rows into causal-LM training examples for GPT-2.

    Each row becomes the text ``"Symptoms: <symptoms> Disease: <disease><eos>"``
    so the model learns to continue a symptom prompt with the disease name and
    then stop. Training on the symptoms alone would never show the model the
    answer it is later asked to produce.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``: a mapping of
            column name to list of values. Must contain the ``'symptoms'`` and
            ``'disease'`` columns.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) padded or
        truncated to 128 tokens, plus a ``labels`` entry. ``labels`` is a copy
        of ``input_ids`` with padding positions set to -100.
    """
    eos = gpt2_tokenizer.eos_token
    texts = [
        f"Symptoms: {symptoms} Disease: {disease}{eos}"
        for symptoms, disease in zip(examples['symptoms'], examples['disease'])
    ]
    # NOTE: truncation cuts from the right, so a symptom list long enough to
    # exceed 128 tokens would lose its disease suffix.
    model_inputs = gpt2_tokenizer(texts, truncation=True, padding="max_length", max_length=128)

    # A causal LM is trained to predict its own input, so labels mirror input_ids.
    # The pad token is the EOS token here, so padding is identified through the
    # attention mask (not the token id) and set to -100, which the loss ignores.
    model_inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask'])
    ]
    return model_inputs

# Run the GPT-2 tokenize function over the dataset in batches, then discard the
# raw 'symptoms' text column from the result.
tokenized_gpt2_datasets = dataset.map(
    gpt2_tokenize_function,
    batched=True,
).remove_columns(["symptoms"])

# T5 tokenizer (depends on the SentencePiece library being installed)
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")

def t5_tokenize_function(examples):
    """Tokenize a batch of rows into seq2seq training examples for T5.

    The encoder input is the prompt ``"Symptoms: <symptoms> Disease:"`` and the
    decoder target is the disease name.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``: a mapping of
            column name to list of values. Must contain the ``'symptoms'`` and
            ``'disease'`` columns.

    Returns:
        The tokenizer output for the prompts (padded or truncated to 128
        tokens) with an added ``labels`` entry: the target token ids (padded or
        truncated to 64 tokens) with padding positions set to -100.
    """
    # A batched `examples` is column-oriented (name -> list of values), so
    # iterate over the 'symptoms' column. Iterating `examples` itself yields
    # the column names, not the rows.
    inputs = [f"Symptoms: {symptoms} Disease:" for symptoms in examples['symptoms']]
    targets = examples['disease']

    model_inputs = t5_tokenizer(inputs, padding="max_length", truncation=True, max_length=128)
    labels = t5_tokenizer(targets, padding="max_length", truncation=True, max_length=64)

    # Set padding positions to -100 so the loss ignores them; otherwise most of
    # the training signal would come from predicting pad tokens.
    pad_id = t5_tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs

# Run the T5 tokenize function over the dataset in batches, then discard the
# raw 'symptoms' and 'disease' text columns from the result.
tokenized_t5_datasets = dataset.map(
    t5_tokenize_function,
    batched=True,
).remove_columns(["symptoms", "disease"])

# Step 3: Define Training Arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./model",
    logging_dir="./logs",
    # Optimisation settings
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    # How often to log, checkpoint and evaluate
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="epoch",
)

# Step 4: Initialize Models and Trainers
# Both trainers use the same `training_args`, and each one evaluates on the
# very dataset it trains on (no held-out split is configured here).

# GPT-2: pretrained causal language model
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_trainer = Trainer(
    args=training_args,
    model=gpt2_model,
    eval_dataset=tokenized_gpt2_datasets,
    train_dataset=tokenized_gpt2_datasets,
)

# T5: pretrained sequence-to-sequence model
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_trainer = Trainer(
    args=training_args,
    model=t5_model,
    eval_dataset=tokenized_t5_datasets,
    train_dataset=tokenized_t5_datasets,
)

# Step 5: Train Both Models

# Fine-tune GPT-2 first, then T5.
print("Training GPT-2 model...")
gpt2_trainer.train()

print("Training T5 model...")
t5_trainer.train()

# Write each fine-tuned model and its tokenizer into the same directory.
for artifact in (gpt2_model, gpt2_tokenizer):
    artifact.save_pretrained("./gpt2_disease_model")

for artifact in (t5_model, t5_tokenizer):
    artifact.save_pretrained("./t5_disease_model")

# Step 6: Evaluate the Models (NOTE: `dataset` below is the full data set the models were trained on, not a held-out validation split)

# Example of how to evaluate the model after training
def evaluate_model(model, tokenizer, dataset, prompt_template="Symptoms: {symptoms} Disease:", max_new_tokens=32):
    """Generate a disease prediction for every example and score it against the label.

    Args:
        model: A model exposing ``generate``, ``eval``, ``device`` and ``config``
            (here GPT-2 or T5).
        tokenizer: The tokenizer that belongs to ``model``.
        dataset: Iterable of rows with ``'symptoms'`` and ``'disease'`` fields.
        prompt_template: Format string used to build the prompt from a row's
            symptoms. The default matches the prompt built in
            ``t5_tokenize_function``. Pass ``"{symptoms}"`` to feed the raw
            symptom text instead.
        max_new_tokens: Maximum number of tokens generated per example.

    Returns:
        A tuple ``(accuracy, f1, conf_matrix)`` computed by exact string match
        between the stripped generated text and the ``'disease'`` label.
    """
    predictions = []
    labels = []

    # Encoder-decoder models (T5) return only the generated target tokens.
    # Decoder-only models (GPT-2) return the prompt followed by the continuation.
    is_encoder_decoder = getattr(model.config, "is_encoder_decoder", False)

    # Switch off dropout so generation is deterministic.
    model.eval()

    for example in dataset:
        prompt = prompt_template.format(symptoms=example['symptoms'])
        inputs_tokenized = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=128)
        # After training, the model may sit on a GPU while fresh inputs are on the CPU.
        inputs_tokenized = inputs_tokenized.to(model.device)

        output = model.generate(
            input_ids=inputs_tokenized['input_ids'],
            attention_mask=inputs_tokenized['attention_mask'],
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
        )

        generated = output[0]
        if not is_encoder_decoder:
            # Drop the echoed prompt so only the newly generated tokens are decoded.
            prompt_length = inputs_tokenized['input_ids'].shape[1]
            generated = generated[prompt_length:]
        predicted_disease = tokenizer.decode(generated, skip_special_tokens=True).strip()

        # Append predicted and actual values
        predictions.append(predicted_disease)
        labels.append(example['disease'])

    # Calculate accuracy, F1 score and confusion matrix.
    # NOTE: for single-label predictions, micro-averaged F1 equals accuracy.
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='micro')
    conf_matrix = confusion_matrix(labels, predictions)

    return accuracy, f1, conf_matrix

# Evaluate GPT-2
print("Evaluating GPT-2...")
gpt2_accuracy, gpt2_f1, gpt2_conf_matrix = evaluate_model(gpt2_model, gpt2_tokenizer, dataset)

print("GPT-2 Accuracy:", gpt2_accuracy)
print("GPT-2 F1 Score:", gpt2_f1)
print("GPT-2 Confusion Matrix:\n", gpt2_conf_matrix)

# Evaluate T5
print("Evaluating T5...")
t5_accuracy, t5_f1, t5_conf_matrix = evaluate_model(t5_model, t5_tokenizer, dataset)

print("T5 Accuracy:", t5_accuracy)
print("T5 F1 Score:", t5_f1)
print("T5 Confusion Matrix:\n", t5_conf_matrix)

# Step 7: Compare the Models
# Compare in both directions so that a tie is reported as a tie instead of
# being attributed to T5.
if gpt2_f1 > t5_f1:
    print("GPT-2 performs better.")
elif t5_f1 > gpt2_f1:
    print("T5 performs better.")
else:
    print("Both models perform equally (same F1 score).")


        disease                                           symptoms  \
0           flu  fever,cough,sore throat,runny or stuffy nose,m...   
1    bronchitis  cough,mucus production,shortness of breath,che...   
2     pneumonia  fever,cough,shortness of breath,chest pain,fat...   
3  heart attack  chest pain,shortness of breath,nausea,vomiting...   
4        stroke  sudden weakness,numbness on one side of the bo...   

                                               cures  \
0           over-the-counter medications,rest,fluids   
1  antibiotics,over-the-counter medications,rest,...   
2  antibiotics,over-the-counter medications,rest,...   
3                         emergency medical services   
4                         emergency medical services   

                        doctor     risk level  
0    family doctor,urgent care      low (0.1%  
1  family doctor,pulmonologist      low (0.5%  
2  family doctor,pulmonologist  moderate (1%)  
3                 cardiologist     high (20%)  
4 

Map: 100%|██████████| 99/99 [00:00<00:00, 3571.98 examples/s]


ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
