In [None]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('dataset.csv')
df = df[['disease', 'symptoms']]

# Prepare data as text sequences for GPT-2
df['input_text'] = "Symptoms: " + df['symptoms'] + " Predict disease: " + df['disease']

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to EOS token for GPT-2

# Tokenize data
encoded_data = tokenizer(df['input_text'].tolist(), padding=True, truncation=True, return_tensors='pt')
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']

# Convert to lists and create dictionary
dataset_dict = {
    'input_ids': input_ids.tolist(),
    'attention_mask': attention_mask.tolist(),
    'labels': input_ids.tolist()  # Set labels to be the same as input_ids
}

# Convert dictionary to DataFrame and split data
dataset_df = pd.DataFrame(dataset_dict)
train_df, val_df = train_test_split(dataset_df, test_size=0.2)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Define the model as a causal language model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to='none'
)

# Define Trainer with datasets containing labels
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=120, training_loss=2.9367324829101564, metrics={'train_runtime': 389.5211, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.308, 'total_flos': 5442733440000.0, 'train_loss': 2.9367324829101564, 'epoch': 3.0})

In [None]:
evaluation_results = trainer.evaluate()
print("Evaluation results:", evaluation_results)

Evaluation results: {'eval_loss': 1.5355908870697021, 'eval_runtime': 6.4381, 'eval_samples_per_second': 3.107, 'eval_steps_per_second': 1.553, 'epoch': 3.0}


In [None]:
def predict_disease(symptoms, model):
    model.eval()
    prompt = f"Symptoms: {symptoms} Predict disease:"
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction.split("Predict disease:")[-1].strip()

# Example usage to predict disease based on symptoms
sample_symptoms = "cough,mucus production,shortness of breath,chest pain"
prediction = predict_disease(sample_symptoms, model)
orig_prediction = predict_disease(sample_symptoms, GPT2LMHeadModel.from_pretrained('gpt2'))
print(f"Prediction for symptoms '{sample_symptoms}': {prediction}")
print(f"Original Prediction for symptoms '{sample_symptoms}': {orig_prediction}")

Prediction for symptoms 'cough,mucus production,shortness of breath,chest pain': pneumonia
Original Prediction for symptoms 'cough,mucus production,shortness of breath,chest pain': acute respiratory distress,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest pain,chest


In [None]:
def calculate_accuracy(model, val_dataset):
    model.eval()
    correct = 0
    total = 0

    for example in val_dataset:
        # Get input and label
        input_ids = torch.tensor(example['input_ids']).unsqueeze(0)  # Add batch dimension
        attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0)
        label = example['labels']

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Extract the logits of the last token
            last_token_logits = logits[:, -1, :]  # Select the logits for the last token in the sequence
            predicted_id = torch.argmax(last_token_logits, dim=-1).item()

        # Compare the prediction to the true label
        if predicted_id == label:
            correct += 1
        total += 1

    # Calculate accuracy
    accuracy = correct / total if total > 0 else 0
    return accuracy

# After training, calculate accuracy on the validation dataset
accuracy = calculate_accuracy(model, val_dataset)
print(f"Validation Accuracy: {accuracy:.4f}")


Validation Accuracy: 0.0000


In [None]:
val_dataset.to_pandas()

Unnamed: 0,input_ids,attention_mask,labels,__index_level_0__
0,"[43094, 35533, 25, 2910, 287, 262, 18922, 11, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 2910, 287, 262, 18922, 11, ...",22
1,"[43094, 35533, 25, 8722, 2956, 6010, 11, 18041...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 8722, 2956, 6010, 11, 18041...",83
2,"[43094, 35533, 25, 13181, 5490, 11, 69, 451, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 13181, 5490, 11, 69, 451, 1...",15
3,"[43094, 35533, 25, 4802, 10453, 11, 77, 2178, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 4802, 10453, 11, 77, 2178, ...",85
4,"[43094, 35533, 25, 3220, 24613, 11, 69, 46018,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 3220, 24613, 11, 69, 46018,...",6
5,"[43094, 35533, 25, 2356, 11, 2032, 9417, 11, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 2356, 11, 2032, 9417, 11, 2...",25
6,"[43094, 35533, 25, 2356, 287, 262, 2793, 826, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 2356, 287, 262, 2793, 826, ...",63
7,"[43094, 35533, 25, 10023, 669, 11, 38246, 3356...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 10023, 669, 11, 38246, 3356...",79
8,"[43094, 35533, 25, 36004, 11, 18041, 88, 38753...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 36004, 11, 18041, 88, 38753...",93
9,"[43094, 35533, 25, 11363, 18307, 10280, 326, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[43094, 35533, 25, 11363, 18307, 10280, 326, 4...",41
