In [25]:
# pip install transformers datasets

In [26]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer

# # Specify the model name
# model_name = "gpt2"  # You can replace this with other models like "gpt2-medium" or "gpt-neo-125M"

# # Load the tokenizer and model
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)


In [27]:
# tokenizer.pad_token = tokenizer.eos_token

In [28]:
import warnings
warnings.filterwarnings("ignore")

In [29]:
# prompt = "Hello"
# input_ids = tokenizer.encode(prompt, return_tensors="pt")

# # Generate text with adjusted parameters
# outputs = model.generate(
#     input_ids,
#     max_length=300,
#     temperature=0.9,  # Adds randomness
#     top_k=50,         # Considers top 50 tokens
#     top_p=0.9,        # Nucleus sampling
#     repetition_penalty=1.2,  # Penalty for repetition
#     pad_token_id=tokenizer.eos_token_id
# )

# # Decode and print the generated text
# generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# print(generated_text)


In [30]:
# !pip install datasets
# !pip install evaluate

In [31]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate

In [32]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(logits, dim=-1)
    accuracy_result = accuracy.compute(predictions=predictions, references=labels)
    precision_result = precision.compute(predictions=predictions, references=labels)
    recall_result = recall.compute(predictions=predictions, references=labels)
    f1_result = f1.compute(predictions=predictions, references=labels)
    return {
        "accuracy": accuracy_result["accuracy"],
        "precision": precision_result["precision"],
        "recall": recall_result["recall"],
        "f1": f1_result["f1"]
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [33]:
df = pd.read_csv('dataset.csv')
df = df[['disease', 'symptoms']]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_data = tokenizer(df['symptoms'].tolist(), padding=True, truncation=True, return_tensors='pt')

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['disease'])

dataset_dict = {
    'input_ids': encoded_data['input_ids'].tolist(),
    'attention_mask': encoded_data['attention_mask'].tolist(),
    'labels': labels.tolist()
}

dataset_df = pd.DataFrame(dataset_dict)

train_df, val_df = train_test_split(dataset_df, test_size=0.2)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

orig_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels)))
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels)))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    report_to='none',
)

# Train model
trainer = Trainer(
    model=model,                        # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                 # training arguments, defined above
    train_dataset=train_dataset,        # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)

trainer.train()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=15, training_loss=4.567629496256511, metrics={'train_runtime': 125.097, 'train_samples_per_second': 1.895, 'train_steps_per_second': 0.12, 'total_flos': 4753541652912.0, 'train_loss': 4.567629496256511, 'epoch': 3.0})

In [34]:
evaluation_results = trainer.evaluate()

TypeError: argmax(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [None]:
# print("Evaluation results:")
# for key, value in evaluation_results.items():
#     print(f"{key}: {value}")

# Function to predict disease with a given model
def predict_disease(symptoms, model):
    model.eval()
    inputs = tokenizer(symptoms, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    predicted_disease = label_encoder.inverse_transform([predicted_class_id])[0]
    return predicted_disease

# Example usage to compare predictions
sample_symptoms = "fever cough fatigue"
original_prediction = predict_disease(sample_symptoms, orig_model)
fine_tuned_prediction = predict_disease(sample_symptoms, model)

print(f"Original model prediction for symptoms '{sample_symptoms}': {original_prediction}")
print(f"Fine-tuned model prediction for symptoms '{sample_symptoms}': {fine_tuned_prediction}")

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate

# Load your CSV dataset
df = pd.read_csv('dataset.csv')

# Select only the first two columns: 'disease' and 'symptoms'
df = df[['disease', 'symptoms']]

# Preprocess data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_data = tokenizer(df['symptoms'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Convert labels to numbers
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['disease'])

# Create a dictionary for the dataset
dataset_dict = {
    'input_ids': encoded_data['input_ids'].tolist(),
    'attention_mask': encoded_data['attention_mask'].tolist(),
    'labels': labels.tolist()
}

# Convert the dictionary to a pandas DataFrame
dataset_df = pd.DataFrame(dataset_dict)

# Split dataset into training and validation sets
train_df, val_df = train_test_split(dataset_df, test_size=0.2)

# Convert DataFrames back to Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load original pretrained model
original_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels)))

# Save original model (optional)
original_model.save_pretrained('./original_model')

# Define a function to compute metrics using `evaluate`
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    logits = torch.tensor(logits)  # Convert logits to tensor
    labels = torch.tensor(labels)  # Convert labels to tensor
    predictions = torch.argmax(logits, dim=-1)
    accuracy_result = accuracy.compute(predictions=predictions, references=labels)
    precision_result = precision.compute(predictions=predictions, references=labels, average='weighted')
    recall_result = recall.compute(predictions=predictions, references=labels)
    f1_result = f1.compute(predictions=predictions, references=labels)
    return {
        "accuracy": accuracy_result["accuracy"],
        "precision": precision_result["precision"],
        "recall": recall_result["recall"],
        "f1": f1_result["f1"]
    }

# Fine-tune model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(labels)))

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    report_to="none"                 # disable reporting to wandb.ai
)

trainer = Trainer(
    model=model,                        # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                 # training arguments, defined above
    train_dataset=train_dataset,        # training dataset
    eval_dataset=val_dataset,           # evaluation dataset
    compute_metrics=compute_metrics     # function to compute metrics
)

trainer.train()

In [None]:
# evaluation_results = trainer.evaluate()

# Display evaluation results
# print("Evaluation results:")
# for key, value in evaluation_results.items():
#     print(f"{key}: {value}")

# Function to predict disease with a given model
def predict_disease(symptoms, model):
    model.eval()
    inputs = tokenizer(symptoms, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    predicted_disease = label_encoder.inverse_transform([predicted_class_id])[0]
    return predicted_disease

# Example usage to compare predictions
sample_symptoms = "runny nose,sneezing,sore throat,cough,congestion fewer"
original_prediction = predict_disease(sample_symptoms, original_model)
fine_tuned_prediction = predict_disease(sample_symptoms, model)

print(f"Original model prediction for symptoms '{sample_symptoms}': {original_prediction}")
print(f"Fine-tuned model prediction for symptoms '{sample_symptoms}': {fine_tuned_prediction}")