In [None]:

import pandas as pd # data processing, CSV file

import os
def find_data_source(root='./database'):
    """Locate the dataset file below ``root`` and return its path.

    Every file found is printed. Directories and files are visited in sorted
    order so the result does not depend on the order in which the operating
    system happens to list them. CSV files are preferred; when none exist the
    search falls back to any file that was found. If several candidates
    remain, the last one in traversal order is chosen.

    Args:
        root: Directory to search recursively.

    Returns:
        The chosen file path, or "" when ``root`` contains no files (or does
        not exist).
    """
    found = []
    for dirname, dirnames, filenames in os.walk(root):
        # os.walk honours in-place edits of dirnames, so sorting here fixes
        # the order in which sub-directories are visited.
        dirnames.sort()
        for filename in sorted(filenames):
            path = os.path.join(dirname, filename)
            print(path)
            found.append(path)

    csv_files = [path for path in found if path.lower().endswith('.csv')]
    candidates = csv_files or found
    return candidates[-1] if candidates else ""


pName = find_data_source()

if pName != "":
    print('Data source import complete.')
else:
    print('Data source import failed.')


In [None]:
# Load the dataset from the path found by the discovery cell above
# (pName is "" when no file was found, in which case this read fails).
df = pd.read_csv(pName)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Number of distinct values in the 'diseases' column (the labels predicted later).
df['diseases'].nunique()

In [None]:
# Show up to the first 200 distinct disease labels, in order of first appearance.
df['diseases'].unique()[:200]

## Imbalanced value counts

_Don't worry - having less training data for some diseases is fine. Their probability of occurrence in the real world is negligible, and the occurrence probability in this dataset is roughly proportional to the value counts._


In [None]:
# Diseases represented by at most 10 rows, together with their row counts.
disease_counts = df['diseases'].value_counts()
disease_counts[disease_counts <= 10].reset_index()

## Split the dataset into X and y, encode the disease labels with sklearn's LabelEncoder, and train a model


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder

# Features: every column except the 'diseases' target, as a NumPy matrix.
X = df.drop('diseases', axis=1).to_numpy()

# Targets: fit the encoder on the disease names, then map each name to its
# integer class id (le.classes_ maps the ids back to names).
le = LabelEncoder().fit(df['diseases'])
y = le.transform(df['diseases'])

# Tensors: float32 inputs and int64 class ids.
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Shuffled mini-batches of 32 samples for training.
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, shuffle=True, batch_size=32)

In [None]:
# Define model
import torch.nn as nn
import torch.nn.functional as F

class SymptomClassifier(nn.Module):
    """Feed-forward classifier with one hidden ReLU layer.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of classes (length of the returned logit vector).
        hidden_dim: Width of the hidden layer. The default of 64 keeps the
            layer shapes unchanged for callers that pass only two arguments;
            a state dict can only be loaded into a model built with the same
            ``hidden_dim`` it was saved from.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=64):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw class logits for ``x`` (no softmax applied)."""
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # logits

In [None]:
# Training loop

SEED = 42         # fixed seed so repeated runs start from the same state
NUM_EPOCHS = 20   # adjust epochs

# Seed PyTorch's global RNG before the model is built: weight initialisation
# and the DataLoader's default shuffling both draw from it. (GPU kernels may
# still introduce small run-to-run differences.)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SymptomClassifier(X.shape[1], len(le.classes_)).to(device)
print(f"Using device: {device}")

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(NUM_EPOCHS):
    # Accumulate on the training device so the loss is read back once per
    # epoch instead of once per batch.
    loss_sum = torch.zeros((), device=device)
    samples_seen = 0

    for batch_X, batch_y in loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # criterion returns the mean over the batch; weight it by the batch
        # size so a shorter final batch does not skew the epoch average.
        loss_sum += loss.detach() * batch_y.size(0)
        samples_seen += batch_y.size(0)

    # Report the mean loss over the whole epoch rather than the loss of
    # whichever mini-batch happened to come last.
    epoch_loss = loss_sum.item() / max(samples_seen, 1)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

In [None]:
# Prediction function
def predict_symptoms(symptoms_vector):
    """Return the single most likely disease label for one symptom vector.

    The vector is converted to a float32 tensor, passed through the global
    ``model`` in eval mode without gradient tracking, and the index of the
    largest logit is mapped back to a label through ``le.classes_``.
    """
    model.eval()
    features = torch.tensor(symptoms_vector, dtype=torch.float32).to(device)
    with torch.no_grad():
        logits = model(features)
    best_class = logits.argmax().item()
    return le.classes_[best_class]

# Example: predict the disease for the first row of the feature matrix X
# (a row the model was trained on, so this is a smoke test, not an evaluation).
print(predict_symptoms(X[0]))

In [None]:
# SAVE AND LOAD MODEL
# Only the learned weights (state_dict) are written to disk; the class
# definition and the fitted LabelEncoder `le` are not part of this file.

torch.save(model.state_dict(), "torch_symptom_model.pth")


In [None]:
import torch

# Define the same architecture you used before
class SymptomClassifier(torch.nn.Module):
    """Feed-forward classifier with one hidden ReLU layer.

    This repeats the architecture used for training so that the saved weights
    can be loaded back into a matching model.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of classes (length of the returned logit vector).
        hidden_dim: Width of the hidden layer. The default of 64 keeps the
            layer shapes unchanged for callers that pass only two arguments;
            a state dict can only be loaded into a model built with the same
            ``hidden_dim`` it was saved from.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=64):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw class logits for ``x`` (no softmax applied)."""
        x = torch.relu(self.fc1(x))
        return self.fc2(x)  # logits

In [None]:
# Rebuild the network, restore the saved weights, and switch to inference mode.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = SymptomClassifier(X.shape[1], len(le.classes_))
# map_location places the stored tensors on the selected device while loading.
saved_weights = torch.load("torch_symptom_model.pth", map_location=device)
model.load_state_dict(saved_weights)
# .to() and .eval() both return the module, so the cell still displays the model.
model.to(device).eval()

In [None]:
import numpy as np

def predict_top_diseases(symptoms_vector, top_k=3):
    """Print the symptoms flagged in a sample and return its most probable diseases.

    Args:
        symptoms_vector: 1-D sequence of symptom indicators, ordered like the
            feature columns of ``df`` (every column except 'diseases').
            Entries equal to 1 are printed as present symptoms.
        top_k: Maximum number of diseases to return.

    Returns:
        A list of ``(disease, probability)`` tuples sorted from most to least
        probable. Probabilities are the softmax of the model's logits, given
        as plain Python floats.
    """
    # Take the symptom names in the same column order that was used to build
    # X (all columns except the 'diseases' target), wherever that column
    # sits in df.
    listOfSymptomsHeader = [col for col in df.columns if col != 'diseases']
    for symptom_name, value in zip(listOfSymptomsHeader, symptoms_vector):
        if value == 1:
            print(f"Symptom: {symptom_name} is present.")
    print("\nPredicting top diseases based on symptoms...")
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor(symptoms_vector, dtype=torch.float32).to(device)
        outputs = model(inputs)
        # .cpu() is a no-op on CPU and copies from the GPU otherwise, so this
        # works whichever device was selected.
        probs = torch.softmax(outputs, dim=0).cpu().numpy()

    # Sort probabilities in descending order
    sorted_indices = np.argsort(probs)[::-1]
    top_indices = sorted_indices[:top_k]

    # Build list of (disease, probability); float() converts the NumPy scalar
    # to a built-in float so the results print and serialise cleanly.
    top_diseases = [(le.classes_[i], float(probs[i])) for i in top_indices]
    return top_diseases

In [None]:
# Run the top-4 prediction for dataset rows 100..199, echo each result, and
# collect (row index, predictions) pairs in allResults.
allResults = []
banner = "=" * 46
footer = "------------------------x-------------------------\n"
for i in range(100, 200):
    print(i, banner, i)
    top_results = predict_top_diseases(X[i], top_k=4)

    print(f"\nTop probable diseases for sample {i}:")
    for disease, prob in top_results:
        print(f"{disease}: {prob:.2f}")
    print(footer)

    allResults += [(i, top_results)]
        

In [None]:
import pandas as pd

# One row per evaluated sample: its dataset row index and its prediction list.
# Each "Top Predictions" cell holds a Python list of (disease, probability)
# tuples, which to_csv writes out as that list's string representation.
df_results = pd.DataFrame(allResults, columns=["Sample Index", "Top Predictions"])

# index=False keeps the DataFrame's own row index out of the file.
df_results.to_csv("top_disease_predictions.csv", index=False)