# Disease prediction using logistic regression

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Colab-only: mount Google Drive at /content/drive. The CSV paths read later in
# this notebook are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# Load the training set from the mounted Drive folder.
train_path = '/content/drive/MyDrive/DS5500 PROJECT/Training.csv'

# Drop the 'Unnamed: 133' column as part of the load expression instead of
# mutating the frame with inplace=True, so the cell always yields the same
# frame from a single expression.
# NOTE(review): 'Unnamed: 133' is presumably an empty artifact column produced
# by a trailing delimiter in the CSV -- confirm against the raw file.
training_data = pd.read_csv(train_path).drop(columns=['Unnamed: 133'])

# read_csv already returns a DataFrame, so no re-wrapping is needed; the second
# name is kept because later cells reference training_df.
training_df = training_data
training_df

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
4916,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,Acne
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Urinary tract infection
4918,0,1,0,0,0,0,1,0,0,0,...,0,0,1,1,1,1,0,0,0,Psoriasis


In [26]:
# Read the test CSV from the mounted Drive folder and preview it.
test_path = '/content/drive/MyDrive/DS5500 PROJECT/Testing.csv'
test_data = pd.read_csv(filepath_or_buffer=test_path)
# Same wrapping step as before, spelled with the explicit keyword.
test_df = pd.DataFrame(data=test_data)
test_df

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Peptic ulcer diseae
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AIDS
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Diabetes
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Gastroenteritis
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bronchial Asthma


##### Separate features and target for the train and test sets

In [28]:
# The label column is 'prognosis'; every other column is used as a feature.
target_col = 'prognosis'

# Targets first, then the feature matrices with the target column removed.
y_train, y_test = training_df[target_col], test_df[target_col]
x_train = training_df.drop(target_col, axis=1)
x_test = test_df.drop(target_col, axis=1)

##### Encode prognosis

In [29]:
label_encoder = LabelEncoder()

# Learn the disease-name -> integer mapping from the training labels only.
y_train = label_encoder.fit_transform(y_train)

# Apply the mapping learned above with transform(). Calling fit_transform()
# here would re-fit the encoder on the test labels: the integer codes would
# then only match the training codes if the test file happens to contain
# exactly the same set of classes, and later inverse_transform() calls would
# decode with the test-fitted mapping instead of the one the model was trained
# against. transform() raises ValueError on a label not seen during fit, which
# surfaces such a mismatch instead of hiding it.
y_test = label_encoder.transform(y_test)

### Logistic regression model

In [30]:
# Build the classifier with default settings and train it on the training set
# (fit() returns the estimator itself, so construction and fitting can chain).
logistic_regression = LogisticRegression().fit(x_train, y_train)

# Predict encoded class labels for the test set.
y_pred = logistic_regression.predict(x_test)

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'The accuracy score is {accuracy * 100}%')

The accuracy score is 100.0%


### Classification report

In [31]:
# Decode the integer class codes back to their original label strings so the
# report below is readable; predictions first, then ground truth.
y_pred_labels, y_test_labels = (
    label_encoder.inverse_transform(encoded) for encoded in (y_pred, y_test)
)

In [32]:
# Per-class precision, recall, F1 and support, computed on the decoded labels.
report_text = classification_report(y_true=y_test_labels, y_pred=y_pred_labels)
print(report_text)

                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      1.00      1.00         1
                           

### Prediction

In [39]:
# Class-membership probabilities: one row per test instance, one column per
# class known to the fitted model.
probs = logistic_regression.predict_proba(x_test)

# Column positions of the three highest probabilities in each row, most likely
# first (negating the array makes argsort order descending).
top_3 = np.argsort(-probs, axis=1)[:, :3]

# predict_proba columns are ordered like logistic_regression.classes_, so a
# column position is not itself an encoded label. Translate positions to the
# encoded class values first, then decode those to label strings. (The two only
# coincide when classes_ is exactly 0..K-1.)
top3_diseases = [
    label_encoder.inverse_transform(logistic_regression.classes_[element])
    for element in top_3
]
print(top3_diseases)

[array(['Fungal infection', 'Drug Reaction', 'Acne'], dtype=object), array(['Allergy', 'Urinary tract infection', 'Acne'], dtype=object), array(['GERD', 'Heart attack', 'Drug Reaction'], dtype=object), array(['Chronic cholestasis', 'Hepatitis D', 'Hepatitis C'], dtype=object), array(['Drug Reaction', 'Fungal infection', 'Urinary tract infection'],
      dtype=object), array(['Peptic ulcer diseae', 'Chronic cholestasis', 'Heart attack'],
      dtype=object), array(['AIDS', 'Impetigo', 'Bronchial Asthma'], dtype=object), array(['Diabetes ', 'Hyperthyroidism', 'Varicose veins'], dtype=object), array(['Gastroenteritis', 'Heart attack', 'Paralysis (brain hemorrhage)'],
      dtype=object), array(['Bronchial Asthma', 'Hepatitis C', 'Pneumonia'], dtype=object), array(['Hypertension ', 'Cervical spondylosis',
       '(vertigo) Paroymsal  Positional Vertigo'], dtype=object), array(['Migraine', 'Hypoglycemia', 'Paralysis (brain hemorrhage)'],
      dtype=object), array(['Cervical spondylosis', '

In [40]:
top_3_probs = []
# Walk each instance's probability row together with its top-3 column positions
# instead of indexing both arrays by a manual counter.
for row_probs, indices in zip(probs, top_3):
    # predict_proba columns follow logistic_regression.classes_; map the column
    # positions to encoded class values before decoding them to label strings,
    # rather than assuming position == encoded value.
    labels = label_encoder.inverse_transform(logistic_regression.classes_[indices])

    # Probabilities of those same three columns, in the same (descending) order.
    probabilities = row_probs[indices]

    # Pair each label with its probability and keep the per-instance list.
    top_3_probs.append(list(zip(labels, probabilities)))

# Print the true label of each instance followed by its three most likely
# predicted labels and their probabilities as percentages.
for i, labels_with_probs in enumerate(top_3_probs):
    print(f"Instance {i+1}: {y_test_labels[i]}")
    for label, prob in labels_with_probs:
        print(f"   {label}: {prob*100:.4f}%")
    print("\n")

Instance 1: Fungal infection
   Fungal infection: 99.2518%
   Drug Reaction: 0.1595%
   Acne: 0.0730%


Instance 2: Allergy
   Allergy: 99.3760%
   Urinary tract infection: 0.0366%
   Acne: 0.0325%


Instance 3: GERD
   GERD: 99.3618%
   Heart attack: 0.1290%
   Drug Reaction: 0.0474%


Instance 4: Chronic cholestasis
   Chronic cholestasis: 98.4954%
   Hepatitis D: 0.3398%
   Hepatitis C: 0.1818%


Instance 5: Drug Reaction
   Drug Reaction: 99.3569%
   Fungal infection: 0.1473%
   Urinary tract infection: 0.0634%


Instance 6: Peptic ulcer diseae
   Peptic ulcer diseae: 99.2670%
   Chronic cholestasis: 0.1129%
   Heart attack: 0.0482%


Instance 7: AIDS
   AIDS: 99.3086%
   Impetigo: 0.0608%
   Bronchial Asthma: 0.0497%


Instance 8: Diabetes 
   Diabetes : 99.4098%
   Hyperthyroidism: 0.0686%
   Varicose veins: 0.0549%


Instance 9: Gastroenteritis
   Gastroenteritis: 99.2118%
   Heart attack: 0.0782%
   Paralysis (brain hemorrhage): 0.0716%


Instance 10: Bronchial Asthma
   Bronch