## Import Required Libraries

In [53]:
import numpy as np
import pandas as pd

## Load the Dataset

In [54]:
data=pd.read_csv("D:\Datasets\dataset.csv")
symptom=pd.read_csv("D:\Datasets\Symptom-severity.csv")

In [55]:
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [56]:
symptom.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


## Data Preprocessing and Feature Engineering

Since the main dataset has a lot of spaces, we'll remove the spaces and let's also remove the underscores in both the datasets so both of them are consistent.

In [57]:
for i in data.columns:
    data[i] = data[i].str.replace(" ", "")
    data[i] = data[i].str.replace('_', '')
symptom["Symptom"] = symptom["Symptom"].str.replace('_', "")

In [58]:
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungalinfection,itching,skinrash,nodalskineruptions,dischromicpatches,,,,,,,,,,,,,
1,Fungalinfection,skinrash,nodalskineruptions,dischromicpatches,,,,,,,,,,,,,,
2,Fungalinfection,itching,nodalskineruptions,dischromicpatches,,,,,,,,,,,,,,
3,Fungalinfection,itching,skinrash,dischromicpatches,,,,,,,,,,,,,,
4,Fungalinfection,itching,skinrash,nodalskineruptions,,,,,,,,,,,,,,


In [59]:
symptom.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skinrash,3
2,nodalskineruptions,4
3,continuoussneezing,4
4,shivering,5


Fill NA values with the value 'Not available' for now since the data is categorical. We'll label encode the dataset later.

In [60]:
data = data.fillna("Not available")

Let's use the symptom dataset to create a new feature called Total_weight. This column will contain the total weight of all the symptoms present in the column. Weight for each symptom is given in the symptom dataset.

In [61]:
weight_dict = {i:x for i,x in zip(symptom["Symptom"], symptom["weight"])}
data["Total_weight"] = data.replace(weight_dict).sum(axis=1)

  data["Total_weight"] = data.replace(weight_dict).sum(axis=1)


In [62]:
weight_dict

{'itching': 1,
 'skinrash': 3,
 'nodalskineruptions': 4,
 'continuoussneezing': 4,
 'shivering': 5,
 'chills': 3,
 'jointpain': 3,
 'stomachpain': 5,
 'acidity': 3,
 'ulcersontongue': 4,
 'musclewasting': 3,
 'vomiting': 5,
 'burningmicturition': 6,
 'spottingurination': 6,
 'fatigue': 4,
 'weightgain': 3,
 'anxiety': 4,
 'coldhandsandfeets': 5,
 'moodswings': 3,
 'weightloss': 3,
 'restlessness': 5,
 'lethargy': 2,
 'patchesinthroat': 6,
 'irregularsugarlevel': 5,
 'cough': 4,
 'highfever': 7,
 'sunkeneyes': 3,
 'breathlessness': 4,
 'sweating': 3,
 'dehydration': 4,
 'indigestion': 5,
 'headache': 3,
 'yellowishskin': 3,
 'darkurine': 4,
 'nausea': 5,
 'lossofappetite': 4,
 'painbehindtheeyes': 4,
 'backpain': 3,
 'constipation': 4,
 'abdominalpain': 4,
 'diarrhoea': 6,
 'mildfever': 5,
 'yellowurine': 4,
 'yellowingofeyes': 4,
 'acuteliverfailure': 6,
 'fluidoverload': 4,
 'swellingofstomach': 7,
 'swelledlymphnodes': 6,
 'malaise': 6,
 'blurredanddistortedvision': 5,
 'phlegm': 5,


Now let's label encode the categorical columns using scikit-learn's LabelEncoder library. Keep in mind that Disease column has to be encoded seperately because it contains different categorical values. We create 2 objects of the LabelEncoder class for Diseases and symptoms respectively. We immediately transform the Disease column and for the symptoms, we take the list of symptoms from the symptom dataset and fit our encoder on it and then we apply the transform on the symptom columns.

In [63]:
from sklearn.preprocessing import LabelEncoder
lb_y = LabelEncoder()#Label Encoder for Disease column
lb_x = LabelEncoder()#Label Encoder for Symptom column

data["Disease"] = lb_y.fit_transform(data["Disease"])
all_symptoms = symptom["Symptom"].values
all_symptoms = np.append(all_symptoms, "Not available") #List of all symptoms and the NA value('Not available')
lb_x.fit(all_symptoms)
cat_columns = [i for i in data.columns if i!= "Total_weight"] #Columns containing categorical variables
data[cat_columns[1:]] = data[cat_columns[1:]].apply(lb_x.transform) #We select from the 1st index the 0th index contains the Disease column and we have 

In [64]:
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,Total_weight
0,15,56,102,74,33,0,0,0,0,0,0,0,0,0,0,0,0,0,8
1,15,102,74,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13
2,15,56,74,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11
3,15,56,102,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10
4,15,56,102,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8


In [65]:
data[["Symptom_7","Symptom_8","Symptom_9","Symptom_10","Symptom_11","Symptom_12","Symptom_13","Symptom_14","Symptom_15", "Symptom_16"]]

Unnamed: 0,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0
4916,0,0,0,0,0,0,0,0,0,0
4917,0,0,0,0,0,0,0,0,0,0
4918,0,0,0,0,0,0,0,0,0,0


Now let's check which labels have been encoded as what

In [66]:
cols_to_drop = ["Symptom_7","Symptom_8","Symptom_9","Symptom_10","Symptom_11","Symptom_12","Symptom_13","Symptom_14","Symptom_15", "Symptom_16","Symptom_17"]
#x_train = x_train.drop(columns=cols_to_drop)
#x_valid = x_valid.drop(columns=cols_to_drop)
#x = data.drop(columns=cols_to_drop)
#x = x.drop(columns="Disease")
#y = data["Disease"]
data=data.drop(columns=cols_to_drop)

In [67]:
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Total_weight
0,15,56,102,74,33,0,0,8
1,15,102,74,33,0,0,0,13
2,15,56,74,33,0,0,0,11
3,15,56,102,33,0,0,0,10
4,15,56,102,74,0,0,0,8


In [68]:
all_symptoms

array(['itching', 'skinrash', 'nodalskineruptions', 'continuoussneezing',
       'shivering', 'chills', 'jointpain', 'stomachpain', 'acidity',
       'ulcersontongue', 'musclewasting', 'vomiting',
       'burningmicturition', 'spottingurination', 'fatigue', 'weightgain',
       'anxiety', 'coldhandsandfeets', 'moodswings', 'weightloss',
       'restlessness', 'lethargy', 'patchesinthroat',
       'irregularsugarlevel', 'cough', 'highfever', 'sunkeneyes',
       'breathlessness', 'sweating', 'dehydration', 'indigestion',
       'headache', 'yellowishskin', 'darkurine', 'nausea',
       'lossofappetite', 'painbehindtheeyes', 'backpain', 'constipation',
       'abdominalpain', 'diarrhoea', 'mildfever', 'yellowurine',
       'yellowingofeyes', 'acuteliverfailure', 'fluidoverload',
       'swellingofstomach', 'swelledlymphnodes', 'malaise',
       'blurredanddistortedvision', 'phlegm', 'throatirritation',
       'rednessofeyes', 'sinuspressure', 'runnynose', 'congestion',
       'chestpain'

In [69]:
lb_x.transform(all_symptoms)

array([ 56, 102,  74,  26,  98,  20,  57, 109,   3, 120,  70, 123,  18,
       106,  42, 127,   6,  21,  66, 128,  94,  60,  82,  53,  27,  46,
       110,  15, 111,  30,  50,  45, 131,  29,  72,  61,  76,   7,  24,
         1,  32,  65, 132, 130,   4,  43, 114, 112,  64,  14,  83, 118,
        91, 100,  95,  23,  19, 125,  41,  77,  79,  13,  55,  73,  35,
        28,  17,  75, 117, 115,  87,  37,  16, 116,  38,  39,  36, 103,
        58,  47,  71, 107, 113,  67, 105,  62, 121, 126,  63,  10,  44,
        25,  81,  52, 119,  31,  54,  69,   5,  93,   8,   2,  33, 124,
        49,  84,  40,  68,  96,  59, 122,  89,  90,  22, 108,  34,  48,
        43,  12,  86,  80,  78,  88,   9,  97, 101,  99, 104,  51,  11,
        92, 129,  85,   0])

In [70]:
data["Disease"].unique()

array([15,  4, 16,  9, 14, 33,  1, 12, 17,  6, 23, 30,  7, 32, 28, 29,  8,
       11, 37, 40, 19, 20, 21, 22,  3, 36, 10, 34, 13, 18, 39, 26, 24, 25,
       31,  5,  0,  2, 38, 35, 27])

In [71]:
lb_y.inverse_transform(data["Disease"].unique())

array(['Fungalinfection', 'Allergy', 'GERD', 'Chroniccholestasis',
       'DrugReaction', 'Pepticulcerdiseae', 'AIDS', 'Diabetes',
       'Gastroenteritis', 'BronchialAsthma', 'Hypertension', 'Migraine',
       'Cervicalspondylosis', 'Paralysis(brainhemorrhage)', 'Jaundice',
       'Malaria', 'Chickenpox', 'Dengue', 'Typhoid', 'hepatitisA',
       'HepatitisB', 'HepatitisC', 'HepatitisD', 'HepatitisE',
       'Alcoholichepatitis', 'Tuberculosis', 'CommonCold', 'Pneumonia',
       'Dimorphichemmorhoids(piles)', 'Heartattack', 'Varicoseveins',
       'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia',
       'Osteoarthristis', 'Arthritis',
       '(vertigo)ParoymsalPositionalVertigo', 'Acne',
       'Urinarytractinfection', 'Psoriasis', 'Impetigo'], dtype=object)

## Build Model

In [72]:
from sklearn.metrics import accuracy_score
#x_train = data.drop("Disease",axis=1)
#y_train = data["Disease"]
#x_valid = data.drop("Disease", axis=1)
#y_valid = test["Disease"]

#y=data["Disease"]

#Input and output
X=data.drop("Disease",axis=1)
y=data["Disease"]

In [73]:
X

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Total_weight
0,56,102,74,33,0,0,8
1,102,74,33,0,0,0,13
2,56,74,33,0,0,0,11
3,56,102,33,0,0,0,10
4,56,102,74,0,0,0,8
...,...,...,...,...,...,...,...
4915,123,45,72,105,62,121,13
4916,102,88,9,97,0,0,7
4917,18,10,44,25,0,0,15
4918,102,57,101,99,104,51,9


In [74]:
y

0       15
1       15
2       15
3       15
4       15
        ..
4915     0
4916     2
4917    38
4918    35
4919    27
Name: Disease, Length: 4920, dtype: int32

In [75]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [76]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier(random_state=123)
rf.fit(X_train, y_train)
#accuracy_score(rf.predict(x_valid), y_valid)

RandomForestClassifier(random_state=123)

In [77]:
print(data.shape)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(4920, 8)
(3936, 7)
(3936,)
(984, 7)
(984,)


In [78]:
y_pred = rf.predict(X_test)

In [79]:
from sklearn import metrics
print("RF_Model accuracy:",metrics.accuracy_score(y_pred,y_test))

RF_Model accuracy: 0.9979674796747967


In [80]:
from sklearn.model_selection import cross_val_score 
np.mean(cross_val_score(rf, X,y, cv=3 ))

0.9975609756097561

In [81]:
#from sklearn.tree import DecisionTreeClassifier 
#dt = DecisionTreeClassifier(random_state=9)
#dt.fit(X_train,y_train)

In [82]:
#Y_pred = dt.predict(X_test)

In [83]:
#print("DT_Model accuracy:",metrics.accuracy_score(Y_pred,y_test))

In [84]:
import pickle 
with open("model3.pkl", "wb") as file:
    pickle.dump(rf, file=file)