In [151]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [59]:
data = pd.read_csv('dataset.csv')
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3
0,Fungal infection,itching,skin_rash,vomiting
1,Fungal infection,skin_rash,skin_rash,vomiting
2,Fungal infection,itching,skin_rash,vomiting
3,Fungal infection,itching,skin_rash,vomiting
4,Fungal infection,itching,skin_rash,vomiting


In [138]:
val = {'itching' : 0, 
       'skin_rash' : 1, 
       'continuous_sneezing' : 2, 
       'shivering' : 3, 
       'stomach_pain' : 4, 
       'acidity' : 5, 
       'vomiting' : 6, 
       'indigestion' : 7,  
       'muscle_wasting' : 8, 
       'indigestion' : 9, 
       'fatigue' : 10}

df = pd.DataFrame(columns=['Index']+list(val.keys())+['Disease'])
df.head()

Unnamed: 0,Index,itching,skin_rash,continuous_sneezing,shivering,stomach_pain,acidity,vomiting,indigestion,muscle_wasting,fatigue,Disease


In [142]:
for row in range(data.shape[0]):
    new_row = pd.Series([False] * len(df.columns), index=df.columns)
    new_row['Index'] = row
    new_row['Disease'] = data.iloc[row]['Disease']
    new_row[data.iloc[row]['Symptom_1'].strip()] = True
    new_row[data.iloc[row]['Symptom_2'].strip()] = True
    new_row[data.iloc[row]['Symptom_3'].strip()] = True

    df = pd.concat([df, new_row.to_frame().T], ignore_index=True)

In [204]:
x = df.drop(['Index','Disease'], axis=1)
y = df['Disease']

x1, x2, y1, y2 = train_test_split(x, y, test_size = 0.2)

model = RandomForestClassifier(random_state=42).fit(x1, y1)
print(classification_report(y2, model.predict(x2)))

                     precision    recall  f1-score   support

               AIDS       1.00      1.00      1.00         3
            Allergy       1.00      1.00      1.00         1
   Bronchial Asthma       0.33      1.00      0.50         1
Chronic cholestasis       1.00      1.00      1.00         2
          Diabetes        1.00      0.33      0.50         3
      Drug Reaction       1.00      1.00      1.00         3
   Fungal infection       1.00      1.00      1.00         1
               GERD       1.00      1.00      1.00         2
    Gastroenteritis       1.00      1.00      1.00         4

           accuracy                           0.90        20
          macro avg       0.93      0.93      0.89        20
       weighted avg       0.97      0.90      0.90        20



In [214]:
x = df.drop(['Index','Disease'], axis=1)
y = df['Disease']

x1, x2, y1, y2 = train_test_split(x, y, test_size = 0.3)

model = DecisionTreeClassifier().fit(x1, y1)
print(classification_report(y2, model.predict(x2)))

                     precision    recall  f1-score   support

               AIDS       0.50      1.00      0.67         1
            Allergy       1.00      1.00      1.00         4
   Bronchial Asthma       0.57      1.00      0.73         4
Chronic cholestasis       0.00      0.00      0.00         6
          Diabetes        0.00      0.00      0.00         3
      Drug Reaction       1.00      0.67      0.80         3
   Fungal infection       1.00      1.00      1.00         4
               GERD       1.00      1.00      1.00         2
    Gastroenteritis       1.00      1.00      1.00         1
Peptic ulcer diseae       0.25      1.00      0.40         2

           accuracy                           0.67        30
          macro avg       0.63      0.77      0.66        30
       weighted avg       0.58      0.67      0.59        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [215]:
print(sklearn.tree.export_text(model, feature_names=x.columns))

|--- acidity <= 0.50
|   |--- vomiting <= 0.50
|   |   |--- itching <= 0.50
|   |   |   |--- fatigue <= 0.50
|   |   |   |   |--- muscle_wasting <= 0.50
|   |   |   |   |   |--- class: AIDS
|   |   |   |   |--- muscle_wasting >  0.50
|   |   |   |   |   |--- class: AIDS
|   |   |   |--- fatigue >  0.50
|   |   |   |   |--- continuous_sneezing <= 0.50
|   |   |   |   |   |--- class: Diabetes 
|   |   |   |   |--- continuous_sneezing >  0.50
|   |   |   |   |   |--- class: Bronchial Asthma
|   |   |--- itching >  0.50
|   |   |   |--- class: Drug Reaction
|   |--- vomiting >  0.50
|   |   |--- stomach_pain <= 0.50
|   |   |   |--- indigestion <= 0.50
|   |   |   |   |--- skin_rash <= 0.50
|   |   |   |   |   |--- class: Chronic cholestasis
|   |   |   |   |--- skin_rash >  0.50
|   |   |   |   |   |--- continuous_sneezing <= 0.50
|   |   |   |   |   |   |--- shivering <= 0.50
|   |   |   |   |   |   |   |--- class: Fungal infection
|   |   |   |   |   |   |--- shivering >  0.50
|   |   |

In [174]:
import sklearn
help(sklearn.tree.export_text)

Help on function export_text in module sklearn.tree._export:

export_text(decision_tree, *, feature_names=None, class_names=None, max_depth=10, spacing=3, decimals=2, show_weights=False)
    Build a text report showing the rules of a decision tree.
    
    Note that backwards compatibility may not be supported.
    
    Parameters
    ----------
    decision_tree : object
        The decision tree estimator to be exported.
        It can be an instance of
        DecisionTreeClassifier or DecisionTreeRegressor.
    
    feature_names : array-like of shape (n_features,), default=None
        An array containing the feature names.
        If None generic names will be used ("feature_0", "feature_1", ...).
    
    class_names : array-like of shape (n_classes,), default=None
        Names of each of the target classes in ascending numerical order.
        Only relevant for classification and not supported for multi-output.
    
        - if `None`, the class names are delegated to `decis