In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from keras.metrics import categorical_accuracy
import tensorflow as tf
import glob
import os
from joblib import dump

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# notebook can read the input CSV and write the saved artifacts there.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Drive folder that holds the input CSV and receives the saved scaler/model files.
DISEASE_DIR = "/content/drive/My Drive/Colab Notebooks/"

# Loading and Preprocessing Disease_Symptom Encoded Data

In [4]:
# Read the disease/symptom CSV from the Drive folder into a DataFrame
disease_path = os.path.join(
    DISEASE_DIR,
    "Disease_symptom_and_patient_pro.csv",
)
disease_df = pd.read_csv(disease_path, encoding="utf-8")

# Display the loaded frame for a first visual check
disease_df

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
...,...,...,...,...,...,...,...,...,...,...
344,Stroke,Yes,No,Yes,No,80,Female,High,High,Positive
345,Stroke,Yes,No,Yes,No,85,Male,High,High,Positive
346,Stroke,Yes,No,Yes,No,85,Male,High,High,Positive
347,Stroke,Yes,No,Yes,No,90,Female,High,High,Positive


In [5]:
# Inspect column dtypes and non-null counts before transforming the data
disease_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               349 non-null    object
 1   Fever                 349 non-null    object
 2   Cough                 349 non-null    object
 3   Fatigue               349 non-null    object
 4   Difficulty Breathing  349 non-null    object
 5   Age                   349 non-null    int64 
 6   Gender                349 non-null    object
 7   Blood Pressure        349 non-null    object
 8   Cholesterol Level     349 non-null    object
 9   Outcome Variable      349 non-null    object
dtypes: int64(1), object(9)
memory usage: 27.4+ KB


In [6]:
# Count the distinct values in each column
disease_df.nunique()

Disease                 116
Fever                     2
Cough                     2
Fatigue                   2
Difficulty Breathing      2
Age                      26
Gender                    2
Blood Pressure            3
Cholesterol Level         3
Outcome Variable          2
dtype: int64

In [7]:
# #initiate scaling
# disease_data_scaled = StandardScaler().fit_transform(disease_df[["Age"]])
# #create df with scaled data
# disease_scaled_df = pd.DataFrame(disease_data_scaled,columns=["Age"])
# #show the newly scaled df
# disease_scaled_df.head()

In [8]:
# hot_df = pd.get_dummies(disease_df[['Outcome Variable','Gender','Difficulty Breathing','Fever','Cough','Fatigue','Blood Pressure','Cholesterol Level']])
# hot_df

In [9]:
# Label-encode every categorical column except "Disease" (which is dropped from
# the feature set later); "Age" is already numeric and is left untouched.
#
# A separate LabelEncoder is fitted for each column and stored in
# `label_encoders`, so the string <-> integer mapping of every column stays
# available afterwards (e.g. `label_encoders["Gender"].classes_` or
# `.inverse_transform(...)`). Re-fitting a single shared encoder only retains
# the mapping of the last column that was encoded.
CATEGORICAL_COLUMNS = [
    'Outcome Variable',
    'Gender',
    'Difficulty Breathing',
    'Fever',
    'Cough',
    'Fatigue',
    'Blood Pressure',
    'Cholesterol Level',
]

label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    # LabelEncoder assigns integer codes following the sorted order of the labels
    label_encoder = LabelEncoder()
    disease_df[column] = label_encoder.fit_transform(disease_df[column])
    label_encoders[column] = label_encoder

# Review the encoded DataFrame
disease_df

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,1,0,1,1,19,0,1,2,1
1,Common Cold,0,1,1,0,25,0,2,2,0
2,Eczema,0,1,1,0,25,0,2,2,0
3,Asthma,1,1,0,1,25,1,2,2,1
4,Asthma,1,1,0,1,25,1,2,2,1
...,...,...,...,...,...,...,...,...,...,...
344,Stroke,1,0,1,0,80,0,0,0,1
345,Stroke,1,0,1,0,85,1,0,0,1
346,Stroke,1,0,1,0,85,1,0,0,1
347,Stroke,1,0,1,0,90,0,0,0,1


In [10]:
# disease_concat_df = pd.concat([disease_df,hot_df],axis=1)
# disease_concat_df.head()

In [11]:
# Define features set
X = disease_df.copy()
X.drop(["Outcome Variable","Disease"], axis=1, inplace=True)
X.head()



# y = disease_concat_df["Disease_"].values
# X = disease_concat_df.drop(["Disease"],1).values
# X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level
0,1,0,1,1,19,0,1,2
1,0,1,1,0,25,0,2,2
2,0,1,1,0,25,0,2,2
3,1,1,0,1,25,1,2,2
4,1,1,0,1,25,1,2,2


In [12]:
# Target vector: the encoded outcome reshaped into a single-column 2-D array
outcome_values = disease_df["Outcome Variable"].to_numpy()
y = outcome_values.reshape(-1, 1)

# Preview the first five target values
y[:5]

array([[1],
       [0],
       [0],
       [1],
       [1]])

In [13]:
# Split features and target into train and test sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state=12 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=12)


In [14]:
# Create the StandardScaler instance used to standardise the feature columns
scaler = StandardScaler()

In [15]:
# Fit the StandardScaler on the training split only, so the test split
# does not contribute to the scaling statistics
scaler.fit(X_train)

In [16]:
# Apply the fitted scaler to both splits (training split first, then test split)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Fitting the Decision Tree Model

In [17]:
# Create the decision tree classifier instance.
# random_state is fixed because the tree's split search permutes the features
# randomly; without a seed, repeated runs can produce different trees and
# therefore different accuracy scores and saved models.
model = tree.DecisionTreeClassifier(random_state=12)

In [18]:
# Fit the decision tree on the scaled training features and training targets
model = model.fit(X_train_scaled, y_train)

# Making Predictions Using the Tree Model


In [19]:
# Predict the outcome for every row of the scaled test split
predictions = model.predict(X_test_scaled)

# Model Evaluation


In [20]:
# Use one explicit, sorted label list for both axes of the confusion matrix.
# confusion_matrix orders rows and columns by sorted label, whereas
# Series.unique() returns labels in order of first appearance ([1, 0] for this
# data), which attached the column names to the wrong columns.
disease_labels = sorted(disease_df["Outcome Variable"].unique().tolist())

# Calculate the confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, predictions, labels=disease_labels)

# Convert the confusion matrix to a labelled DataFrame
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in disease_labels],
    columns=[f"Predicted {label}" for label in disease_labels],
)

# Calculate the accuracy score on the test split
acc_score = accuracy_score(y_test, predictions)

acc_score



0.8068181818181818

# Save the Model

In [21]:
# Persist the fitted scaler next to the data so the same scaling can be
# reapplied at prediction time; dump() returns the list of files written.
scaler_path = os.path.join(DISEASE_DIR, "std_scaler.bin")
dump(scaler, scaler_path, compress=True)


['/content/drive/My Drive/Colab Notebooks/std_scaler.bin']

In [22]:
# Persist the fitted decision tree; dump() returns the list of files written.
model_path = os.path.join(DISEASE_DIR, "model.bin")
dump(model, model_path, compress=True)

['/content/drive/My Drive/Colab Notebooks/model.bin']

# Test/Practice

In [23]:
# Repeat the scale -> predict steps on the test split and show the
# predicted labels (same calls as in the evaluation section)
X_test_scaled = scaler.transform(X_test)
predictions = model.predict(X_test_scaled)
predictions

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [24]:
# Check which container type the test features are held in
type(X_test)

pandas.core.frame.DataFrame

In [25]:
# List the feature names, in the column order the scaler was fitted on
X_test.columns

Index(['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age', 'Gender',
       'Blood Pressure', 'Cholesterol Level'],
      dtype='object')

In [26]:
# Look at the first two test samples
X_test.head(2)

Unnamed: 0,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level
67,0,0,1,0,35,1,2,2
174,0,0,1,0,45,1,0,0


In [34]:
# Hand-built single sample used to try the scale -> predict path on one record.
# NOTE(review): these values equal the X_test row with index label 67 shown in
# the previous cell's output, although the variable is named row_76; this is
# presumably a transposed digit. The name is kept because later cells use it.
row_76 = {
    "Fever": 0,
    "Cough": 0,
    "Fatigue": 1,
    "Difficulty Breathing": 0,
    "Age": 35,
    "Gender": 1,
    "Blood Pressure": 2,
    "Cholesterol Level": 2,
}

# One-row frame whose column order matches the dict's key order
row_76_df = pd.DataFrame(data=row_76, index=[0])

# Scale with the already-fitted scaler before predicting
row_76_scaled = scaler.transform(row_76_df)

In [37]:
# predict() returns an array with one entry per input row; take the single label
model.predict(row_76_scaled)[0]

0

In [57]:
# Show one sample as a single-column frame (feature names as the row index).
# `row` was not defined by any cell in the notebook, so this cell failed on a
# fresh kernel. It is now taken explicitly from the feature matrix; index label
# 340 matches the column header of the output recorded for this cell.
row = X.loc[340]
pd.DataFrame(row)

Unnamed: 0,340
Fever,0
Cough,0
Fatigue,1
Difficulty Breathing,0
Age,70
Gender,0
Blood Pressure,0
Cholesterol Level,0


In [61]:
# Predict every test sample on its own (record -> one-row frame -> scale ->
# predict) so the results can be compared with the batch predictions.
dicts = X_test.to_dict('records')
new_pred = [
    model.predict(scaler.transform(pd.DataFrame(record, index=[0])))[0]
    for record in dicts
]

In [65]:
# Confirm the row-by-row predictions equal the batch predictions.
# np.array_equal compares shape and values and returns a single bool; the
# all(list == ndarray) form fails with an error instead of returning False
# when the two sequences differ in length.
np.array_equal(new_pred, predictions)

True

In [68]:
# Print the joblib version that wrote std_scaler.bin / model.bin.
# NOTE(review): presumably recorded so the files can be loaded in a matching
# environment; confirm.
from joblib import __version__ as joblib_version
print(joblib_version)

1.3.2
