# CoVEx'19 Expert System
### An Artificial Intelligence-Based COVID-19 Expert System

### Import useful libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import confusion_matrix

Actual data source: https://www.kaggle.com/marianarfranklin/mexico-covid19-clinical-data/metadata

The original data was in Spanish and contained a lot of raw, uncleaned fields. We translated the column names into English and then cleaned the data.
The cells below therefore work with the cleaned data.

In [3]:
# Read the cleaned dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="Covid19_cleaned_data.csv")
df.head(n=5)

Unnamed: 0,Sex,Age,smoking,Fever,Cough,Pneumonia,DIABETES,Asthma,HIPERTENSION,Diarrhea,Sore_thorat,Headache,Muscle_pain,CARDIOVASCULAR,Shortness_of_breath,Abnormalities_in_smell_and_taste,Obesity,RENAL_Chronicle,Result
0,0,74,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
1,1,71,1,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0
2,0,50,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1
3,1,5,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1
4,1,8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [4]:
# List the column labels of the loaded frame (feature columns plus 'Result').
df.columns

Index(['Sex', 'Age', 'smoking', 'Fever', 'Cough', 'Pneumonia', 'DIABETES',
       'Asthma', 'HIPERTENSION', 'Diarrhea', 'Sore_thorat', 'Headache',
       'Muscle_pain', 'CARDIOVASCULAR', 'Shortness_of_breath',
       'Abnormalities_in_smell_and_taste', 'Obesity', 'RENAL_Chronicle',
       'Result'],
      dtype='object')

In [5]:
# Separate the predictors from the target column.
# Only 'Result' is dropped: every label handed to DataFrame.drop must be an
# existing column, and an unknown label such as '' raises KeyError.
x_data = df.drop(columns=['Result'])
y_data = df['Result']

In [6]:
# Predictors: every column except the target; target: the 'Result' column.
x_data = df.drop(columns=['Result'])
y_data = df.loc[:, 'Result']

# Rescale each predictor to the [0, 1] range with a min-max scaler.
# The instance is named `MinMaxScaler`; the class itself stays reachable
# through the `preprocessing` module.
MinMaxScaler = preprocessing.MinMaxScaler()
X_data_minmax = MinMaxScaler.fit(x_data).transform(x_data)

# Wrap the scaled array in a DataFrame (columns get default integer labels).
# NOTE(review): the later cells visible in this notebook split `x_data`
# directly, so this min-max scaled copy appears to be unused — confirm.
data = pd.DataFrame(data=X_data_minmax)
# The preview below shows the original frame `df`, not the scaled `data`.
df.head(n=5)

Unnamed: 0,Sex,Age,smoking,Fever,Cough,Pneumonia,DIABETES,Asthma,HIPERTENSION,Diarrhea,Sore_thorat,Headache,Muscle_pain,CARDIOVASCULAR,Shortness_of_breath,Abnormalities_in_smell_and_taste,Obesity,RENAL_Chronicle,Result
0,0,74,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
1,1,71,1,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0
2,0,50,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1
3,1,5,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1
4,1,8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Display the target labels of the held-out split.
y_test

1935     0
6494     1
1720     0
9120     1
360      0
        ..
1195     0
11876    0
5421     0
3855     0
4414     0
Name: Result, Length: 2400, dtype: int64

In [9]:
# Standardise the features so that no single variable dominates the others
# purely because of its scale.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the scaling statistics from the training split only ...
sc.fit(X_train)
# ... then apply that same fitted transform to both splits.
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [10]:
# Fit a shallow decision tree (depth 2) on the standardised training split.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# permutes features randomly at each split even with the default 'best'
# splitter, so an unseeded tree is not guaranteed to be reproducible.
dtree_model = DecisionTreeClassifier(max_depth=2, random_state=42).fit(X_train, y_train)

# Predicted class labels for the held-out split.
dtree_predictions = dtree_model.predict(X_test)

In [11]:
# Show (rows, features) of the scaled held-out split.
X_test.shape

(2400, 18)

In [12]:
# Mean accuracy of the decision tree on the held-out split.
accuracy = dtree_model.score(X=X_test, y=y_test)
accuracy

0.8308333333333333

In [13]:
# Confusion matrix of the decision tree: rows are true labels, columns are predictions.
# NOTE(review): in the recorded run the second column is all zeros, i.e. the
# tree never predicted class 1 — the accuracy above should be read with that in mind.
cm = confusion_matrix(y_true=y_test, y_pred=dtree_predictions)
cm

array([[1994,    0],
       [ 406,    0]], dtype=int64)

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier with default settings on the
# standardised training split; fit() returns the estimator itself.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
knn_clf

KNeighborsClassifier()

In [15]:
# Predicted class labels of the KNN model for the held-out split.
ypred = knn_clf.predict(X=X_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Confusion matrix of the KNN predictions (rows: true labels, columns: predictions).
result = confusion_matrix(y_true=y_test, y_pred=ypred)
print('Confusion Matrix:', result, sep='\n')

Confusion Matrix:
[[1906   88]
 [ 380   26]]


In [17]:
# Fraction of held-out rows the KNN model labelled correctly.
result2 = accuracy_score(y_true=y_test, y_pred=ypred)
print('Accuracy:', result2)

Accuracy: 0.805


In [30]:
# Classify one hand-built patient record with the KNN model.
# knn_clf was fitted on features standardised by `sc`, so the raw record has to
# go through the same fitted scaler first; otherwise unscaled values (e.g. Age)
# are compared against standardised training points and the distances are meaningless.
# NOTE: x_test_data is created by the cell with execution count In [29], which
# appears further down in this file — that cell must be run before this one.
knn_clf.predict(sc.transform(x_test_data))

array([1], dtype=int64)

In [31]:
# Spot-check the KNN predictions for ten rows (positions 30-39) of the scaled test split.
knn_clf.predict(X=X_test[30:40])

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [35]:
# Per-class precision, recall and F1 of the KNN predictions.
result1 = classification_report(y_true=y_test, y_pred=ypred)
print('Classification Report:', result1, sep='\n')

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1994
           1       0.22      0.05      0.08       406

    accuracy                           0.81      2400
   macro avg       0.53      0.51      0.49      2400
weighted avg       0.73      0.81      0.76      2400



In [44]:
import pickle
from sklearn.pipeline import Pipeline

# The tree was trained on features standardised by `sc`. Saving the tree alone
# would force every consumer of the pickle to re-create that exact scaling;
# feeding it raw feature values gives invalid predictions. Bundle the already
# fitted scaler and tree in a Pipeline so the saved object accepts raw features
# and still exposes the same .predict() interface.
symptoms_model = Pipeline(steps=[('scaler', sc), ('tree', dtree_model)])

# save
with open('symptoms_model.pkl', 'wb') as f:
    pickle.dump(symptoms_model, f)

In [29]:
# One hand-written patient record, keyed by the same feature names (and in the
# same order) as the training columns.
patient_record = {
    'Sex': 0, 'Age': 54, 'smoking': 0, 'Fever': 1, 'Cough': 1, 'Pneumonia': 0,
    'DIABETES': 0, 'Asthma': 0, 'HIPERTENSION': 0, 'Diarrhea': 1,
    'Sore_thorat': 1, 'Headache': 1, 'Muscle_pain': 0, 'CARDIOVASCULAR': 0,
    'Shortness_of_breath': 1, 'Abnormalities_in_smell_and_taste': 0,
    'Obesity': 0, 'RENAL_Chronicle': 0,
}
# Turn the record into a one-row DataFrame with row label 0.
x_test_data = pd.DataFrame(patient_record, index=[0])
x_test_data

Unnamed: 0,Sex,Age,smoking,Fever,Cough,Pneumonia,DIABETES,Asthma,HIPERTENSION,Diarrhea,Sore_thorat,Headache,Muscle_pain,CARDIOVASCULAR,Shortness_of_breath,Abnormalities_in_smell_and_taste,Obesity,RENAL_Chronicle
0,0,54,0,1,1,0,0,0,0,1,1,1,0,0,1,0,0,0


In [11]:
# load
# pickle.load can execute arbitrary code, so only open pickle files you trust.
import pickle
with open('symptoms_model.pkl', 'rb') as model_file:
    clf2 = pickle.load(model_file)

# NOTE(review): x_test_data is passed with raw (unscaled) feature values, so the
# unpickled estimator must itself apply whatever scaling was used at training
# time — confirm before relying on this prediction.
clf2.predict(x_test_data)

array([0], dtype=int64)