In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import itertools
plt.style.use('fivethirtyeight')

In [2]:
covidData = pd.read_csv('Cleaned-Data.csv')
rawData = pd.read_csv('Raw-Data.csv')

In [3]:
newData = pd.read_csv('datasets_575188_1041634_modified_COVID19_open_line_list.csv')


In [4]:
newData.outcome.unique()

array([0, 1, 2, 3], dtype=int64)

In [5]:
newData["symptom"] = None

In [6]:
newData.loc[newData['outcome'] == 1, ['symptom']] = "Negatif Covid mild"
newData.loc[newData.outcome == 0, ['symptom']] = "Negatif Covid"
newData.loc[newData.outcome == 2, ['symptom']] = "Positif Covid moder"
newData.loc[newData.outcome == 3, ['symptom']] = "Positif Covid sever"



In [7]:
 training_dataset = newData[['age', 'gender', 'onset_symptoms_to_admission_hospital',
       'admission_hospital_to_confirmation', 'fever', 'caugh', 'chills',
       'nausea', 'dyspnea', 'anorexia', 'pneumonitis', 'rhinorrhea',
       'diarrhea', 'fatigue', 'sore muscle', 'sore throat',
       'respiratory symptoms', 'headache', 'weakness', 'dizziness',
       'pleural effusion', ' chest pain', 'symptom']]

In [8]:

corr_matrix = training_dataset.corr()


In [9]:
cor_target = abs(corr_matrix)
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.4]
relevant_features

Unnamed: 0,age,gender,onset_symptoms_to_admission_hospital,admission_hospital_to_confirmation,fever,caugh,chills,nausea,dyspnea,anorexia,...,diarrhea,fatigue,sore muscle,sore throat,respiratory symptoms,headache,weakness,dizziness,pleural effusion,chest pain
age,1.0,,,,,,,,,,...,,,,,,,,,,
gender,,1.0,,,,,,,,,...,,,,,,,,,,
onset_symptoms_to_admission_hospital,,,1.0,,,,,,,,...,,,,,,,,,,
admission_hospital_to_confirmation,,,,1.0,,,,,,,...,,,,,,,,,,
fever,,,,,1.0,,,,,,...,,,,,,,,,,
caugh,,,,,,1.0,,,,,...,,,,,,,,,,
chills,,,,,,,1.0,,,,...,,,,,,,,,,
nausea,,,,,,,,1.0,,,...,,,,,,,,,,
dyspnea,,,,,,,,,1.0,,...,,,,,,,,,,
anorexia,,,,,,,,,,1.0,...,,,,,,,,,,


In [10]:
training_data = training_dataset.iloc[:,:-1]

In [11]:
corr = training_data.corr()

In [12]:
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.8:
            if columns[j]:
                columns[j] = False
selected_columns = training_data.columns[columns]
training_data = training_data[selected_columns]

In [13]:
data = training_data

In [14]:
selected_columns = selected_columns[1:].values
import statsmodels.formula.api as sm
def backwardElimination(x, Y, sl, columns):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(Y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    x = np.delete(x, j, 1)
                    columns = np.delete(columns, j)
                    
    regressor_OLS.summary()
    return x, columns
SL = 0.05
data_modeled, selected_columns = backwardElimination(data.iloc[:,1:].values, data.iloc[:,0].values, SL, selected_columns)

In [15]:
result = pd.DataFrame()
result['diagnosis'] = data.iloc[:,0]

In [16]:
data = pd.DataFrame(data = data_modeled, columns = selected_columns)

In [17]:
data

Unnamed: 0,gender,onset_symptoms_to_admission_hospital,admission_hospital_to_confirmation,fever,caugh,pneumonitis,fatigue,chest pain
0,0,0,1,1,0,0,0,0
1,0,5,2,1,0,0,0,0
2,0,0,1,0,1,0,0,0
3,1,0,6,1,0,0,0,0
4,1,6,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...
295,0,3,2,1,0,0,0,0
296,0,10,12,0,0,0,1,0
297,1,7,12,0,1,0,0,0
298,0,12,1,1,1,0,0,0


In [18]:
# Dimensionality Reduction for removing redundancies
dimensionality_reduction = training_dataset.groupby(training_dataset['symptom']).max()

In [19]:
dimensionality_reduction

Unnamed: 0_level_0,age,gender,onset_symptoms_to_admission_hospital,admission_hospital_to_confirmation,fever,caugh,chills,nausea,dyspnea,anorexia,...,diarrhea,fatigue,sore muscle,sore throat,respiratory symptoms,headache,weakness,dizziness,pleural effusion,chest pain
symptom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Negatif Covid,96,1,32,17,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Negatif Covid mild,73,1,13,11,1,1,1,0,1,0,...,1,1,0,1,1,1,1,0,0,0
Positif Covid moder,85,1,10,12,1,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
Positif Covid sever,67,1,12,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:

# Slicing and Dicing the dataset to separate features from predictions
X = training_dataset.iloc[:, 0:21].values
y = training_dataset.iloc[:, -1].values

In [21]:
# Encoding String values to integer constants
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

In [22]:

# Splitting the dataset into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [23]:
# Implementing the Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
y_predict = classifier.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict)

0.9066666666666666

In [25]:
# Saving the information of columns
cols     = training_dataset.columns
cols     = cols[:-1]

In [26]:

# Checking the Important features
importances = classifier.feature_importances_
indices = np.argsort(importances)[::-1]
features = cols

In [27]:
features

Index(['age', 'gender', 'onset_symptoms_to_admission_hospital',
       'admission_hospital_to_confirmation', 'fever', 'caugh', 'chills',
       'nausea', 'dyspnea', 'anorexia', 'pneumonitis', 'rhinorrhea',
       'diarrhea', 'fatigue', 'sore muscle', 'sore throat',
       'respiratory symptoms', 'headache', 'weakness', 'dizziness',
       'pleural effusion', ' chest pain'],
      dtype='object')

In [28]:

# Implementing the Visual Tree
from sklearn.tree import _tree

In [31]:
def chatbot():

    print("Reponsez yes/Yes or no/No pour les symptomes") 
    def print_disease(node):
        #print(node)
        node = node[0]
        #print(len(node))
        val  = node.nonzero() 
        #print(val)
        disease = labelencoder.inverse_transform(val[0])
        return disease
    def tree_to_code(tree, feature_names):
        tree_ = tree.tree_
        #print(tree_)
        feature_name = [
            feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
            for i in tree_.feature
        ]
        #print("def tree({}):".format(", ".join(feature_names)))
        symptoms_present = []
        def recurse(node, depth):
            indent = "  " * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print(name + " ?")
                ans = input()
                ans = ans.lower()
                if ans == 'yes':
                    val = 1
                else:
                    val = 0
                if  val <= threshold:
                    recurse(tree_.children_left[node], depth + 1)
                else:
                    symptoms_present.append(name)
                    recurse(tree_.children_right[node], depth + 1)
            else:
                present_disease = print_disease(tree_.value[node])
                print( "D'après les symptomes vous avez " +  present_disease )
                red_cols = dimensionality_reduction.columns 
                symptoms_given = red_cols[dimensionality_reduction.loc[present_disease].values[0].nonzero()]
                print("symptoms present  " + str(list(symptoms_present)))
                print("symptoms given "  +  str(list(symptoms_given)) )  
                confidence_level = (1.0*len(symptoms_present))/len(symptoms_given)
                print("indice de confiance " + str(confidence_level))
    
        recurse(0, 1)
    tree_to_code(classifier,cols)

In [30]:
chatbot() 


Reponsez yes/Yes or no/No pour les symptomes
admission_hospital_to_confirmation ?
no
chills ?
yes
admission_hospital_to_confirmation ?
no
age ?
60
['You may have Negatif Covid mild']
symptoms present  ['chills']
symptoms given ['age', 'gender', 'onset_symptoms_to_admission_hospital', 'admission_hospital_to_confirmation', 'fever', 'caugh', 'chills', 'dyspnea', 'pneumonitis', 'rhinorrhea', 'diarrhea', 'fatigue', 'sore throat', 'respiratory symptoms', 'headache', 'weakness']
confidence level is 0.0625
