In [None]:
'''
Documentation: 
Topic: Building a Classifier Using Decision Tree to predict the COVID-19 Severity
Dataset: https://www.kaggle.com/iamhungundji/covid19-symptoms-checker/version/2
Methodology:
1. Load the labeled data
2. Split the data into training and test sets
3. Train a Decision Tree classifier
4. Compute the accuracy score and confusion matrix
'''

In [1]:
#importing all necessary libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt
# Load the cleaned symptom dataset (path is relative to the working directory).
df = pd.read_csv('Cleaned-Data.csv')

# Show every column name present in the dataset.
col_names = list(df.columns)
print(col_names, '\n')

# Symptom indicator columns used as model inputs.
feature_cols = [
    'Fever',
    'Tiredness',
    'Dry-Cough',
    'Difficulty-in-Breathing',
    'Sore-Throat',
    'None_Sympton',
    'Pains',
    'Nasal-Congestion',
    'Runny-Nose',
    'Diarrhea',
    'None_Experiencing',
]

# Feature matrix: one column per entry of feature_cols.
X = df[feature_cols]
print(X)

# Target vector: the Severity_Moderate column (the other Severity_* columns
# could be chosen instead).
y = df['Severity_Moderate']
print(y)

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=54,
)

# Build an entropy-criterion decision tree and fit it on the training rows
# (fit() returns the fitted estimator itself).
clf = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = clf.predict(X_test)

# Fraction of held-out rows that were predicted correctly.
print(
    '\nAccuracy Score on Test Data :',
    metrics.accuracy_score(y_test, y_pred),
)

# Confusion matrix: rows are the true labels, columns are the predicted labels.
print('\nConfusion matrix:')
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

# Pull the four cells out by [row, column]; later cells use these counts.
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

['Fever', 'Tiredness', 'Dry-Cough', 'Difficulty-in-Breathing', 'Sore-Throat', 'None_Sympton', 'Pains', 'Nasal-Congestion', 'Runny-Nose', 'Diarrhea', 'None_Experiencing', 'Age_0-9', 'Age_10-19', 'Age_20-24', 'Age_25-59', 'Age_60+', 'Gender_Female', 'Gender_Male', 'Gender_Transgender', 'Severity_Mild', 'Severity_Moderate', 'Severity_None', 'Severity_Severe', 'Contact_Dont-Know', 'Contact_No', 'Contact_Yes', 'Country'] 

        Fever  Tiredness  Dry-Cough  Difficulty-in-Breathing  Sore-Throat  \
0           1          1          1                        1            1   
1           1          1          1                        1            1   
2           1          1          1                        1            1   
3           1          1          1                        1            1   
4           1          1          1                        1            1   
...       ...        ...        ...                      ...          ...   
316795      0          0          0    

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN if the denominator is 0.

    Every ratio below is a count divided by a total that includes that count,
    so a zero denominator always means 0/0. Returning NaN explicitly avoids the
    division-by-zero warning/error while still marking the metric as undefined.
    """
    if denominator == 0:
        return float('nan')
    return numerator / float(denominator)


# Each metric is computed by hand from the confusion-matrix counts (TP, TN,
# FP, FN come from the cell that builds the confusion matrix) and then printed
# next to the corresponding sklearn value as a cross-check.
total_predictions = TP + TN + FP + FN

# Accuracy: share of all predictions that are correct.
print(_safe_ratio(TP + TN, total_predictions))
print('Accuracy Score on Confusion Matrix:', metrics.accuracy_score(y_test, y_pred))

# Classification error: share of all predictions that are wrong (1 - accuracy).
classification_error = _safe_ratio(FP + FN, total_predictions)
print(classification_error)
print('Error Score on Confusion Matrix:', 1 - metrics.accuracy_score(y_test, y_pred))

# Sensitivity (recall): share of actual positives that were predicted positive.
# The denominator is TP + FN (all actual positives); the previous version
# divided by TN + FP, which is the specificity denominator.
sensitivity = _safe_ratio(TP, TP + FN)
sensitiviy = sensitivity  # misspelled name kept as an alias for any cell that still reads it
print(sensitivity)
print('Sensitivity Score on Confusion Matrix:', metrics.recall_score(y_test, y_pred))

# Specificity: share of actual negatives that were predicted negative.
# No sklearn call is used here, so the hand-computed value is printed twice.
specificity = _safe_ratio(TN, TN + FP)
print(specificity)
print('Specificity Score on Confusion  Matrix:', specificity)

# Precision: share of predicted positives that are actually positive.
# Undefined (NaN) when the model predicts no positives at all.
precision = _safe_ratio(TP, TP + FP)
print(precision)
print('Precision Score on Confusion Matrix:', metrics.precision_score(y_test, y_pred))

In [None]:
#THE ABOVE CELL GIVES US A RED BOX DUE TO A 0/0 DIVISION