In [1]:
#import Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns",None)

In [6]:
#Creating Dataframe
data_covid = pd.read_excel('Covid_Dataset.xlsx', header=0)
data_covid.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,corona_result
0,0,0,0,0,0,Yes,female,Contact with confirmed,positive
1,1,0,0,0,0,Yes,male,Contact with confirmed,positive
2,0,0,0,0,0,Yes,male,Abroad,positive
3,1,1,0,0,0,Yes,male,Other,positive
4,1,1,0,0,0,Yes,female,Contact with confirmed,positive


In [8]:
data_covid.shape

(2137, 9)

In [9]:
data_covid.describe(include="all")

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,test_indication,corona_result
count,2137.0,2137.0,2137.0,2137.0,2137.0,2137,2137,2137,2137
unique,,,,,,2,3,3,2
top,,,,,,No,female,Other,positive
freq,,,,,,1159,1064,1290,1105
mean,0.419747,0.27328,0.038372,0.037436,0.058493,,,,
std,0.493633,0.445748,0.192137,0.189871,0.234729,,,,
min,0.0,0.0,0.0,0.0,0.0,,,,
25%,0.0,0.0,0.0,0.0,0.0,,,,
50%,0.0,0.0,0.0,0.0,0.0,,,,
75%,1.0,1.0,0.0,0.0,0.0,,,,


**Pre processing the data**

In [10]:
# create a copy of the dataframe
data_covid_rev = pd.DataFrame.copy(data_covid)

In [11]:
data_covid_rev.duplicated().sum()

1925

In [12]:
# Handle the missing value

In [13]:
data_covid_rev.isnull().sum()

cough                  0
fever                  0
sore_throat            0
shortness_of_breath    0
head_ache              0
age_60_and_above       0
gender                 0
test_indication        0
corona_result          0
dtype: int64

In [14]:
data_covid_rev.dtypes

cough                   int64
fever                   int64
sore_throat             int64
shortness_of_breath     int64
head_ache               int64
age_60_and_above       object
gender                 object
test_indication        object
corona_result          object
dtype: object

In [17]:
# convert categorical data into numerical
colname=[]
for x in data_covid_rev.columns:
    if data_covid_rev[x].dtype=='object':
        colname.append(x)
colname

['age_60_and_above', 'gender', 'test_indication', 'corona_result']

In [18]:
# For preprocessing the data
from sklearn.preprocessing import LabelEncoder        
 
le=LabelEncoder()
 
for x in colname:
    data_covid_rev[x]=le.fit_transform(data_covid_rev[x])

In [19]:
# negative --> 0 
# positive --> 1

In [20]:
# Create X and Y
X = data_covid_rev.values[:,0:-1]  #--> 0 to -2
Y = data_covid_rev.values[:,-1]

# .values will return an array whereas .loc will return a df object
# arrays are lighter in weigth which indirectly help the model to build faster.

In [21]:
print(X.shape)
print(Y.shape)

(2137, 8)
(2137,)


In [22]:
# Before Building the model, Do Scaling because it will improve the model.
from sklearn.preprocessing import StandardScaler
 
scaler = StandardScaler()
 
scaler.fit(X)
X = scaler.transform(X)

**Running a basic model**

# Model using Logistic Regression

In [23]:
from sklearn.model_selection import train_test_split
 
#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,  # Default value -> test_size= 0.25
                                                    random_state=10)

In [24]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(1709, 8)
(1709,)
(428, 8)
(428,)


In [26]:
from sklearn.linear_model import LogisticRegression
#create a model object
classifier = LogisticRegression()
#train the model object
classifier.fit(X_train,Y_train)      # fit is the function that is used for training the data

Y_pred = classifier.predict(X_test)
#print(Y_pred)

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[161  49]
 [ 93 125]]
Classification report: 
              precision    recall  f1-score   support

           0       0.63      0.77      0.69       210
           1       0.72      0.57      0.64       218

    accuracy                           0.67       428
   macro avg       0.68      0.67      0.67       428
weighted avg       0.68      0.67      0.67       428

Accuracy of the model:  0.6682242990654206


## Tunning

## Adjusting the Threshold

In [28]:
# store the predicted probabilities
y_pred_prob = classifier.predict_proba(X_test)
print(y_pred_prob)

[[5.43891575e-01 4.56108425e-01]
 [4.05849970e-01 5.94150030e-01]
 [2.78398734e-01 7.21601266e-01]
 [3.07572236e-01 6.92427764e-01]
 [5.25488360e-01 4.74511640e-01]
 [3.67310049e-01 6.32689951e-01]
 [5.25346586e-01 4.74653414e-01]
 [7.33999306e-01 2.66000694e-01]
 [3.34544795e-03 9.96654552e-01]
 [5.43891575e-01 4.56108425e-01]
 [6.94505195e-01 3.05494805e-01]
 [6.34958153e-01 3.65041847e-01]
 [3.07572236e-01 6.92427764e-01]
 [2.85172931e-01 7.14827069e-01]
 [4.50376070e-01 5.49623930e-01]
 [5.65204235e-01 4.34795765e-01]
 [4.23805456e-01 5.76194544e-01]
 [6.94505195e-01 3.05494805e-01]
 [2.85172931e-01 7.14827069e-01]
 [4.77090950e-01 5.22909050e-01]
 [1.59170993e-04 9.99840829e-01]
 [3.67310049e-01 6.32689951e-01]
 [6.09710663e-01 3.90289337e-01]
 [4.98505603e-01 5.01494397e-01]
 [6.94505195e-01 3.05494805e-01]
 [6.34958153e-01 3.65041847e-01]
 [5.65204235e-01 4.34795765e-01]
 [6.94505195e-01 3.05494805e-01]
 [6.34958153e-01 3.65041847e-01]
 [3.00048611e-03 9.96999514e-01]
 [4.986477

In [35]:
# Trial and Error Approch --> decide you to take the best threshold value with the lowest Type 2 Error
for a in np.arange(0.4,0.61,0.01):
    predict_mine = np.where(y_pred_prob[:,1] > a, 1, 0) # acts as an if else statement
    cfm=confusion_matrix(Y_test, predict_mine)
    total_err=cfm[0,1]+cfm[1,0]  # Addition of Type 1 and Type 2 error
    print("Errors at threshold ", a, ":",total_err, " , type 2 error :", 
          cfm[1,0]," , type 1 error:", cfm[0,1])

Errors at threshold  0.4 : 127  , type 2 error : 36  , type 1 error: 91
Errors at threshold  0.41000000000000003 : 134  , type 2 error : 43  , type 1 error: 91
Errors at threshold  0.42000000000000004 : 133  , type 2 error : 43  , type 1 error: 90
Errors at threshold  0.43000000000000005 : 133  , type 2 error : 43  , type 1 error: 90
Errors at threshold  0.44000000000000006 : 122  , type 2 error : 44  , type 1 error: 78
Errors at threshold  0.45000000000000007 : 119  , type 2 error : 57  , type 1 error: 62
Errors at threshold  0.4600000000000001 : 123  , type 2 error : 71  , type 1 error: 52
Errors at threshold  0.4700000000000001 : 123  , type 2 error : 71  , type 1 error: 52
Errors at threshold  0.4800000000000001 : 134  , type 2 error : 84  , type 1 error: 50
Errors at threshold  0.4900000000000001 : 142  , type 2 error : 93  , type 1 error: 49
Errors at threshold  0.5000000000000001 : 142  , type 2 error : 93  , type 1 error: 49
Errors at threshold  0.5100000000000001 : 139  , type

In [36]:
y_pred_class=[]
for value in y_pred_prob[:,1]:
    if value > 0.456:
        y_pred_class.append(1)
    else:
        y_pred_class.append(0)
#print(y_pred_class)

In [37]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
cfm=confusion_matrix(Y_test,y_pred_class)
print(cfm)
acc=accuracy_score(Y_test, y_pred_class)
print("Accuracy of the model: ",acc)
print(classification_report(Y_test, y_pred_class))

[[148  62]
 [ 57 161]]
Accuracy of the model:  0.7219626168224299
              precision    recall  f1-score   support

           0       0.72      0.70      0.71       210
           1       0.72      0.74      0.73       218

    accuracy                           0.72       428
   macro avg       0.72      0.72      0.72       428
weighted avg       0.72      0.72      0.72       428



After Tuning in Logistic regression, we are getting the accuracy of 0.721, and as we can see that the recall for Class O & Class 1 is 0.70 and 0.74

# Model Using Decision Tree

In [38]:
from sklearn.model_selection import train_test_split
 
#Split the data into test and train
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,  # Default value -> test_size= 0.25
                                                    random_state=10)

In [39]:
# predicting using Decision Tree Classifier.
from sklearn.tree import DecisionTreeClassifier

model_DT = DecisionTreeClassifier(random_state=10,
                                   criterion="gini")

# fit the model on data and predict the values
model_DT.fit(X_train,Y_train)      # fit is the function that is used for training the data
Y_pred = model_DT.predict(X_test) # Validation Data
#print(Y_pred)
#print(list(zip(Y_test,Y_pred)))

In [40]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 
cfm=confusion_matrix(Y_test,Y_pred)
print(cfm)
 
print("Classification report: ")
 
print(classification_report(Y_test,Y_pred))
 
acc=accuracy_score(Y_test, Y_pred)
print("Accuracy of the model: ",acc)

[[173  37]
 [ 46 172]]
Classification report: 
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       210
           1       0.82      0.79      0.81       218

    accuracy                           0.81       428
   macro avg       0.81      0.81      0.81       428
weighted avg       0.81      0.81      0.81       428

Accuracy of the model:  0.8060747663551402


## Conclusion: 
As compared to **Logistic Regression**, Decision Tree is performed much better as we can see that the **Recall** & **f1-score** is **Higher** than **Logistic Regression**