   # Prediction of Covid-19 infection

In [2]:
#Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [3]:
# Load the symptom dataset from the CSV file into a DataFrame and preview it
df = pd.read_csv(filepath_or_buffer="covid_early_stage_symptoms.csv")
df

Unnamed: 0,gender,age_year,fever,cough,runny_nose,muscle_soreness,pneumonia,diarrhea,lung_infection,travel_history,isolation_treatment,test_results
0,male,89,1,1,0,0,0,0,0,1,0,0
1,male,68,1,0,0,0,0,0,0,0,0,0
2,male,68,0,0,0,0,0,0,0,1,0,0
3,male,68,1,1,0,0,0,0,0,1,1,1
4,male,50,1,1,1,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6507,female,44,1,1,0,0,0,0,0,1,0,1
6508,female,44,1,1,0,0,0,0,0,0,0,0
6509,female,58,0,0,0,0,0,0,0,0,0,0
6510,female,58,1,1,0,0,0,0,0,0,0,0


In [4]:
# Encode the 'gender' strings as integer codes in a new 'sex' column.
# In the preview rendered for this cell, male rows show 1 and female rows show 0.
sex = LabelEncoder()
sex.fit(df['gender'])
df['sex'] = sex.transform(df['gender'])
df

Unnamed: 0,gender,age_year,fever,cough,runny_nose,muscle_soreness,pneumonia,diarrhea,lung_infection,travel_history,isolation_treatment,test_results,sex
0,male,89,1,1,0,0,0,0,0,1,0,0,1
1,male,68,1,0,0,0,0,0,0,0,0,0,1
2,male,68,0,0,0,0,0,0,0,1,0,0,1
3,male,68,1,1,0,0,0,0,0,1,1,1,1
4,male,50,1,1,1,0,1,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6507,female,44,1,1,0,0,0,0,0,1,0,1,0
6508,female,44,1,1,0,0,0,0,0,0,0,0,0
6509,female,58,0,0,0,0,0,0,0,0,0,0,0
6510,female,58,1,1,0,0,0,0,0,0,0,0,0


In [5]:
# Build the modelling frame: remove three clinical columns that are not used
# as features, plus the raw 'gender' text (its encoded copy 'sex' is kept).
final = df.drop(columns=['lung_infection', 'isolation_treatment', 'pneumonia', 'gender'])
final

Unnamed: 0,age_year,fever,cough,runny_nose,muscle_soreness,diarrhea,travel_history,test_results,sex
0,89,1,1,0,0,0,1,0,1
1,68,1,0,0,0,0,0,0,1
2,68,0,0,0,0,0,1,0,1
3,68,1,1,0,0,0,1,1,1
4,50,1,1,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...
6507,44,1,1,0,0,0,1,1,0
6508,44,1,1,0,0,0,0,0,0
6509,58,0,0,0,0,0,0,0,0
6510,58,1,1,0,0,0,0,0,0


## Data Analysis

In [6]:
# List the names of the columns that contain at least one missing value
# (an empty Index means no column has any).
final.loc[:, final.isnull().any(axis=0)].columns

Index([], dtype='object')

In [7]:
# Count the missing values in each column
final.isnull().sum(axis=0)

age_year           0
fever              0
cough              0
runny_nose         0
muscle_soreness    0
diarrhea           0
travel_history     0
test_results       0
sex                0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
final.describe()

Unnamed: 0,age_year,fever,cough,runny_nose,muscle_soreness,diarrhea,travel_history,test_results,sex
count,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0,6512.0
mean,44.019502,0.41078,0.303286,0.084306,0.003993,0.005682,0.650952,0.2414,0.517045
std,16.112865,0.492013,0.459713,0.277867,0.063066,0.075169,0.476706,0.427965,0.499748
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,43.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
75%,55.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
max,96.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Dimensions of the modelling frame as (rows, columns)
final.shape

(6512, 9)

In [10]:
# Class balance: how many people tested negative (0) and how many tested positive (1)
final['test_results'].value_counts()

0    4940
1    1572
Name: test_results, dtype: int64

## Creating Models

In [11]:
# Feature matrix: every column except the target 'test_results'
X = final.drop(columns=['test_results'])
X

Unnamed: 0,age_year,fever,cough,runny_nose,muscle_soreness,diarrhea,travel_history,sex
0,89,1,1,0,0,0,1,1
1,68,1,0,0,0,0,0,1
2,68,0,0,0,0,0,1,1
3,68,1,1,0,0,0,1,1
4,50,1,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...
6507,44,1,1,0,0,0,1,0
6508,44,1,1,0,0,0,0,0
6509,58,0,0,0,0,0,0,0
6510,58,1,1,0,0,0,0,0


In [12]:
# Target vector: the test outcome column
y = final['test_results']
y

0       0
1       0
2       0
3       1
4       1
       ..
6507    1
6508    0
6509    0
6510    0
6511    0
Name: test_results, Length: 6512, dtype: int64

In [13]:
# 10-fold cross-validation scores for a small (5-tree) random forest,
# seeded so the scores are repeatable.
# NOTE(review): an integer cv presumably means unshuffled folds here; the
# recorded fold scores vary widely, so row order may matter - TODO confirm.
rf = cross_val_score(
    estimator=RandomForestClassifier(n_estimators=5, random_state=10),
    X=X,
    y=y,
    cv=10,
)
rf

array([0.75460123, 0.8803681 , 0.79109063, 0.72043011, 0.78033794,
       0.81105991, 0.88632873, 0.906298  , 0.82334869, 0.65898618])

In [15]:
# 10-fold cross-validation scores for logistic regression (liblinear solver, seeded)
lr = cross_val_score(
    estimator=LogisticRegression(solver='liblinear', random_state=10),
    X=X,
    y=y,
    cv=10,
)
lr

array([0.80828221, 0.89263804, 0.77112135, 0.83410138, 0.77880184,
       0.80952381, 0.91705069, 0.89861751, 0.84485407, 0.62211982])

In [15]:
# 10-fold cross-validation of a default-parameter support vector classifier;
# only the mean of the fold scores is displayed. Takes long to run.
# NOTE(review): the mean recorded for this cell (0.7586) matches the share of
# negative results in the value counts (4940 / 6512), which may mean the model
# predicts only the negative class - TODO confirm.
svc = cross_val_score(estimator=SVC(), X=X, y=y, cv=10)
np.mean(svc)

0.7585997945586309

In [17]:
# 10-fold cross-validation scores for a k-nearest-neighbours classifier (k = 10)
knn = cross_val_score(
    estimator=KNeighborsClassifier(n_neighbors=10),
    X=X,
    y=y,
    cv=10,
)
knn

array([0.76840491, 0.89110429, 0.78341014, 0.74500768, 0.77880184,
       0.80645161, 0.89400922, 0.86943164, 0.8141321 , 0.69124424])

In [18]:
# 10-fold cross-validation scores for a decision tree.
# random_state is pinned (same seed as the random-forest and logistic-regression
# cells) because the tree permutes candidate features at each split, so an
# unseeded tree can give different scores from one run to the next.
dtc = cross_val_score(DecisionTreeClassifier(random_state=10), X, y, cv = 10)
dtc

array([0.74386503, 0.86349693, 0.77880184, 0.71889401, 0.77112135,
       0.80030722, 0.88786482, 0.90015361, 0.82488479, 0.69278034])

### So far, Logistic Regression gives the best mean cross-validation score (about 0.82 averaged over the 10 folds)