In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 



### Loading the data

In [6]:
df = pd.read_csv("../data/data.csv")
df.head(15)

Unnamed: 0.1,Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6,diff_sym_hos,result
0,0,104,8,1,66.0,1,0,14,31,19,12,3,1,8,1
1,1,101,8,0,56.0,0,1,14,31,19,12,3,1,0,0
2,2,137,8,1,46.0,0,1,14,31,19,12,3,1,13,0
3,3,116,8,0,60.0,1,0,14,31,19,12,3,1,0,0
4,4,116,8,1,58.0,0,0,14,31,19,12,3,1,0,0
5,5,23,8,0,44.0,0,1,14,31,19,12,3,1,0,0
6,6,105,8,1,34.0,0,1,14,31,19,12,3,1,0,0
7,7,13,8,1,37.0,1,0,14,31,19,12,3,1,6,0
8,8,13,8,1,39.0,1,0,14,31,19,12,3,1,5,0
9,9,13,8,1,56.0,1,0,14,31,19,12,3,1,4,0


### Data exploration 

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 863 entries, 0 to 862
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    863 non-null    int64  
 1   location      863 non-null    int64  
 2   country       863 non-null    int64  
 3   gender        863 non-null    int64  
 4   age           863 non-null    float64
 5   vis_wuhan     863 non-null    int64  
 6   from_wuhan    863 non-null    int64  
 7   symptom1      863 non-null    int64  
 8   symptom2      863 non-null    int64  
 9   symptom3      863 non-null    int64  
 10  symptom4      863 non-null    int64  
 11  symptom5      863 non-null    int64  
 12  symptom6      863 non-null    int64  
 13  diff_sym_hos  863 non-null    int64  
 14  result        863 non-null    int64  
dtypes: float64(1), int64(14)
memory usage: 101.3 KB


In [8]:
df.axes[1:]

[Index(['Unnamed: 0', 'location', 'country', 'gender', 'age', 'vis_wuhan',
        'from_wuhan', 'symptom1', 'symptom2', 'symptom3', 'symptom4',
        'symptom5', 'symptom6', 'diff_sym_hos', 'result'],
       dtype='object')]

In [None]:
# print(f" there is {df.loc[:,1].unique().size} in sa column")
# print(f" there is {df['symptom1'].unique().size} in country column")
df.iloc[:,:3]

Unnamed: 0.1,Unnamed: 0,location,country
0,0,104,8
1,1,101,8
2,2,137,8
3,3,116,8
4,4,116,8
...,...,...,...
858,858,48,3
859,859,0,0
860,860,3,1
861,861,24,9


In [10]:
df.drop('Unnamed: 0',axis=1,inplace=True) 

In [11]:
df.describe()

Unnamed: 0,location,country,gender,age,vis_wuhan,from_wuhan,symptom1,symptom2,symptom3,symptom4,symptom5,symptom6,diff_sym_hos,result
count,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0
mean,76.645423,16.995365,0.849363,49.4,0.181924,0.107764,12.13905,28.002317,18.298957,11.840093,2.993048,0.998841,0.995365,0.125145
std,39.200264,7.809951,0.726062,15.079203,0.386005,0.310261,3.99787,7.473231,2.864064,1.183771,0.127251,0.03404,2.358767,0.331075
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,0.0
25%,45.0,11.0,0.0,40.0,0.0,0.0,14.0,31.0,19.0,12.0,3.0,1.0,0.0,0.0
50%,87.0,18.0,1.0,49.4,0.0,0.0,14.0,31.0,19.0,12.0,3.0,1.0,0.0,0.0
75%,110.0,24.0,1.0,57.0,0.0,0.0,14.0,31.0,19.0,12.0,3.0,1.0,1.0,0.0
max,138.0,33.0,2.0,96.0,1.0,1.0,24.0,31.0,19.0,12.0,3.0,1.0,15.0,1.0


In [12]:
y = df['result']
x=df.drop('result',axis=1)

In [13]:
x_train, x_check, y_train, y_check = train_test_split(x,y, test_size=0.3, random_state=42)
x_test, x_validate, y_test, y_validate = train_test_split(x_check,y_check, test_size=0.3, random_state=38)

### Trying multiple models to find the best tuning for the hyperparameter

In [14]:
no_of_models =6

In [15]:
models=[]
for i in range(1,2*no_of_models,2): 
    models.append( KNeighborsClassifier(n_neighbors= i) )
    print(i)

1
3
5
7
9
11


In [16]:
for i in range (0,no_of_models) : 
    models[i].fit(x_train,y_train)
    training_accuracy = models[i].score(x_train, y_train)
    print(f"Training Accuracy with {2*i+1} neighbors:", training_accuracy)

Training Accuracy with 1 neighbors: 1.0
Training Accuracy with 3 neighbors: 0.9635761589403974
Training Accuracy with 5 neighbors: 0.9652317880794702
Training Accuracy with 7 neighbors: 0.9552980132450332
Training Accuracy with 9 neighbors: 0.9486754966887417
Training Accuracy with 11 neighbors: 0.9470198675496688


In [17]:
y_test_predict=models[2].predict(x_test)

In [18]:
for i in range (0,no_of_models) : 
    testing_accuracy = models[i].score(x_test, y_test)
    print(f"Testing Accuracy with {2*i+1} neighbors:", testing_accuracy)

Testing Accuracy with 1 neighbors: 0.9502762430939227
Testing Accuracy with 3 neighbors: 0.9226519337016574
Testing Accuracy with 5 neighbors: 0.9281767955801105
Testing Accuracy with 7 neighbors: 0.9116022099447514
Testing Accuracy with 9 neighbors: 0.9337016574585635
Testing Accuracy with 11 neighbors: 0.9337016574585635


In [19]:
for i in range (0,no_of_models) : 
    models[i].fit(x_test,y_test)
    validation_accuracy = models[i].score(x_validate, y_validate)
    print(f"Validation Accuracy with {2*i+1} neighbors:", validation_accuracy)

Validation Accuracy with 1 neighbors: 0.9487179487179487
Validation Accuracy with 3 neighbors: 0.9102564102564102
Validation Accuracy with 5 neighbors: 0.9615384615384616
Validation Accuracy with 7 neighbors: 0.9615384615384616
Validation Accuracy with 9 neighbors: 0.9615384615384616
Validation Accuracy with 11 neighbors: 0.9615384615384616
