In [2]:
#import neccessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

#loading the dataset
df = pd.read_csv("covid_early_stage_symptoms.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6512 entries, 0 to 6511
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   gender               6512 non-null   object
 1   age_year             6512 non-null   int64 
 2   fever                6512 non-null   int64 
 3   cough                6512 non-null   int64 
 4   runny_nose           6512 non-null   int64 
 5   muscle_soreness      6512 non-null   int64 
 6   pneumonia            6512 non-null   int64 
 7   diarrhea             6512 non-null   int64 
 8   lung_infection       6512 non-null   int64 
 9   travel_history       6512 non-null   int64 
 10  isolation_treatment  6512 non-null   int64 
 11  test_results         6512 non-null   int64 
dtypes: int64(11), object(1)
memory usage: 610.6+ KB


In [4]:
sex = LabelEncoder()
df['sex']=sex.fit_transform(df['gender'])
df

Unnamed: 0,gender,age_year,fever,cough,runny_nose,muscle_soreness,pneumonia,diarrhea,lung_infection,travel_history,isolation_treatment,test_results,sex
0,male,89,1,1,0,0,0,0,0,1,0,0,1
1,male,68,1,0,0,0,0,0,0,0,0,0,1
2,male,68,0,0,0,0,0,0,0,1,0,0,1
3,male,68,1,1,0,0,0,0,0,1,1,1,1
4,male,50,1,1,1,0,1,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6507,female,44,1,1,0,0,0,0,0,1,0,1,0
6508,female,44,1,1,0,0,0,0,0,0,0,0,0
6509,female,58,0,0,0,0,0,0,0,0,0,0,0
6510,female,58,1,1,0,0,0,0,0,0,0,0,0


In [5]:
#dropping unnecessary columns
final = df.drop(['lung_infection','isolation_treatment', 'pneumonia', 'gender'],axis = 'columns')
final

Unnamed: 0,age_year,fever,cough,runny_nose,muscle_soreness,diarrhea,travel_history,test_results,sex
0,89,1,1,0,0,0,1,0,1
1,68,1,0,0,0,0,0,0,1
2,68,0,0,0,0,0,1,0,1
3,68,1,1,0,0,0,1,1,1
4,50,1,1,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...
6507,44,1,1,0,0,0,1,1,0
6508,44,1,1,0,0,0,0,0,0
6509,58,0,0,0,0,0,0,0,0
6510,58,1,1,0,0,0,0,0,0


In [6]:
X = final.drop(['test_results'], axis = 'columns')
X

Unnamed: 0,age_year,fever,cough,runny_nose,muscle_soreness,diarrhea,travel_history,sex
0,89,1,1,0,0,0,1,1
1,68,1,0,0,0,0,0,1
2,68,0,0,0,0,0,1,1
3,68,1,1,0,0,0,1,1
4,50,1,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...
6507,44,1,1,0,0,0,1,0
6508,44,1,1,0,0,0,0,0
6509,58,0,0,0,0,0,0,0
6510,58,1,1,0,0,0,0,0


In [7]:
y = final.test_results
y

0       0
1       0
2       0
3       1
4       1
       ..
6507    1
6508    0
6509    0
6510    0
6511    0
Name: test_results, Length: 6512, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [9]:
model1 = LogisticRegression(random_state = 0, solver = 'liblinear')
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
model1.score(X_test, y_test)

0.807367613200307

In [10]:
model = SelectFromModel(LogisticRegression(random_state = 0, solver = 'liblinear'))
model.fit(X_train, y_train)
model.get_support()

array([False, False,  True,  True,  True,  True, False, False])

In [11]:
features = X_train.columns[model.get_support()]
features

Index(['cough', 'runny_nose', 'muscle_soreness', 'diarrhea'], dtype='object')

In [12]:
X_train_rfc = model.transform(X_train)
X_test_rfc =model.transform(X_test)

In [13]:
def run_logisticregression(X_train, X_test, y_train, y_test):
    model = LogisticRegression(random_state=0, solver = 'liblinear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy: ", accuracy_score(y_pred, y_test))
    print('\n')
    print("Confusion Matrix")
    print('\n')
    print(confusion_matrix(y_pred, y_test))

In [14]:
%%time
run_logisticregression(X_train, X_test, y_train, y_test)

Accuracy:  0.807367613200307


Confusion Matrix


[[878 146]
 [105 174]]
Wall time: 43.3 ms


In [15]:
%%time
run_logisticregression(X_train_rfc, X_test_rfc, y_train, y_test)

Accuracy:  0.8096699923254029


Confusion Matrix


[[970 235]
 [ 13  85]]
Wall time: 23.3 ms


Revursive feature elimination was used to select features with highest predicting factor. Using these feature alone reduced accuracy by a negligible factor but the prediction time was much much fastor

In [16]:
# Using just the selected featurees to make prediction time faster

model1.fit(X_train_rfc, y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [17]:
model1.score(X_test_rfc, y_test)

0.8096699923254029

In [18]:
y_pred = model1.predict(X_test_rfc)

In [19]:
accuracy_score(y_pred, y_test)

0.8096699923254029

In [20]:
X_train_rfc[0]

array([1, 1, 0, 0], dtype=int64)

In [21]:
pred = np.array([[1, 0, 0]])
pred

array([[1, 0, 0]])

In [22]:
import pickle
with open('model_pickle', 'wb') as file:
    pickle.dump(model1, file)

In [23]:
filename = 'covid_model.sav'
pickle.dump(model1, open(filename, 'wb'))

In [24]:
import joblib
filename = 'finalized model.sav'
joblib.dump(model1, filename)

['finalized model.sav']