Import the necessary libraries

In [19]:
import numpy
import pandas

import joblib

from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing

Import Drive

In [20]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Read data from CSV file

In [21]:
data = pandas.read_csv('drive/MyDrive/covid.csv')

print('Data:\n')
print(data)

print('\nColumns:\n')
print(data.columns)

Data:

            id  sex  patient_type  ... contact_other_covid covid_res icu
0       16169f    2             1  ...                   2         1  97
1       1009bf    2             1  ...                  99         1  97
2       167386    1             2  ...                  99         1   2
3       0b5948    2             2  ...                  99         1   2
4       0d01b5    1             2  ...                  99         1   2
...        ...  ...           ...  ...                 ...       ...  ..
566597  01ff60    2             1  ...                   2         3  97
566598  047cd1    1             1  ...                   2         3  97
566599  1beb81    1             2  ...                  99         3   2
566600  16fb02    1             1  ...                   2         3  97
566601  0021c9    2             1  ...                   2         3  97

[566602 rows x 23 columns]

Columns:

Index(['id', 'sex', 'patient_type', 'entry_date', 'date_symptoms', 'date_died'

Encode the gender field

In [22]:
data_gender = data.sex

male = []
female = []
other = []

for gender in data_gender:
    if gender == 1:
        male.append(1)
        female.append(0)
        other.append(0)
    elif gender == 2:
        male.append(0)
        female.append(1)
        other.append(0)
    else:
        male.append(0)
        female.append(0)
        other.append(1)

# Remove existing field
data.drop(columns = ['sex'])

# Add the new fields
data['male'] = male
data['female'] = female
data['other'] = other

Encode the following fields

1. Pneumonia
2. Pregnancy
3. Diabetes
4. COPD
5. Asthma
6. Hypertension
7. Other diseases
8. Cardiovascular
9. Obesity
10. Renal Diseases (Kidney)
11. Tobacco consumer

In [23]:
def encode(name):
    data_list = data[name]

    res = []

    for record in data_list:
        if record == 1:
            res.append(1)
        else:
            res.append(0)

    # Drop the existing field
    data.drop(columns = [name])

    # Add the new field with same name
    data[name] = res
    
# Encode all the fields in arr
arr = ['pneumonia', 'pregnancy', 'diabetes', 'copd', 'asthma', 'hypertension', 'other_disease', 'cardiovascular', 'obesity', 'renal_chronic', 'tobacco']

for name in arr:
    encode(name)

Put the result into a single field

In [24]:
result = []

data_type = data.patient_type
data_ventilator = data.intubed
data_icu = data.icu
data_died = data.date_died

for i in range(0, len(data)):
    if data_type[i] == 2 or data_ventilator[i] == 1 or data_icu[i] == 1 or data_died[i] != '9999-99-99':
        result.append(1)
    else:
        result.append(0)

Remove columns which are not useful

In [25]:
data = data.drop(columns = ['id', 'sex', 'patient_type', 'intubed', 'icu', 'entry_date', 'date_symptoms', 'date_died', 'inmsupr', 'contact_other_covid', 'covid_res'])

print('Columns after removal:\n')
print(data.columns)

Columns after removal:

Index(['pneumonia', 'age', 'pregnancy', 'diabetes', 'copd', 'asthma',
       'hypertension', 'other_disease', 'cardiovascular', 'obesity',
       'renal_chronic', 'tobacco', 'male', 'female', 'other'],
      dtype='object')


Split the data into test and train

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(data, result, test_size = 0.2)

Random Forest Classifier

In [27]:
random_forest_classifier = RandomForestClassifier(n_estimators = 15)
random_forest_classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=15,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Report from Random Forest Classifier

In [28]:
Y_pred = random_forest_classifier.predict(X_test)

print('Confusion matrix:\n', confusion_matrix(Y_test, Y_pred))
print('\nClassification_report:\n', classification_report(Y_test, Y_pred))
print("\nAccuracy:", accuracy_score(Y_test, Y_pred))

Confusion matrix:
 [[84711  3502]
 [ 9060 16048]]

Classification_report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93     88213
           1       0.82      0.64      0.72     25108

    accuracy                           0.89    113321
   macro avg       0.86      0.80      0.82    113321
weighted avg       0.89      0.89      0.88    113321


Accuracy: 0.8891467600885978


Decision Tree Classifier

In [29]:
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

Report from Decision Tree Classifier

In [30]:
Y_pred = decision_tree_classifier.predict(X_test)

print('Confusion matrix:\n', confusion_matrix(Y_test, Y_pred))
print('\nClassification_report:\n', classification_report(Y_test, Y_pred))
print("\nAccuracy:", accuracy_score(Y_test, Y_pred))

Confusion matrix:
 [[84904  3309]
 [ 9292 15816]]

Classification_report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93     88213
           1       0.83      0.63      0.72     25108

    accuracy                           0.89    113321
   macro avg       0.86      0.80      0.82    113321
weighted avg       0.88      0.89      0.88    113321


Accuracy: 0.8888026049893665


Naive Bayes Classifier

In [31]:
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

Report from Naive Bayes Classifier

In [32]:
Y_pred = naive_bayes_classifier.predict(X_test)

print('Confusion matrix:\n', confusion_matrix(Y_test, Y_pred))
print('\nClassification_report:\n', classification_report(Y_test, Y_pred))
print("\nAccuracy:", accuracy_score(Y_test, Y_pred))

Confusion matrix:
 [[78999  9214]
 [ 6829 18279]]

Classification_report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91     88213
           1       0.66      0.73      0.70     25108

    accuracy                           0.86    113321
   macro avg       0.79      0.81      0.80    113321
weighted avg       0.86      0.86      0.86    113321


Accuracy: 0.8584287113597656


Final Steps

In [33]:
MLModel = RandomForestClassifier(n_estimators = 15)
MLModel.fit(data, result)

joblib.dump(random_forest_classifier, 'drive/MyDrive/MLModel.sav')

['drive/MyDrive/MLModel.sav']