Import the necessary libraries

In [25]:
import numpy
import pandas

import joblib

from google.colab import drive

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error

Import Drive

In [14]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Read data from CSV file

In [15]:
data = pandas.read_csv('drive/MyDrive/covid.csv')

print('Data:\n')
print(data)

print('\nColumns:\n')
print(data.columns)

Data:

            id  sex  patient_type  ... contact_other_covid covid_res icu
0       16169f    2             1  ...                   2         1  97
1       1009bf    2             1  ...                  99         1  97
2       167386    1             2  ...                  99         1   2
3       0b5948    2             2  ...                  99         1   2
4       0d01b5    1             2  ...                  99         1   2
...        ...  ...           ...  ...                 ...       ...  ..
566597  01ff60    2             1  ...                   2         3  97
566598  047cd1    1             1  ...                   2         3  97
566599  1beb81    1             2  ...                  99         3   2
566600  16fb02    1             1  ...                   2         3  97
566601  0021c9    2             1  ...                   2         3  97

[566602 rows x 23 columns]

Columns:

Index(['id', 'sex', 'patient_type', 'entry_date', 'date_symptoms', 'date_died'

Encode the gender field

In [16]:
data_gender = data.sex

male = []
female = []
other = []

for gender in data_gender:
    if gender == 1:
        male.append(1)
        female.append(0)
        other.append(0)
    elif gender == 2:
        male.append(0)
        female.append(1)
        other.append(0)
    else:
        male.append(0)
        female.append(0)
        other.append(1)

# Remove existing field
data.drop(columns = ['sex'])

# Add the new fields
data['male'] = male
data['female'] = female
data['other'] = other

Encode the following fields

1. Pneumonia
2. Pregnancy
3. Diabetes
4. COPD
5. Asthma
6. Hypertension
7. Other diseases
8. Cardiovascular
9. Obesity
10. Renal Diseases (Kidney)
11. Tobacco consumer

In [17]:
def encode(name):
    data_list = data[name]

    res = []

    for record in data_list:
        if record == 1:
            res.append(1)
        else:
            res.append(0)

    # Drop the existing field
    data.drop(columns = [name])

    # Add the new field with same name
    data[name] = res
    
# Encode all the fields in arr
arr = ['pneumonia', 'pregnancy', 'diabetes', 'copd', 'asthma', 'hypertension', 'other_disease', 'cardiovascular', 'obesity', 'renal_chronic', 'tobacco']

for name in arr:
    encode(name)

Put the result into a single field

In [18]:
result = []

data_type = data.patient_type
data_ventilator = data.intubed
data_icu = data.icu
data_died = data.date_died

for i in range(0, len(data)):
    if data_type[i] == 2 or data_ventilator[i] == 1 or data_icu[i] == 1 or data_died[i] != '9999-99-99':
        result.append(1)
    else:
        result.append(0)

Remove columns which are not useful

In [19]:
data = data.drop(columns = ['id', 'sex', 'patient_type', 'intubed', 'icu', 'entry_date', 'date_symptoms', 'date_died', 'inmsupr', 'contact_other_covid', 'covid_res'])

print('Columns after removal:\n')
print(data.columns)

Columns after removal:

Index(['pneumonia', 'age', 'pregnancy', 'diabetes', 'copd', 'asthma',
       'hypertension', 'other_disease', 'cardiovascular', 'obesity',
       'renal_chronic', 'tobacco', 'male', 'female', 'other'],
      dtype='object')


Split the data into test and train

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(data, result, test_size = 0.2)

Random Forest Regressor

In [21]:
random_forest_regressor = RandomForestRegressor(n_estimators = 15)
random_forest_regressor.fit(X_train, Y_train)

RandomForestRegressor(n_estimators=15)

Report from Random Forest Regressor

In [26]:
Y_pred = random_forest_regressor.predict(X_test)

print('Mean absolute error:', mean_absolute_error(Y_test, Y_pred))

Mean absolute error: 0.1691855453988427


Decision Tree Regressor

In [27]:
decision_tree_regressor = DecisionTreeRegressor()
decision_tree_regressor.fit(X_train, Y_train)

DecisionTreeRegressor()

Report from Decision Tree Regressor

In [29]:
Y_pred = decision_tree_regressor.predict(X_test)

print('Mean absolute error:', mean_absolute_error(Y_test, Y_pred))

Mean absolute error: 0.1689861997106174


Final Steps

In [30]:
MLModel = RandomForestRegressor(n_estimators = 15)
MLModel.fit(data, result)

joblib.dump(MLModel, 'drive/MyDrive/MLModel-regressor.sav')

['drive/MyDrive/MLModel-regressor.sav']