In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import Libraries

In [2]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.ensemble import ExtraTreesClassifier
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'seaborn'

Load data

In [None]:
pip install openpyxl

In [None]:
# Read the Excel workbook into a DataFrame, parsing it with the openpyxl engine.
data = pd.read_excel("../input/covid19/dataset.xlsx", engine="openpyxl")

In [None]:
# Quick look at the raw frame: schema, summary statistics and the first rows.
# info() prints by itself; describe() is not the last expression of the cell,
# so it has to be displayed explicitly or its output is silently dropped.
data.info()
display(data.describe())
data.head()

Feature Engineering!

In [None]:
# Normalise the headers: lower-case, trim surrounding whitespace and replace
# spaces with underscores.
normalised_columns = []
for raw_name in data.columns:
    normalised_columns.append(raw_name.lower().strip().replace(' ', '_'))
data.columns = normalised_columns

In [None]:
def miss_data(x):
    """Summarise missing values of a DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``x`` and three rows: ``Total`` (number of
        missing values), ``Percent`` (share of missing values, 0-100) and
        ``Types`` (dtype name).
    """
    null_mask = x.isnull()
    total = null_mask.sum()
    percent = total / null_mask.count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # Take the dtypes from the frame that was passed in, not from a notebook
    # global, so the function works for any DataFrame.
    tt['Types'] = [str(dtype) for dtype in x.dtypes]
    return np.transpose(tt)

In [None]:
# Display the missing-value summary (counts, percentages, dtypes) for the loaded frame.
miss_data(data)

In [None]:
# Mean-impute the float columns. fillna() returns a new Series, so the result
# has to be assigned back; otherwise the imputation is silently discarded.
for col in data.columns:
    if data[col].dtype=='float16' or data[col].dtype=='float32' or data[col].dtype=='float64':
        data[col] = data[col].fillna(data[col].mean())

# Whatever is still missing (non-float columns, or float columns without a
# single observed value) is replaced by a sentinel.
data = data.fillna(-999)

# Integer-encode the text columns. After the sentinel fill an object column can
# hold both strings and the integer -999; casting to str first gives the
# encoder uniformly typed values to sort.
for col in data.columns:
    if data[col].dtype=='object':
        lbl = LabelEncoder()
        values = data[col].astype(str)
        lbl.fit(values)
        data[col] = lbl.transform(values)

In [None]:
# Absolute-correlation cut-off; compared against the correlation values when
# the list of columns to drop is built.
threshold = 0.92

# Pairwise correlations between the columns of `data`, as absolute values.
corr_matrix = data.corr().abs()
corr_matrix.head()

In [None]:
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# column pair appears once; everything else becomes NaN.
# The builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)
upper.head()

In [None]:
# A column is dropped when its absolute correlation with any earlier column
# exceeds the threshold.
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

print('There are %d columns to remove.' % (len(to_drop)))
dataset = data.drop(columns = to_drop)
# Report the frame before and after the drop (the second line previously
# printed the shape of `data` again instead of the reduced `dataset`).
print('Data shape: ', data.shape)
print('Size of the data', dataset.shape)

In [None]:
# Fraction of missing values per column, largest first.
# NOTE(review): if the cells ran in order, `data` has already been filled with
# -999 in the imputation cell, so every fraction here would be 0 - confirm
# whether this was meant to be computed on the frame before imputation.
data_missing = (data.isnull().sum() / len(data)).sort_values(ascending = False)
data_missing.head()

In [None]:
# Names of the columns whose missing fraction is above 0.85, de-duplicated.
high_missing_mask = data_missing > 0.85
data_missing_ = data_missing.index[high_missing_mask]
all_missing = list(set(data_missing_))

In [None]:
# Remove the mostly-missing columns. `all_missing` is derived from `data`, so
# it may name columns that were already removed from `dataset` by the
# correlation filter (or by a previous run of this cell); errors='ignore'
# skips those instead of raising KeyError.
dataset = dataset.drop(columns = all_missing, errors = 'ignore')

In [None]:
# Print the schema of the reduced frame.
dataset.info()

In [None]:
# Columns left out of the feature list: the patient id, the exam result and
# the three admission flags.
excluded = [
    'patient_id',
    'sars-cov-2_exam_result',
    'patient_addmited_to_regular_ward_(1=yes,_0=no)',
    'patient_addmited_to_semi-intensive_unit_(1=yes,_0=no)',
    'patient_addmited_to_intensive_care_unit_(1=yes,_0=no)',
]
cols = [name for name in dataset.columns if name not in excluded]

In [None]:
# Feature frame: `dataset` restricted to the columns listed in `cols`.
new_df = dataset[cols]

In [None]:
# Render the feature frame as the cell output.
new_df

Data Split

In [None]:
# Target is the exam result column; predictors are all columns of new_df.
y = dataset['sars-cov-2_exam_result']
X = new_df

# Hold out 33% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.33,
)

Feature Importance using Extra Trees Classifier

In [None]:
# Rank the features with an Extra Trees ensemble.
# - A fixed random_state makes the ranking (and the feature subset derived
#   from it) reproducible between runs.
# - The ensemble is fitted on the training split only, so the held-out rows
#   do not influence which features are selected.
model = ExtraTreesClassifier(random_state=101)
model.fit(X_train, y_train)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
# Keep the ten features with the highest importance. head(10) would return the
# first ten entries in column order, regardless of their importance, and would
# not match the features shown in the importance plot.
feat_head = feat_importances.nlargest(10)
feat_head.index

In [None]:
# Rebuild the modelling data using only the features selected in feat_head.
selected_features = feat_head.index
y = dataset['sars-cov-2_exam_result']
X = new_df[selected_features]

# Same hold-out proportion and seed as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.33,
)

ML Models

In [None]:
# Test accuracies (in percent), one entry per assessed model, in call order.
accuracy_lst = []


def model_assess(model, name='Default'):
    """Fit ``model``, score it on the test split and print a short report.

    The notebook-level ``X_train``/``y_train`` are used for fitting and
    ``X_test``/``y_test`` for scoring. The accuracy, as a percentage, is
    appended to ``accuracy_lst``; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_lst.append(100*accuracy)

    matrix = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)
    print('---', name, '---', '\n',
          matrix, '\n',
          'Accuracy:', accuracy, '\n',
          'Classification Report:', report)

In [None]:
# Build every candidate classifier first ...
lg = LogisticRegression()
tree = DecisionTreeClassifier()
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
svm = SVC()
knn = KNeighborsClassifier(n_neighbors=19)
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
nn = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1)

# ... then fit and report on them one after another, in a fixed order.
candidates = [
    (lg, 'Logistic Regression'),
    (tree, 'Decission Trees'),
    (rforest, 'Random Forest'),
    (svm, 'SVM'),
    (knn, 'KNN'),
    (xgb, 'XGBoost'),
    (nn, 'Neural Nets'),
]
for estimator, label in candidates:
    model_assess(estimator, label)

In [None]:
# Axis labels for the comparison charts, in the order the models are assessed.
model_list = [
    'Logistic Regression',
    'DT',
    'Random Forest',
    'SVM',
    'KNearestNeighbours',
    'XGBOOST',
    'NN',
]

In [None]:
# Bar chart of the test accuracy of each classifier.
plt.rcParams['figure.figsize']=20,8
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y = accuracy_lst, palette = "coolwarm", saturation =2.0)
plt.xlabel('Classifier Models', fontsize = 20 )
plt.ylabel('% of Accuracy', fontsize = 20)
plt.title('Accuracy of different Classifier Models', fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 12)
# Write the value above each bar. The loop runs at notebook level, so it uses
# dedicated names instead of `x`/`y`, which would overwrite the notebook's
# target variable `y`.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(f'{round(bar_height,2)}%', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()

In [None]:
def _mean_cv_accuracy(estimator):
    """Mean cross-validated accuracy of ``estimator`` on the training split."""
    fold_scores = cross_val_score(estimator, X_train, y_train, scoring='accuracy')
    return fold_scores.mean()


# One mean score per model, evaluated in the same order as the bar labels.
ca_lg = _mean_cv_accuracy(lg)
ca_tree = _mean_cv_accuracy(tree)
ca_rforest = _mean_cv_accuracy(rforest)
ca_svm = _mean_cv_accuracy(svm)
ca_knn = _mean_cv_accuracy(knn)
ca_xgb = _mean_cv_accuracy(xgb)

# Scores as percentages.
cross_acc = [100*score for score in (ca_lg, ca_tree, ca_rforest, ca_svm, ca_knn, ca_xgb)]

In [None]:
# Cross-validate the neural network and add its mean accuracy (in percent) to
# the list of cross-validation scores.
nn_fold_scores = cross_val_score(nn, X_train, y_train, scoring='accuracy')
ca_nn = nn_fold_scores.mean()
cross_acc.append(100*ca_nn)

In [None]:
# Bar chart of the cross-validation accuracy of each classifier.
plt.rcParams['figure.figsize']=20,8
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=cross_acc, palette = "rocket", saturation =2.0)
plt.xlabel('Classifier Models', fontsize = 20 )
plt.ylabel('Cross validation Accuracy', fontsize = 20)
plt.title('Accuracy of different Classifier Models', fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 12)
# Write the value above each bar. The loop runs at notebook level, so it uses
# dedicated names instead of `x`/`y`, which would overwrite the notebook's
# target variable `y`.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(f'{round(bar_height,2)}%', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()

Admission to ward (Covid-19 Patients) 

In [None]:
# Rows whose encoded exam result equals 1.
# NOTE(review): presumably 1 is the "positive" label produced by the label
# encoding step - confirm against the encoder's classes.
covid_positive = dataset[dataset['sars-cov-2_exam_result'] == 1]

In [None]:
admission = []

def multiclass_target(row):
    """Add a ``target`` entry encoding the highest level of care for one patient.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must contain the three admission flags (regular ward, semi-intensive
        unit, intensive care unit), each equal to 1 when the patient was
        admitted there.

    Returns
    -------
    The same ``row`` object with ``row['target']`` set to
    0 (not admitted), 1 (regular ward), 2 (semi-intensive unit) or
    3 (intensive care unit).

    The most intensive unit wins when several flags are set. Summing the
    codes instead would be ambiguous (regular + semi-intensive = 3, the same
    value as intensive care) and could yield values outside 0-3.
    """
    # Read all three flags up front so a missing column always raises KeyError.
    in_regular = row['patient_addmited_to_regular_ward_(1=yes,_0=no)'] == 1
    in_semi = row['patient_addmited_to_semi-intensive_unit_(1=yes,_0=no)'] == 1
    in_icu = row['patient_addmited_to_intensive_care_unit_(1=yes,_0=no)'] == 1

    if in_icu:
        check = 3
    elif in_semi:
        check = 2
    elif in_regular:
        check = 1
    else:
        check = 0
    row['target'] = check
    return row

# Apply multiclass_target to every row (axis=1) of the positive cases and
# show the resulting frame.
data_adm = covid_positive.apply(multiclass_target, axis=1)
data_adm

In [None]:
# Predictors: the features kept in feat_head; target: the admission level.
X = data_adm[feat_head.index]
y = data_adm['target']

# Seed the split like the earlier splits in this notebook, so the ward models
# are evaluated on the same rows every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)

ML Models for Ward Prediction!

In [None]:
# Test accuracies (in percent) of the ward-prediction models, in call order.
acc_lst = []


def model_assess(model, name='Default'):
    """Fit ``model``, evaluate it on the test split and print the results.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for evaluation, and appends the accuracy as a
    percentage to ``acc_lst``. Returns nothing.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    score = accuracy_score(y_test, predicted)
    acc_lst.append(100*score)

    matrix = confusion_matrix(y_test, predicted)
    report = classification_report(y_test, predicted)
    print('---', name, '---', '\n',
          matrix, '\n',
          'Accuracy:', score, '\n',
          'Classification Report:', report)

In [None]:
# Fresh, unfitted classifiers for the ward-prediction task.
lg = LogisticRegression()
tree = DecisionTreeClassifier()
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
svm = SVC()
knn = KNeighborsClassifier(n_neighbors=19)
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)
nn = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1)

# Fit and report on each one, in a fixed order.
ward_candidates = [
    (lg, 'Logistic Regression'),
    (tree, 'Decission Trees'),
    (rforest, 'Random Forest'),
    (svm, 'SVM'),
    (knn, 'KNN'),
    (xgb, 'XGBoost'),
    (nn, 'Neural Nets'),
]
for estimator, label in ward_candidates:
    model_assess(estimator, label)

In [None]:
# Bar chart of the test accuracy of each ward-prediction classifier.
plt.rcParams['figure.figsize']=20,8
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y = acc_lst, palette = "coolwarm", saturation =2.0)
plt.xlabel('Classifier Models', fontsize = 20 )
plt.ylabel('% of Accuracy', fontsize = 20)
plt.title('Accuracy of different Classifier Models', fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 12)
# Write the value above each bar. The loop runs at notebook level, so it uses
# dedicated names instead of `x`/`y`, which would overwrite the notebook's
# target variable `y`.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    bar_x, bar_y = bar.get_xy()
    ax.annotate(f'{round(bar_height,2)}%', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize = 'x-large')
plt.show()