<a href="https://colab.research.google.com/github/vix993/Kunumi_Workshop_Covid19/blob/main/EinsteinCovidDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import pertinent Libraries

In [None]:
!pip install pandas
!pip install numpy
!pip install scipy
!pip install matplotlib
!pip install sweetviz
!pip install sklearn
!pip install seaborn
!pip install imblearn
!pip install statsmodels

Collecting sweetviz
[?25l  Downloading https://files.pythonhosted.org/packages/71/c1/e15ac0b39997c0680620d8274a2a7d41730968d5c3958f80bb80127ceb5f/sweetviz-1.1.1-py3-none-any.whl (15.1MB)
[K     |████████████████████████████████| 15.1MB 294kB/s 
[?25hCollecting importlib-resources>=1.2.0
  Downloading https://files.pythonhosted.org/packages/ba/03/0f9595c0c2ef12590877f3c47e5f579759ce5caf817f8256d5dcbd8a1177/importlib_resources-3.0.0-py2.py3-none-any.whl
Collecting tqdm>=4.43.0
[?25l  Downloading https://files.pythonhosted.org/packages/bd/cf/f91813073e4135c1183cadf968256764a6fe4e35c351d596d527c0540461/tqdm-4.50.2-py2.py3-none-any.whl (70kB)
[K     |████████████████████████████████| 71kB 7.3MB/s 
Installing collected packages: importlib-resources, tqdm, sweetviz
  Found existing installation: tqdm 4.41.1
    Uninstalling tqdm-4.41.1:
      Successfully uninstalled tqdm-4.41.1
Successfully installed importlib-resources-3.0.0 sweetviz-1.1.1 tqdm-4.50.2


# Given Data Exploration and Analysis

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns

sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

#ler os dados
df = pd.read_excel('dataset.xlsx')

#imprimir as primeiras linhas do DataFrame
df.head()

#Para nosso objetivo a identificação do paciente não importa, então vamos deletar essa coluna
del df['Patient ID']


# vamos colocar todos os nomes das colunas em letras minúsculas
df = df.rename(columns=str.lower)
df.head()

# vamos importar uma biblioteca que vai ajudar a ter uma visão geral do dados
# https://pypi.org/project/sweetviz/

import sweetviz as sv

df['urine - ph'].replace('Não Realizado', 0.0000001 ,inplace=True)
df['urine - ph'] = pd.to_numeric(df['urine - ph'], errors='coerce')

advert_report = sv.analyze(df, pairwise_analysis='off')

advert_report.show_html('Advertising.html')

# Vamos criar uma função customizadas para realizar o EDA
def EDA (df):

    eda_df = {}
    eda_df['Amount_NaN'] = df.isnull().sum()
    eda_df['%_NaN'] = df.isnull().mean().round(2)
    eda_df['DType'] = df.dtypes
    eda_df['Amount_Data'] = df.count()

    
    eda_df['Mean'] = np.round(df.mean(), 2);
    eda_df['Median'] = np.round(df.median(), 2);
    
    eda_df['Max'] = df.max()
    eda_df['Min'] = df.min()
    eda_df['STD'] = np.round(df.std(), 2)
    
    eda = pd.DataFrame(eda_df)
    
    colunas = sorted(df.columns.tolist(), reverse=False)
    eda['Amount_Unique'] = list(map(lambda x: len(df[x].unique().tolist()), colunas))

    return eda

#Chamar a função criada acima para gerar o DataFrame
informacao_df = EDA(df)

#imprimir as primeiras linhas
informacao_df.head()


def gerar_graficos(df):
    # definir a fonte utilizada nos gráficos
    plt.rcParams['font.family'] = 'monospace'
    plt.rcParams['font.monospace'] = 'Roboto Mono'

    ax = df["%_NaN"].value_counts(normalize=True).plot(kind="bar", figsize=(20, 10), cmap='YlGnBu_r');

    # título dos eixos
    ax.set_xlabel("% de dados faltantes", fontsize=26)
    ax.set_ylabel("% de colunas", fontsize=26)

    # tick labels.
    plt.setp(ax.get_xticklabels(), rotation=90, ha="right",
             rotation_mode="anchor", fontsize=20);

    plt.setp(ax.get_yticklabels(), rotation=0, ha="right",
             rotation_mode="anchor", fontsize=20);

    # título do gráfico
    plt.title("Dados Faltantes - COVID19 DataSet", fontsize=28);

    return (plt.show())

#Vamos começar olhando para as coluna com dados faltantes
gerar_graficos(informacao_df);

#Deletar as colunas com aproximadamente 100% de dados faltantes

#Seleção das colunas
del_colunas = informacao_df.loc[informacao_df["%_NaN"] == 1].index.tolist()

#Deletar as colunas
df1 = df.drop(columns = del_colunas)

# vamos ver quais colunas foram excluídas
del_colunas

#Recriar o DataFrame com as informações do novo dataset
informacao_df1 = EDA(df1)

gerar_graficos(informacao_df1);

#Vamos Analisar um pouco mais as colunas com muitos dados faltantes (Acima de 90%), que são categóricas
cols = informacao_df1.loc[(informacao_df1["DType"]=="object") \
                             & (informacao_df1["%_NaN"]>=0.9)].index.tolist()

#Vamos imprimir as colunas e seus valores únicos
for col in cols:
    print (col,"  ",df[col].unique().tolist())
    print ("\n")

#Considerando que as colunas acima estão associadas à urina, é bem provável que não seja muito relevante para nosso
#objetivo. Dessa maneira podemos excluir essas colunas sem perda alguma de informação
df2 = df1.drop(columns=cols)

#Recriar o DataFrame com as informações do novo dataset
informacao_df2 = EDA(df2)

gerar_graficos(informacao_df2);

#Para tentarmos entender melhor as colunas, vamos olhar com mais carinho apenas pacientes com COVID
df2_positivos = df2.loc[df2["sars-cov-2 exam result"]=="positive"]

#vamos gerar as informações somente dos pacientes com COVID-19
informacao_df2 = EDA(df2_positivos)

gerar_graficos(informacao_df2);

#Vamos Analisar um pouco mais as colunas quando selecionamos apenas casos positivos para COVID-19
cols = informacao_df2.loc[(informacao_df2["Amount_Unique"]== 2) & \
                          (informacao_df2["DType"]== "object") ].index.tolist()

#cols.remove("sars-cov-2 exam result")

for col in cols:
    if (len(df2_positivos[col].unique().tolist()) ==2):
        print (col,"  ", df2_positivos[col].unique().tolist())
        print ("\n")

#Excluir as colunas acima
df3 = df2.drop(columns=cols)

df3.shape

#Recriar o DataFrame com as informações do novo dataset
informacao_df3 = EDA(df3)

gerar_graficos(informacao_df3);

#Após essa limpeza inicial (na qual buscamos entender o significado das colunas excluídas), vamos retirar
#as colunas do tipo float que apresentam muitos dados faltantes

cols = informacao_df3.loc[(informacao_df3["%_NaN"]> 0.95) & (informacao_df3["DType"] == "float")].index.tolist()

df4 = df3.drop(columns=cols)

#Vamos conferir como esta o conjunto de dados nesse ponto da limpeza de dados
informacao_df4 = EDA(df4)

gerar_graficos(informacao_df4);

#Vamos excluir linhas que apresentam mas de 50% dos dados faltantes
df5 = df4.dropna(thresh = int(df4.shape[1]*0.5))
df4.shape

df5.shape

#Atualizando as informações do dataset
informacao_df5 = EDA(df5)

gerar_graficos(informacao_df5);

#Vamos Analisar um pouco mais as colunas com muitos dados faltantes
cols = informacao_df5.loc[(informacao_df5["%_NaN"]==0.67) & (informacao_df5["DType"]!= "int64") ].index.tolist()
cols

#vamos selecionar as colunas que apresentam Influenza no nome
influenza_b_cols = df5.loc[:, df5.columns.str.startswith('influenza')].columns.tolist()
influenza_b_cols

df6 = df5.drop(columns=influenza_b_cols, axis=1)

#Atualizar as informações do data set
informacao_df6 = EDA(df6)


gerar_graficos(informacao_df6);

#vamos ver as colunas que apresentam mais dados faltantes
cols = informacao_df6.loc[(informacao_df6["%_NaN"] == 0.39) & (informacao_df6["DType"]== "object") ].index.tolist()
cols

#como são exames relacionadas à outras doenças vamos exclui-las
df7 = df6.drop(columns=cols)

#Atualizar as informações do data set
informacao_df7 = EDA(df7)

gerar_graficos(informacao_df7);

#Vamos ver as informações que contém nas colunas que apresentam mais de 30% de dados faltantes
cols = informacao_df7.loc[(informacao_df7["%_NaN"] >= 0.3)].index.tolist()
cols

#No primeiro instante não parecem informações relevantes para nosso objetivo
df8 = df7.drop(columns=cols)

#Atualizar as informações sobre o dataset
informacao_df8 = EDA(df8)

gerar_graficos(informacao_df8);

df9 = df8.fillna(df8.median())

#Atualizar as informações sobre o dataset
informacao_df9 = EDA(df9)
# 
gerar_graficos(informacao_df9);

informacao_df9

print("------------" * 3)



# My Turn

*`Escrevi minhas notas e análises em inglês, pois vivi a maior parte da minha vida no Reino Unido e acho mais eficiente me expressar dessa forma.`*

Initially, I decided to visualize what we can learn from the entire dataset after applying a Logistic Regression. Using oversampling I achieved a confusion matrix which did not perform adequately. It did well when predicting positive test results however about half of the positive test results returned as false negatives. Which for the needs of this analysis is a poor performance as we could potentially misinform people who have contracted the virus which is the worst case scenario. Our model is too generous.

In [6]:
df9.head()

data_final=df9.copy()
data_final.columns.values

print(df9.values)

X = data_final.loc[:, data_final.columns != 'sars-cov-2 exam result']
Y = data_final.loc[:, data_final.columns == 'sars-cov-2 exam result']

# print('---------', ' save to mat file ', '------------')

# for key in data_final:
#   if len(key) > 29:
#     new_key = key\
#       .replace(' (1=yes, 0=no)', '')\
#       .replace('patient addmited to ','')\
#       .replace('mean corpuscular ', 'mc ')\
#       .replace('red blood cell ','rbc ')\
#       .replace('hemoglobin ', 'h ')\
#       .replace('distribution weight', 'dw')\
#       .replace('reativa', 'rea').encode('utf-8')
#     data_final[new_key] = data_final[key]
#     del data_final[key]

# for key in data_final:
#   print(key)
# import scipy.io

# scipy.io.savemat('arrdata.mat', mdict={'X': data_final})

from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X, os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['sars-cov-2 exam result'])
# check numbers for our data
# print("data: ", os_data_X, os_data_y)
print("length of oversampled data is ",len(os_data_X))
print("Number of negative patients in oversampled data",len(os_data_y[os_data_y['sars-cov-2 exam result']=='negative']))
print("Number of positive",len(os_data_y[os_data_y['sars-cov-2 exam result']=='positive']))
print("Proportion of negative patient data in oversampled data is ",len(os_data_y[os_data_y['sars-cov-2 exam result']=='negative'])/len(os_data_X))
print("Proportion of positive patient data in oversampled data is ",len(os_data_y[os_data_y['sars-cov-2 exam result']=='positive'])/len(os_data_X))

# print("---------" *2, " recursive feature elimination ", "--------" * 2)

# data_final_vars=data_final.columns.values.tolist()
# y=['y']
# X=[i for i in data_final_vars if i not in y]
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LogisticRegression
# logreg = LogisticRegression()
# rfe = RFE(logreg, 20)
# rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
# print(rfe.support_)
# print(rfe.ranking_)


print("----------" * 2, " model ", "-------" * 2)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}%'.format(100 * (logreg.score(X_test, y_test))))
# print(logreg)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)

print(confusion_matrix)

[[17 'negative' 0 ... -0.6250726580619812 -0.6190860271453857
  -0.147894948720932]
 [1 'negative' 0 ... -0.9788991212844849 -0.127395361661911
  -0.2869857549667358]
 [9 'negative' 0 ... -1.067355036735535 0.880570113658905
  -0.3932908624410629]
 ...
 [15 'negative' 0 ... -1.155811905860901 -0.06183667480945587
  0.5614683032035828]
 [17 'negative' 0 ... -0.4481598734855652 1.552547812461853
  0.609156608581543]
 [19 'positive' 0 ... -0.1827902793884277 0.3806847631931305
  -0.503570020198822]]
length of oversampled data is  724
Number of negative patients in oversampled data 362
Number of positive 362
Proportion of negative patient data in oversampled data is  0.5
Proportion of positive patient data in oversampled data is  0.5
--------------------  model  --------------
Accuracy of logistic regression classifier on test set: 91.71%
[[156   1]
 [ 14  10]]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Further Filtering

Clearly my initial approach was ambitious. I need to make a more in depth attempt at determining what features are potentially relevant to the contraction of the virus and could be more efficient in teaching us about the problem.

The following analysis is based on the idea that blood related features could potentially be more insightful. Considering that the main identifiers regarding our immune system and oxygen levels are found in our blood stream.

This however, provided us with a far less performant model, with more false negatives and less true positives.

In [60]:

# with bloodcell related features
blood_df = data_final.copy()
del blood_df['patient age quantile']
del blood_df['proteina c reativa mg/dl']
del blood_df['patient addmited to regular ward (1=yes, 0=no)']
del blood_df['patient addmited to semi-intensive unit (1=yes, 0=no)']
del blood_df['patient addmited to intensive care unit (1=yes, 0=no)']

X = blood_df.loc[:, blood_df.columns != 'sars-cov-2 exam result']
Y = blood_df.loc[:, blood_df.columns == 'sars-cov-2 exam result']

print("----------" * 2, " model ", "-------" * 2)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}%'.format(100 * (logreg.score(X_test, y_test))))
# print(logreg)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)

print(confusion_matrix)

--------------------  model  --------------
Accuracy of logistic regression classifier on test set: 86.74%
[[153   4]
 [ 20   4]]


  y = column_or_1d(y, warn=True)


# More specific Identifiers

Perhaps we must again filter the more specifically related features in order to acheive a more performant model. I also decided to migrate from the Logistic Regression approach considering the model so far has been provide us with a high level of false negatives.

Evidently, the Naive Bayse model is closer to the desired result though still lack luster. While the count of true positives has reduced, we have brought our false negatives under control and can look to improve on this.

In [68]:

# with bloodcell related features
blood_oxygen_df = data_final.copy()
del blood_oxygen_df['patient age quantile']
del blood_oxygen_df['patient addmited to regular ward (1=yes, 0=no)']
del blood_oxygen_df['patient addmited to semi-intensive unit (1=yes, 0=no)']
del blood_oxygen_df['patient addmited to intensive care unit (1=yes, 0=no)']
del blood_oxygen_df['mean corpuscular volume (mcv)']

print(blood_oxygen_df.columns)

X = blood_oxygen_df.loc[:, blood_oxygen_df.columns != 'sars-cov-2 exam result']
Y = blood_oxygen_df.loc[:, blood_oxygen_df.columns == 'sars-cov-2 exam result']


print("----------" * 2, " model ", "-------" * 2)

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}%'.format(100 * (nb.score(X_test, y_test))))
# print(logreg)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)

print(confusion_matrix)

Index(['sars-cov-2 exam result', 'hematocrit', 'hemoglobin', 'platelets',
       'mean platelet volume ', 'red blood cells', 'lymphocytes',
       'mean corpuscular hemoglobin concentration (mchc)', 'leukocytes',
       'basophils', 'mean corpuscular hemoglobin (mch)', 'eosinophils',
       'monocytes', 'red blood cell distribution width (rdw)', 'neutrophils',
       'proteina c reativa mg/dl'],
      dtype='object')
--------------------  model  --------------
Accuracy of logistic regression classifier on test set: 87.85%
[[143  14]
 [  8  16]]


  y = column_or_1d(y, warn=True)


# Imunity

Perhaps features related to imunity can provide us with some significant insight. This approach has provided us with more reasonable predictions though perhaps more data and exploration is needed to come to a solid conclusion. I applied an oversampling of the data, achieving a more even distribution of results.

This approach has clearly provided us with more cohessive predictions, though there is still much to be improved on.

In [91]:
imunity = data_final.copy()

del imunity['patient addmited to regular ward (1=yes, 0=no)']
del imunity['patient addmited to semi-intensive unit (1=yes, 0=no)']
del imunity['hematocrit']
del imunity['red blood cells']
del imunity['red blood cell distribution width (rdw)']
del imunity['patient addmited to intensive care unit (1=yes, 0=no)']


X = imunity.loc[:, imunity.columns != 'sars-cov-2 exam result']
Y = imunity.loc[:, imunity.columns == 'sars-cov-2 exam result']

from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X, os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['sars-cov-2 exam result'])

print("length of oversampled data is ",len(os_data_X))
print("Number of negative patients in oversampled data",len(os_data_y[os_data_y['sars-cov-2 exam result']=='negative']))
print("Number of positive",len(os_data_y[os_data_y['sars-cov-2 exam result']=='positive']))
print("Proportion of negative patient data in oversampled data is ",len(os_data_y[os_data_y['sars-cov-2 exam result']=='negative'])/len(os_data_X))
print("Proportion of positive patient data in oversampled data is ",len(os_data_y[os_data_y['sars-cov-2 exam result']=='positive'])/len(os_data_X))


print("----------" * 2, " model ", "-------" * 2)

from sklearn.svm import LinearSVR
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}%'.format(100 * (nb.score(X_test, y_test))))
# print(logreg)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)

print(confusion_matrix)

length of oversampled data is  724
Number of negative patients in oversampled data 362
Number of positive 362
Proportion of negative patient data in oversampled data is  0.5
Proportion of positive patient data in oversampled data is  0.5
--------------------  model  --------------
Accuracy of logistic regression classifier on test set: 90.06%
[[148   9]
 [  9  15]]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# Notes

- My impression is that this dataset is not particularly effective in helping us understand the tendencies of the Covid19 virus. Most likely this can be attributed to my limitations as an analyser of the data provided as well as my limited knowledge of potential models to apply and how to apply them.
- With that said, it was a great learning experience and I definately feel that I have advanced significantly in this discipline.