# Data Cleaning / Wrangling w/ Pandas

About pandas:

pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
built on top of the Python programming language.

In [1]:
# First step, import the pandas package

import os

import pandas as pd

In [2]:
# Next step, load the file that we want to clean.
# The workbook location can be overridden by setting the CIM_OVERFLOW_XLSX
# environment variable; when it is unset, the original hard-coded path is used.

DEFAULT_XLSX_PATH = '/Users/daniel.geanon/OneDrive - Karolinska Institutet/Mac/Desktop/211126_CIMOverflow.xlsx'

file = pd.read_excel(os.environ.get('CIM_OVERFLOW_XLSX', DEFAULT_XLSX_PATH))

In [3]:
# Display file

file

Unnamed: 0,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onsent,Date hospital admission,...,H√∂gsta Ferritin +/- 24 h fr√•n biobanksprov 4,H√∂gsta D-dimerer +/- 24 h fr√•n biobanksprov 4,H√∂gsta LD +/- 24 h fr√•n biobanksprov 4,H√∂gsta krea +/- 24 h fr√•n biobanksprov 4,H√∂gsta bilirubin +/- 24 h fr√•n biobanksprov 4,L√§gsta TPK +/- 24 h fr√•n biobanksprov 4,Positive blododling,Positiv nedre luftv√§gsodling,Positiv urinv√§gsodling,SARS-CoV-2_serum
0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,2020-04-23 00:00:00,...,,,,,,,n,n,n,n
1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,2020-04-23 00:00:00,...,,,,,,,n,n,n,n
2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,0,...,,,,,,,n,n,n,n
388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,0,...,,,,,,,n,n,n,n
389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,0,...,,,,,,,n,n,n,n
390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,0,...,,,,,,,n,n,n,n


In [4]:
# First step, let's read columns and check the names

file.columns.tolist()

['Subject ID',
 'Biobank Sample',
 'Disease Status',
 'Code for biobanksample',
 'Include?',
 'Exclusion Criteria Comments',
 'WSP',
 'Date for biobank sample',
 'Date disease onsent',
 'Date hospital admission',
 'Date hospital admission_2a',
 'Date hospital dismisal',
 'Date ICU',
 'Date ICU 2nd admission',
 'Date ICU 3rd admission',
 'Date invasive ventilation',
 'Date ECMO',
 'Date ICU dismisal',
 'Date ICU dismisal 2nd admission',
 'Date ICU dismisal 3rd admission',
 'Outcome',
 'Outcome Adjusted',
 'Date of death (if applicable)',
 'covid-19-diagnos: verifierad, klinisk/radiologisk, ej covid-19',
 'Date positive PCR test',
 'Age',
 'Sex',
 'BMI final',
 'Country of birth',
 'Smoking (nuvarande/tidigare/ej r√∂kare)',
 'CCI (Charlson comorbidity index)',
 'Ischemisk hs',
 'Hj√§rtsvikt',
 'Perifer k√§rlsjd',
 'Hypertoni',
 'Cerebrovaskul√§r sjd',
 'Demens',
 'Chronic pulmonary disease (1=KOL, 2=Astma, 3=Annat)',
 'Connective tissue disease = Reumatiska sjd',
 'Ulkus sjd',
 'Diabetes

In [5]:
# The Swedish characters in the column names came through garbled (e.g. 'ö' shows up as '√∂'),
# so let's try repairing a single example string first

example_string = 'r√∂kare'.replace('√∂', 'ö')
example_string

'rökare'

In [6]:
# Several Swedish characters (ö, ä, å) were garbled in the column names.
# Build a repaired list of names by chaining str.replace() over every column name.

new_col_name_list = [
    name.replace('√∂','ö').replace('√§','ä').replace('√•','å')
    for name in file.columns.tolist()
]

new_col_name_list

['Subject ID',
 'Biobank Sample',
 'Disease Status',
 'Code for biobanksample',
 'Include?',
 'Exclusion Criteria Comments',
 'WSP',
 'Date for biobank sample',
 'Date disease onsent',
 'Date hospital admission',
 'Date hospital admission_2a',
 'Date hospital dismisal',
 'Date ICU',
 'Date ICU 2nd admission',
 'Date ICU 3rd admission',
 'Date invasive ventilation',
 'Date ECMO',
 'Date ICU dismisal',
 'Date ICU dismisal 2nd admission',
 'Date ICU dismisal 3rd admission',
 'Outcome',
 'Outcome Adjusted',
 'Date of death (if applicable)',
 'covid-19-diagnos: verifierad, klinisk/radiologisk, ej covid-19',
 'Date positive PCR test',
 'Age',
 'Sex',
 'BMI final',
 'Country of birth',
 'Smoking (nuvarande/tidigare/ej rökare)',
 'CCI (Charlson comorbidity index)',
 'Ischemisk hs',
 'Hjärtsvikt',
 'Perifer kärlsjd',
 'Hypertoni',
 'Cerebrovaskulär sjd',
 'Demens',
 'Chronic pulmonary disease (1=KOL, 2=Astma, 3=Annat)',
 'Connective tissue disease = Reumatiska sjd',
 'Ulkus sjd',
 'Diabetes (1=

In [7]:
# Column names are now clean, so let's rename the columns with our new list

file.columns = new_col_name_list
file.columns

Index(['Subject ID', 'Biobank Sample', 'Disease Status',
       'Code for biobanksample', 'Include?', 'Exclusion Criteria Comments',
       'WSP', 'Date for biobank sample', 'Date disease onsent',
       'Date hospital admission',
       ...
       'Högsta Ferritin +/- 24 h från biobanksprov 4',
       'Högsta D-dimerer  +/- 24 h från biobanksprov 4',
       'Högsta LD  +/- 24 h från biobanksprov 4',
       'Högsta krea  +/- 24 h från biobanksprov 4',
       'Högsta bilirubin  +/- 24 h från biobanksprov 4',
       'Lägsta TPK  +/- 24 h från biobanksprov 4', 'Positive blododling',
       'Positiv nedre luftvägsodling', 'Positiv urinvägsodling',
       'SARS-CoV-2_serum'],
      dtype='object', length=144)

In [8]:
# Showing file

file

Unnamed: 0,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onsent,Date hospital admission,...,Högsta Ferritin +/- 24 h från biobanksprov 4,Högsta D-dimerer +/- 24 h från biobanksprov 4,Högsta LD +/- 24 h från biobanksprov 4,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum
0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,2020-04-23 00:00:00,...,,,,,,,n,n,n,n
1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,2020-04-23 00:00:00,...,,,,,,,n,n,n,n
2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,0,...,,,,,,,n,n,n,n
388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,0,...,,,,,,,n,n,n,n
389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,0,...,,,,,,,n,n,n,n
390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,0,...,,,,,,,n,n,n,n


In [9]:
# Let's apply the same character repair to every text cell in the dataframe.
# We iterate over the dataframe's own index and columns rather than a hard-coded
# row count, so this cell keeps working if the number of rows changes.
for row_label in file.index:
    for column in file.columns:
        cell = file.loc[row_label, column]
        # Only string cells can contain the garbled characters; leave numbers and dates alone
        if isinstance(cell, str):
            file.loc[row_label, column] = (
                cell.replace('√∂','ö').replace('√§','ä').replace('√•','å')
            )

file['Smoking (nuvarande/tidigare/ej rökare)'].value_counts()

ej rökare    177
tidigare     144
rökare        23
0             21
?             18
Ej rökare      4
Rökare         2
Tidigare       2
EJ rökare      1
Name: Smoking (nuvarande/tidigare/ej rökare), dtype: int64

In [10]:
# Let's explore the "Include?" Column

file['Include?'].value_counts()

1    387
0      5
Name: Include?, dtype: int64

In [11]:
# 0 presumably means "exclude", so we want to drop those rows from our dataframe
# First, build a boolean mask for rows where Include? == 0 and collect their index labels in a list

excluded_mask = file['Include?'] == 0
drop_list = file.index[excluded_mask].tolist()
drop_list

[105, 120, 189, 251, 316]

In [12]:
# Now drop the rows whose index labels are listed in drop_list

file = file.drop(index=drop_list)

file

Unnamed: 0,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onsent,Date hospital admission,...,Högsta Ferritin +/- 24 h från biobanksprov 4,Högsta D-dimerer +/- 24 h från biobanksprov 4,Högsta LD +/- 24 h från biobanksprov 4,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum
0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,2020-04-23 00:00:00,...,,,,,,,n,n,n,n
1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,2020-04-23 00:00:00,...,,,,,,,n,n,n,n
2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,2020-04-14 00:00:00,...,,,,,,,n,n,j,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,0,...,,,,,,,n,n,n,n
388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,0,...,,,,,,,n,n,n,n
389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,0,...,,,,,,,n,n,n,n
390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,0,...,,,,,,,n,n,n,n


In [13]:
# Let's explore more columns and see what further cleaning we can do

file.columns.tolist()

['Subject ID',
 'Biobank Sample',
 'Disease Status',
 'Code for biobanksample',
 'Include?',
 'Exclusion Criteria Comments',
 'WSP',
 'Date for biobank sample',
 'Date disease onsent',
 'Date hospital admission',
 'Date hospital admission_2a',
 'Date hospital dismisal',
 'Date ICU',
 'Date ICU 2nd admission',
 'Date ICU 3rd admission',
 'Date invasive ventilation',
 'Date ECMO',
 'Date ICU dismisal',
 'Date ICU dismisal 2nd admission',
 'Date ICU dismisal 3rd admission',
 'Outcome',
 'Outcome Adjusted',
 'Date of death (if applicable)',
 'covid-19-diagnos: verifierad, klinisk/radiologisk, ej covid-19',
 'Date positive PCR test',
 'Age',
 'Sex',
 'BMI final',
 'Country of birth',
 'Smoking (nuvarande/tidigare/ej rökare)',
 'CCI (Charlson comorbidity index)',
 'Ischemisk hs',
 'Hjärtsvikt',
 'Perifer kärlsjd',
 'Hypertoni',
 'Cerebrovaskulär sjd',
 'Demens',
 'Chronic pulmonary disease (1=KOL, 2=Astma, 3=Annat)',
 'Connective tissue disease = Reumatiska sjd',
 'Ulkus sjd',
 'Diabetes (1=

In [14]:
# Fix one more column name: 'onsent' is a typo for 'onset'
file = file.rename({'Date disease onsent': 'Date disease onset'}, axis='columns')
file.columns.tolist()

['Subject ID',
 'Biobank Sample',
 'Disease Status',
 'Code for biobanksample',
 'Include?',
 'Exclusion Criteria Comments',
 'WSP',
 'Date for biobank sample',
 'Date disease onset',
 'Date hospital admission',
 'Date hospital admission_2a',
 'Date hospital dismisal',
 'Date ICU',
 'Date ICU 2nd admission',
 'Date ICU 3rd admission',
 'Date invasive ventilation',
 'Date ECMO',
 'Date ICU dismisal',
 'Date ICU dismisal 2nd admission',
 'Date ICU dismisal 3rd admission',
 'Outcome',
 'Outcome Adjusted',
 'Date of death (if applicable)',
 'covid-19-diagnos: verifierad, klinisk/radiologisk, ej covid-19',
 'Date positive PCR test',
 'Age',
 'Sex',
 'BMI final',
 'Country of birth',
 'Smoking (nuvarande/tidigare/ej rökare)',
 'CCI (Charlson comorbidity index)',
 'Ischemisk hs',
 'Hjärtsvikt',
 'Perifer kärlsjd',
 'Hypertoni',
 'Cerebrovaskulär sjd',
 'Demens',
 'Chronic pulmonary disease (1=KOL, 2=Astma, 3=Annat)',
 'Connective tissue disease = Reumatiska sjd',
 'Ulkus sjd',
 'Diabetes (1=t

In [15]:
# There are lots of relevant dates, and we want to calculate day intervals
# 1) Date biobank sample
# 2) Date disease onset
# 3) Date hospital admission
# 4) Date positive PCR test

In [16]:
# Let's create new columns with day ranges for each of those conditions

# First, we need to check the datetime format of these dates

file['Date for biobank sample']

0      2020-04-28 00:00:00
1      2020-09-29 00:00:00
2      2020-04-30 00:00:00
3      2020-05-05 00:00:00
4      2020-05-12 00:00:00
              ...         
387                      0
388                      0
389                      0
390                      0
391                      0
Name: Date for biobank sample, Length: 387, dtype: object

In [17]:
# Example: look up a single date in the dataframe (row label 0, disease onset)
# and format it as a YYYY-MM-DD string

d1 = file.loc[0,'Date disease onset']
d1.strftime("%Y-%m-%d")

'2020-04-11'

In [18]:
# Second example date: the biobank sample date from the same row (label 0)
d2 = file.loc[0,'Date for biobank sample']
d2.strftime("%Y-%m-%d")

'2020-04-28'

In [19]:
# Define days_between function

from datetime import datetime

def days_between(d1, d2):
    """Return the number of whole calendar days from d1 to d2.

    Only the year, month and day of each argument are used; any time-of-day
    component is ignored. The result is negative when d2 falls before d1.

    Parameters
    ----------
    d1, d2 : date-like objects exposing .year, .month and .day
        (for example datetime.datetime values).

    Returns
    -------
    int
        The difference d2 - d1 in days.

    Raises
    ------
    AttributeError
        If either argument lacks .year/.month/.day (e.g. the int 0 or the string 'na').
    """
    # Rebuild each value at midnight directly from its date parts instead of
    # formatting it to a string and parsing it back.
    start = datetime(d1.year, d1.month, d1.day)
    end = datetime(d2.year, d2.month, d2.day)
    return (end - start).days

In [20]:
days_between(d1,d2)

17

In [21]:
# Let's use this function for every value in our dataframe
# First we will reset dataframe indices so that they are in a consecutive integer list
# (rows were dropped earlier, so the existing labels have gaps).
# reset_index() is called without drop=True, so the previous labels are kept in a new 'index' column.
# NOTE: this cell is not idempotent -- running it again calls reset_index() on the already-reset frame.

file = file.reset_index()
file

Unnamed: 0,index,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onset,...,Högsta Ferritin +/- 24 h från biobanksprov 4,Högsta D-dimerer +/- 24 h från biobanksprov 4,Högsta LD +/- 24 h från biobanksprov 4,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum
0,0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,...,,,,,,,n,n,n,n
1,1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,...,,,,,,,n,n,n,n
2,2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,...,,,,,,,n,n,j,n
3,3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,...,,,,,,,n,n,j,n
4,4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,...,,,,,,,n,n,j,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,...,,,,,,,n,n,n,n
383,388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,...,,,,,,,n,n,n,n
384,389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,...,,,,,,,n,n,n,n
385,390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,...,,,,,,,n,n,n,n


In [22]:
# reset_index() stored the previous row labels in a column named 'index';
# give that column a more descriptive name
file = file.rename(columns = {'index':'original index'})
file

Unnamed: 0,original index,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onset,...,Högsta Ferritin +/- 24 h från biobanksprov 4,Högsta D-dimerer +/- 24 h från biobanksprov 4,Högsta LD +/- 24 h från biobanksprov 4,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum
0,0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,...,,,,,,,n,n,n,n
1,1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,...,,,,,,,n,n,n,n
2,2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,...,,,,,,,n,n,j,n
3,3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,...,,,,,,,n,n,j,n
4,4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,...,,,,,,,n,n,j,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,...,,,,,,,n,n,n,n
383,388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,...,,,,,,,n,n,n,n
384,389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,...,,,,,,,n,n,n,n
385,390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,...,,,,,,,n,n,n,n


In [23]:
file['Date disease onset'].unique().tolist()

[datetime.datetime(2020, 4, 11, 0, 0),
 datetime.datetime(2020, 4, 12, 0, 0),
 datetime.datetime(2020, 6, 1, 0, 0),
 datetime.datetime(2020, 4, 23, 0, 0),
 datetime.datetime(2020, 5, 15, 0, 0),
 datetime.datetime(2020, 4, 9, 0, 0),
 datetime.datetime(2020, 4, 7, 0, 0),
 datetime.datetime(2020, 4, 14, 0, 0),
 datetime.datetime(2020, 3, 27, 0, 0),
 datetime.datetime(2020, 3, 23, 0, 0),
 datetime.datetime(2020, 3, 31, 0, 0),
 datetime.datetime(2020, 2, 27, 0, 0),
 datetime.datetime(2020, 3, 26, 0, 0),
 datetime.datetime(2020, 4, 16, 0, 0),
 datetime.datetime(2020, 4, 17, 0, 0),
 datetime.datetime(2020, 4, 20, 0, 0),
 datetime.datetime(2020, 4, 15, 0, 0),
 datetime.datetime(2020, 3, 24, 0, 0),
 datetime.datetime(2020, 3, 15, 0, 0),
 datetime.datetime(2020, 4, 5, 0, 0),
 datetime.datetime(2020, 4, 13, 0, 0),
 datetime.datetime(2020, 4, 30, 0, 0),
 datetime.datetime(2020, 4, 22, 0, 0),
 datetime.datetime(2020, 4, 29, 0, 0),
 datetime.datetime(2020, 3, 11, 0, 0),
 datetime.datetime(2020, 4, 2

In [24]:
# 2) Days since symptom onset
# Some entries are not real dates ('na', 0 or blank), so those rows get the 'NaN'
# placeholder string instead of being passed to days_between()

file['Days since symptom onset'] = ''

# Iterate over the dataframe's own index rather than a hard-coded row count
for row_label in file.index:
    d1 = file.loc[row_label, 'Date disease onset']
    d2 = file.loc[row_label, 'Date for biobank sample']
    if pd.isna(d1) or pd.isna(d2) or d1 == 'na' or d2 == 'na' or d1 == 0 or d2 == 0:
        # The string 'NaN' (not a float NaN) is kept so existing consumers of this column see the same values
        file.loc[row_label, 'Days since symptom onset'] = 'NaN'
    else:
        file.loc[row_label, 'Days since symptom onset'] = days_between(d1, d2)

file

Unnamed: 0,original index,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onset,...,Högsta D-dimerer +/- 24 h från biobanksprov 4,Högsta LD +/- 24 h från biobanksprov 4,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum,Days since symptom onset
0,0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,...,,,,,,n,n,n,n,17
1,1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,...,,,,,,n,n,n,n,171
2,2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,...,,,,,,n,n,j,n,18
3,3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,...,,,,,,n,n,j,n,23
4,4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,...,,,,,,n,n,j,n,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,...,,,,,,n,n,n,n,
383,388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,...,,,,,,n,n,n,n,
384,389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,...,,,,,,n,n,n,n,
385,390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,...,,,,,,n,n,n,n,


In [25]:
file['Days since symptom onset'].tolist()

[17,
 171,
 18,
 23,
 30,
 15,
 22,
 19,
 26,
 33,
 131,
 11,
 140,
 61,
 4,
 15,
 134,
 17,
 147,
 14,
 168,
 28,
 34,
 32,
 38,
 14,
 154,
 24,
 30,
 57,
 63,
 29,
 35,
 43,
 12,
 10,
 11,
 28,
 39,
 172,
 12,
 19,
 152,
 10,
 116,
 20,
 153,
 37,
 45,
 46,
 173,
 25,
 180,
 15,
 149,
 162,
 22,
 29,
 5,
 12,
 13,
 20,
 96,
 146,
 9,
 13,
 16,
 15,
 22,
 14,
 14,
 168,
 10,
 148,
 13,
 23,
 10,
 50,
 226,
 9,
 16,
 23,
 30,
 142,
 10,
 164,
 126,
 11,
 'NaN',
 5,
 12,
 10,
 10,
 17,
 'NaN',
 'NaN',
 20,
 27,
 34,
 170,
 30,
 33,
 7,
 14,
 96,
 24,
 15,
 8,
 166,
 16,
 6,
 11,
 151,
 11,
 168,
 17,
 171,
 19,
 145,
 'NaN',
 25,
 32,
 32,
 39,
 22,
 11,
 158,
 15,
 22,
 162,
 14,
 13,
 20,
 171,
 13,
 118,
 11,
 158,
 9,
 16,
 17,
 174,
 'NaN',
 'NaN',
 'NaN',
 9,
 15,
 162,
 8,
 12,
 163,
 7,
 14,
 9,
 'NaN',
 12,
 14,
 18,
 147,
 18,
 12,
 22,
 11,
 18,
 123,
 15,
 22,
 16,
 124,
 11,
 21,
 147,
 15,
 133,
 9,
 8,
 150,
 15,
 151,
 17,
 16,
 23,
 115,
 13,
 14,
 13,
 17,
 'NaN',
 51,

In [26]:
file['Date hospital admission'].unique().tolist()

[datetime.datetime(2020, 4, 23, 0, 0),
 datetime.datetime(2020, 4, 14, 0, 0),
 datetime.datetime(2020, 6, 6, 0, 0),
 datetime.datetime(2020, 5, 7, 0, 0),
 datetime.datetime(2020, 5, 23, 0, 0),
 datetime.datetime(2020, 4, 12, 0, 0),
 datetime.datetime(2020, 6, 3, 0, 0),
 datetime.datetime(2020, 4, 21, 0, 0),
 datetime.datetime(2020, 4, 2, 0, 0),
 datetime.datetime(2020, 4, 6, 0, 0),
 datetime.datetime(2020, 4, 5, 0, 0),
 datetime.datetime(2020, 3, 27, 0, 0),
 datetime.datetime(2020, 4, 17, 0, 0),
 datetime.datetime(2020, 4, 20, 0, 0),
 datetime.datetime(2020, 4, 28, 0, 0),
 datetime.datetime(2020, 4, 30, 0, 0),
 datetime.datetime(2020, 4, 3, 0, 0),
 datetime.datetime(2020, 3, 24, 0, 0),
 datetime.datetime(2020, 4, 10, 0, 0),
 datetime.datetime(2020, 5, 1, 0, 0),
 datetime.datetime(2020, 4, 27, 0, 0),
 datetime.datetime(2020, 5, 3, 0, 0),
 datetime.datetime(2020, 4, 29, 0, 0),
 datetime.datetime(2020, 5, 6, 0, 0),
 datetime.datetime(2020, 4, 22, 0, 0),
 datetime.datetime(2020, 4, 1, 0, 0

In [27]:
# 3) Days since hospital admission
# Rows whose dates are not real dates ('na', 0 or blank) get the 'NaN' placeholder string

file['Days since hospital admission'] = ''

# Iterate over the dataframe's own index rather than a hard-coded row count
for row_label in file.index:
    d1 = file.loc[row_label, 'Date hospital admission']
    d2 = file.loc[row_label, 'Date for biobank sample']
    if pd.isna(d1) or pd.isna(d2) or d1 == 'na' or d2 == 'na' or d1 == 0 or d2 == 0:
        # The string 'NaN' (not a float NaN) is kept so existing consumers of this column see the same values
        file.loc[row_label, 'Days since hospital admission'] = 'NaN'
    else:
        file.loc[row_label, 'Days since hospital admission'] = days_between(d1, d2)

file

Unnamed: 0,original index,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onset,...,Högsta LD +/- 24 h från biobanksprov 4,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum,Days since symptom onset,Days since hospital admission
0,0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,...,,,,,n,n,n,n,17,5
1,1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,...,,,,,n,n,n,n,171,159
2,2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,...,,,,,n,n,j,n,18,16
3,3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,...,,,,,n,n,j,n,23,21
4,4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,...,,,,,n,n,j,n,30,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,...,,,,,n,n,n,n,,
383,388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,...,,,,,n,n,n,n,,
384,389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,...,,,,,n,n,n,n,,
385,390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,...,,,,,n,n,n,n,,


In [28]:
file['Date positive PCR test'].unique().tolist()

[datetime.datetime(2020, 4, 23, 0, 0),
 datetime.datetime(2020, 4, 14, 0, 0),
 datetime.datetime(2020, 6, 6, 0, 0),
 datetime.datetime(2020, 5, 7, 0, 0),
 datetime.datetime(2020, 5, 23, 0, 0),
 datetime.datetime(2020, 4, 12, 0, 0),
 datetime.datetime(2020, 6, 3, 0, 0),
 datetime.datetime(2020, 4, 21, 0, 0),
 datetime.datetime(2020, 4, 2, 0, 0),
 datetime.datetime(2020, 4, 8, 0, 0),
 datetime.datetime(2020, 4, 5, 0, 0),
 datetime.datetime(2020, 3, 27, 0, 0),
 datetime.datetime(2020, 4, 17, 0, 0),
 datetime.datetime(2020, 4, 20, 0, 0),
 datetime.datetime(2020, 4, 28, 0, 0),
 datetime.datetime(2020, 4, 30, 0, 0),
 datetime.datetime(2020, 4, 3, 0, 0),
 datetime.datetime(2020, 3, 24, 0, 0),
 datetime.datetime(2020, 4, 10, 0, 0),
 datetime.datetime(2020, 5, 1, 0, 0),
 datetime.datetime(2020, 4, 27, 0, 0),
 datetime.datetime(2020, 5, 3, 0, 0),
 datetime.datetime(2020, 4, 29, 0, 0),
 datetime.datetime(2020, 5, 6, 0, 0),
 datetime.datetime(2020, 4, 22, 0, 0),
 datetime.datetime(2020, 4, 1, 0, 0

In [29]:
# 4) Days since positive PCR test
# Rows whose dates are not real dates ('na', 0 or blank) get the 'NaN' placeholder string;
# the 'na' check matches the other two day-interval cells

file['Days since positive PCR test'] = ''

# Iterate over the dataframe's own index rather than a hard-coded row count
for row_label in file.index:
    d1 = file.loc[row_label, 'Date positive PCR test']
    d2 = file.loc[row_label, 'Date for biobank sample']
    if pd.isna(d1) or pd.isna(d2) or d1 == 'na' or d2 == 'na' or d1 == 0 or d2 == 0:
        # The string 'NaN' (not a float NaN) is kept so existing consumers of this column see the same values
        file.loc[row_label, 'Days since positive PCR test'] = 'NaN'
    else:
        file.loc[row_label, 'Days since positive PCR test'] = days_between(d1, d2)

file

Unnamed: 0,original index,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onset,...,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum,Days since symptom onset,Days since hospital admission,Days since positive PCR test
0,0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,...,,,,n,n,n,n,17,5,5
1,1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,...,,,,n,n,n,n,171,159,159
2,2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,...,,,,n,n,j,n,18,16,16
3,3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,...,,,,n,n,j,n,23,21,21
4,4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,...,,,,n,n,j,n,30,28,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,387,H20,HD,HD,RAPIDCTRL20,1,0,6,0,0,...,,,,n,n,n,n,,,
383,388,H21,HD,HD,RAPIDCTRL21,1,0,13,0,0,...,,,,n,n,n,n,,,
384,389,H22,HD,HD,RAPIDCTRL22,1,0,13,0,0,...,,,,n,n,n,n,,,
385,390,H23,HD,HD,RAPIDCTRL23,1,0,13,0,0,...,,,,n,n,n,n,,,


In [30]:
# Relevant day ranges are added, let's explore other columns that we can clean
file.columns.tolist()

['original index',
 'Subject ID',
 'Biobank Sample',
 'Disease Status',
 'Code for biobanksample',
 'Include?',
 'Exclusion Criteria Comments',
 'WSP',
 'Date for biobank sample',
 'Date disease onset',
 'Date hospital admission',
 'Date hospital admission_2a',
 'Date hospital dismisal',
 'Date ICU',
 'Date ICU 2nd admission',
 'Date ICU 3rd admission',
 'Date invasive ventilation',
 'Date ECMO',
 'Date ICU dismisal',
 'Date ICU dismisal 2nd admission',
 'Date ICU dismisal 3rd admission',
 'Outcome',
 'Outcome Adjusted',
 'Date of death (if applicable)',
 'covid-19-diagnos: verifierad, klinisk/radiologisk, ej covid-19',
 'Date positive PCR test',
 'Age',
 'Sex',
 'BMI final',
 'Country of birth',
 'Smoking (nuvarande/tidigare/ej rökare)',
 'CCI (Charlson comorbidity index)',
 'Ischemisk hs',
 'Hjärtsvikt',
 'Perifer kärlsjd',
 'Hypertoni',
 'Cerebrovaskulär sjd',
 'Demens',
 'Chronic pulmonary disease (1=KOL, 2=Astma, 3=Annat)',
 'Connective tissue disease = Reumatiska sjd',
 'Ulkus sj

In [31]:
# Let's explore some of the variables in our cohort further
# NOTE(review): the healthy-donor (HD) rows use 0 in their date columns; if 0 is also the
# placeholder for a missing Age, the summary statistics below are skewed -- confirm before interpreting
file['Age'].describe()

count    387.000000
mean      56.165375
std       17.716786
min        0.000000
25%       52.000000
50%       60.000000
75%       67.000000
max       91.000000
Name: Age, dtype: float64

In [32]:
file['Sex'].value_counts()

m    284
k     82
0     21
Name: Sex, dtype: int64

In [33]:
file['BMI final'].describe()

count    387.000000
mean      27.149561
std        8.668428
min        0.000000
25%       24.860000
50%       27.700000
75%       31.325000
max       55.000000
Name: BMI final, dtype: float64

In [34]:
file['Smoking (nuvarande/tidigare/ej rökare)'].value_counts()

ej rökare    174
tidigare     143
rökare        22
0             21
?             18
Ej rökare      4
Rökare         2
Tidigare       2
EJ rökare      1
Name: Smoking (nuvarande/tidigare/ej rökare), dtype: int64

In [35]:
file['CCI (Charlson comorbidity index)'].describe()

count    387.000000
mean       1.736434
std        2.007187
min        0.000000
25%        0.000000
50%        1.000000
75%        3.000000
max       12.000000
Name: CCI (Charlson comorbidity index), dtype: float64

In [36]:
# From here, I can come up with a few challenges

In [37]:
# Patients older than 50

file.loc[file['Age'] > 50]

Unnamed: 0,original index,Subject ID,Biobank Sample,Disease Status,Code for biobanksample,Include?,Exclusion Criteria Comments,WSP,Date for biobank sample,Date disease onset,...,Högsta krea +/- 24 h från biobanksprov 4,Högsta bilirubin +/- 24 h från biobanksprov 4,Lägsta TPK +/- 24 h från biobanksprov 4,Positive blododling,Positiv nedre luftvägsodling,Positiv urinvägsodling,SARS-CoV-2_serum,Days since symptom onset,Days since hospital admission,Days since positive PCR test
0,0,PSP034_P92408,1,Acute,KKICOV034,1,0,2,2020-04-28 00:00:00,2020-04-11 00:00:00,...,,,,n,n,n,n,17,5,5
1,1,PSP034_P92408,7,Convalescent,KKICOV436,1,0,13,2020-09-29 00:00:00,2020-04-11 00:00:00,...,,,,n,n,n,n,171,159,159
2,2,PSP039_P92649,1,Acute,KKICOV039,1,0,2,2020-04-30 00:00:00,2020-04-12 00:00:00,...,,,,n,n,j,n,18,16,16
3,3,PSP039_P92649,2,Acute,KKICOV048,1,0,3,2020-05-05 00:00:00,2020-04-12 00:00:00,...,,,,n,n,j,n,23,21,21
4,4,PSP039_P92649,3,Acute,KKICOV071,1,0,3,2020-05-12 00:00:00,2020-04-12 00:00:00,...,,,,n,n,j,n,30,28,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,365,PSP496,1,Acute,KKICOV496,1,0,12,2020-08-18 00:00:00,2020-07-31 00:00:00,...,,,,j,j,n,n,18,15,15
361,366,PSP498,1,Acute,KKICOV498,1,0,12,2020-07-03 00:00:00,2020-06-22 00:00:00,...,,,,n,n,n,j,11,5,5
362,367,PSP498,2,Acute,KKICOV500,1,0,12,2020-07-10 00:00:00,2020-06-22 00:00:00,...,,,,n,n,n,j,18,12,12
363,368,PSP498,6,Convalescent,KKICOV429,1,0,14,2020-10-06 00:00:00,2020-06-22 00:00:00,...,,,,n,n,n,j,106,100,100


In [38]:
# What sort of treatment are the patients older than 50 on?
# Select the rows and the column in a single .loc call, then count the values
file.loc[file['Age'] > 50, 'Cytokinblockad före biobanksprov1'].value_counts()

0    265
1     37
Name: Cytokinblockad före biobanksprov1, dtype: int64

In [42]:
# Mean neutrophil count across rows where Biobank Sample == 1

first_sample = file['Biobank Sample'] == 1
file.loc[first_sample, 'Neutrophil ct +/- 24 hrs'].mean()

7.467894736842107

In [46]:
# Compared to Later in infection
file.loc[file['Biobank Sample'] == 3]['Neutrophil ct +/- 24 hrs'].mean()

7.200000000000001