### Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Read the sample data and see what it looks like

In [2]:
# Load the tab-separated sample metadata (read_table splits on tabs by
# default) and preview the first few rows.
seqs = pd.read_table("data/files-archive.tsv")
seqs.head()

Unnamed: 0,study_id,specimen collector sample ID,sample collected by,sequence submitted by,sample collection date,sample collection date null reason,geo_loc_name (country),geo_loc_name (state/province/territory),organism,isolate,...,consensus sequence software name,consensus sequence software version,breadth of coverage value,depth of coverage value,reference genome accession,bioinformatics protocol,gene name,diagnostic pcr Ct value,diagnostic pcr Ct value null reason,GISAID accession
0,UHTC-ON,UHTC_0471,Unity Health Toronto,Ontario Institute for Cancer Research (OICR),2020-12-15,,Canada,Ontario,Not Provided,hCoV-19/Canada/ON-UHTC_0471/2021,...,ARTIC-nanopolish,1.1.3,Not Provided,5384.4X,,,Not Provided,,Not Provided,
1,UHTC-ON,UHTC_0464,Unity Health Toronto,Ontario Institute for Cancer Research (OICR),2020-10-12,,Canada,Ontario,Not Provided,hCoV-19/Canada/ON-UHTC_0464/2021,...,ARTIC-nanopolish,1.1.3,Not Provided,2912.4X,,,Not Provided,,Not Provided,
2,UHTC-ON,UHTC_0463,Unity Health Toronto,Ontario Institute for Cancer Research (OICR),2020-10-02,,Canada,Ontario,Not Provided,hCoV-19/Canada/ON-UHTC_0463/2021,...,ARTIC-nanopolish,1.1.3,Not Provided,2661.6X,,,Not Provided,,Not Provided,
3,KHSC-ON,SLB3076,Queen's University / Kingston Health Sciences ...,Ontario Institute for Cancer Research (OICR),2020-04-12,,Canada,Ontario,Not Provided,hCoV-19/Canada/ON-SLB3076/2021,...,ncov2019-artic-nf,OICR v1.6,Not Provided,1671.9X,,,Not Provided,,Not Provided,
4,KHSC-ON,SLB3121,Queen's University / Kingston Health Sciences ...,Ontario Institute for Cancer Research (OICR),2020-04-22,,Canada,Ontario,Not Provided,hCoV-19/Canada/ON-SLB3121/2021,...,ncov2019-artic-nf,OICR v1.6,Not Provided,959.3X,,,Not Provided,,Not Provided,


### Print out all column names

In [3]:
# Show the column labels of the loaded table so we can decide which to keep.
print(seqs.columns)

Index(['study_id', 'specimen collector sample ID', 'sample collected by',
       'sequence submitted by', 'sample collection date',
       'sample collection date null reason', 'geo_loc_name (country)',
       'geo_loc_name (state/province/territory)', 'organism', 'isolate',
       'fasta header name', 'purpose of sampling',
       'purpose of sampling details', 'anatomical material', 'anatomical part',
       'body product', 'environmental material', 'environmental site',
       'collection device', 'collection method', 'host (scientific name)',
       'host disease', 'host age', 'host age null reason', 'host age unit',
       'host age bin', 'host gender', 'purpose of sequencing',
       'purpose of sequencing details', 'sequencing instrument',
       'sequencing protocol', 'raw sequence data processing method',
       'dehosting method', 'consensus sequence software name',
       'consensus sequence software version', 'breadth of coverage value',
       'depth of coverage value', 'r

### Remove the columns we don't need

In [4]:
# Metadata columns that are not needed for the rest of the analysis.
unused_columns = [
    'study_id', 'specimen collector sample ID', 'sample collected by',
    'sequence submitted by',
    'sample collection date null reason', 'geo_loc_name (country)',
    'organism', 'isolate',
    'fasta header name',
    'purpose of sampling details', 'anatomical material', 'anatomical part',
    'body product', 'environmental material', 'environmental site',
    'collection device', 'collection method', 'host (scientific name)',
    'host disease', 'host age null reason', 'host age unit',
    'purpose of sequencing details',
    'sequencing protocol', 'raw sequence data processing method',
    'dehosting method', 'consensus sequence software name',
    'consensus sequence software version', 'reference genome accession',
    'bioinformatics protocol', 'gene name',
    'diagnostic pcr Ct value null reason',
]

# Reassign rather than mutate in place, and skip labels that are already gone,
# so re-running this cell does not raise KeyError on the second execution.
# NOTE: errors="ignore" also hides a misspelled label; check the preview in the
# next cell to confirm the expected columns were removed.
seqs = seqs.drop(columns=unused_columns, errors="ignore")

In [5]:
# Preview the first rows to check which columns remain after the drop.
seqs.head()

Unnamed: 0,sample collection date,geo_loc_name (state/province/territory),purpose of sampling,host age,host age bin,host gender,purpose of sequencing,sequencing instrument,breadth of coverage value,depth of coverage value,diagnostic pcr Ct value,GISAID accession
0,2020-12-15,Ontario,Not Provided,,Not Applicable,Not Provided,Not Provided,Oxford Nanopore,Not Provided,5384.4X,,
1,2020-10-12,Ontario,Not Provided,,Not Applicable,Not Provided,Not Provided,Oxford Nanopore,Not Provided,2912.4X,,
2,2020-10-02,Ontario,Not Provided,,Not Applicable,Not Provided,Not Provided,Oxford Nanopore,Not Provided,2661.6X,,
3,2020-04-12,Ontario,Not Provided,,Not Applicable,Not Provided,Not Provided,Illumina,Not Provided,1671.9X,,
4,2020-04-22,Ontario,Not Provided,,Not Applicable,Not Provided,Not Provided,Illumina,Not Provided,959.3X,,


### Replace the placeholder strings for missing values ('Not Provided', 'Not Applicable', 'NaN') with np.nan and convert the collection date to a datetime object

In [6]:
# Placeholder strings that the source file uses to mean "no value".
missing_markers = ['Not Provided', 'NaN', 'Not Applicable']

# Map every placeholder to a real NaN, updating the frame in place.
seqs.replace(dict.fromkeys(missing_markers, np.nan), inplace = True)

# Parse the collection dates into datetime values.
collection_dates = seqs['sample collection date']
seqs['sample collection date'] = pd.to_datetime(collection_dates)

seqs.head()

Unnamed: 0,sample collection date,geo_loc_name (state/province/territory),purpose of sampling,host age,host age bin,host gender,purpose of sequencing,sequencing instrument,breadth of coverage value,depth of coverage value,diagnostic pcr Ct value,GISAID accession
0,2020-12-15,Ontario,,,,,,Oxford Nanopore,,5384.4X,,
1,2020-10-12,Ontario,,,,,,Oxford Nanopore,,2912.4X,,
2,2020-10-02,Ontario,,,,,,Oxford Nanopore,,2661.6X,,
3,2020-04-12,Ontario,,,,,,Illumina,,1671.9X,,
4,2020-04-22,Ontario,,,,,,Illumina,,959.3X,,


### What percentage of each column is missing?

In [9]:
# Per-column count of missing entries, divided by the number of rows and
# scaled to a percentage.
seqs.isna().sum().div(len(seqs)).mul(100)

sample collection date                      0.000000
geo_loc_name (state/province/territory)     0.000000
purpose of sampling                        21.533453
host age                                   92.793075
host age bin                               54.541226
host gender                                56.600890
purpose of sequencing                      22.678132
sequencing instrument                       0.000000
breadth of coverage value                   0.105356
depth of coverage value                    25.508509
diagnostic pcr Ct value                    73.663829
GISAID accession                            1.722714
dtype: float64

### All samples have associated collection dates!