# Preparing data

This notebook reads data processed by Galaxy and

 1. Validates sites to ensure that coordinates are correct
 2. Adds information about variants of concern (VOC) and sites under selection


In [None]:
import pandas as pd

In [None]:
!pip install biopython pandasql

In [None]:
from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query `q` through pandasql against the notebook globals.

    The notebook's global namespace is passed to `sqldf` as the environment,
    so top-level DataFrames (e.g. `sites`, `sel`, `voc`) can be referenced by
    name inside the query.

    Parameters
    ----------
    q : str
        SQL query text.

    Returns
    -------
    Whatever `sqldf` returns for the query.
    """
    return sqldf(q, globals())

In [None]:
# Maps the FUNCLASS codes found in the input tables to readable effect labels.
# Both '.' and 'NONE' are reported as 'Non-coding'.
funclass_translation = {
    'SILENT': 'Synonymous',
    'MISSENSE': 'Non-synonymous',
    'NONSENSE': 'Stop',
    '.': 'Non-coding',
    'NONE': 'Non-coding',
}

## Which dataset to run notebook on?
At this time there are three possible datasets:

 - Boston: `bos`
 - COG-Pre: `cog-pre`
 - COG-Post: `cog-post`

Setting this variable runs the entire notebook on that dataset. The actual URLs are looked up in the `datasets` dict (next cell).

In [None]:
# Dataset to process; must be a key of the `datasets` dict
# ('bos', 'cog-pre' or 'cog-post').
dataset = 'bos'

In [None]:
# For every dataset key: [URL of the per-sample table, URL of the per-variant table].
# All files live in the same repository folder and differ only by file prefix.
datasets = {
    name: [
        'https://github.com/galaxyproject/SARS-CoV-2/raw/master/data/var/{}_by_{}.tsv.gz'.format(prefix, table)
        for table in ('sample', 'var')
    ]
    for name, prefix in (
        ('bos', 'bos'),
        ('cog-pre', 'cog_20200917'),
        ('cog-post', 'cog_20201120'),
    )
}

In [None]:
# SARS-CoV-2 genome assembly (gzipped GenBank flat file)
gnm_url = (
    'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/'
    'GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.gbff.gz'
)
# Local archive name = last path component of the URL
gnm_file = gnm_url.rsplit('/', 1)[-1]

# Variants of concern (VOC) table
voc_url = 'https://github.com/galaxyproject/SARS-CoV-2/raw/master/data/voc/voc.tsv.gz'

# Sites under selection table
sel_url = 'https://github.com/galaxyproject/SARS-CoV-2/raw/master/data/selection/selection.tsv.gz'

In [None]:
# Get SARS-CoV-2 RefSeq genomes (in GenBank format) from NCBI
import os.path
from os import path
if not path.exists(gnm_file[:-3]):
    !wget -nc {gnm_url}
    !gunzip {gnm_file}
else:
    print('File {} is already here\nDoing nothing!'.format(gnm_file))

In [None]:
from Bio import SeqIO
# Parse the decompressed GenBank file (archive name minus the '.gz' suffix)
genome = SeqIO.read(gnm_file[:-3], "genbank")

In [None]:
# Get variants by sample: first URL of the selected dataset, read as a
# tab-separated table (pandas infers the gzip compression from the extension)
var = pd.read_csv(datasets[dataset][0],sep='\t')

In [None]:
# Number of rows in the per-sample variant table
len(var)

In [None]:
# Replace FUNCLASS codes with the readable labels from funclass_translation
var = var.replace({'FUNCLASS':funclass_translation})

In [None]:
# Changing coordinates to 0-based
# NOTE: not idempotent. Re-running this cell subtracts 1 again, so run it
# exactly once after loading `var`.
var['POS'] = var['POS']-1

In [None]:
# Validation function for checking against genome

def check_against_genome(seqobject,df,pos_base_list,l):
    """Check the bases listed in a table against a reference sequence.

    For every row, the `l` bases of the reference starting at the row's
    position are compared with the row's expected base string. If they
    differ, the comparison is repeated one position to the left; a match
    there is counted as correct and the row's position is recorded as a
    "slip" site. A one-line summary is printed.

    Rows whose expected base string does not have the same length as the
    extracted reference slice cannot be compared. They are counted neither
    as wrong nor as correct; their number is printed on a second line when
    it is non-zero.

    Parameters
    ----------
    seqobject : object exposing the reference as a sliceable ``.seq``
        attribute (the notebook passes the record returned by ``SeqIO.read``).
    df : pandas.DataFrame
        Table holding the positions and expected bases.
    pos_base_list : list
        Two column names: ``[position column, base column]``. Positions are
        used directly as 0-based slice offsets.
    l : int
        Number of reference bases to extract at each position.

    Returns
    -------
    tuple
        ``(slip_sites, wrong)`` where ``slip_sites`` maps each position that
        only matched after shifting one base to the left to ``True`` and
        ``wrong`` is a list of ``[expected, found, position]`` entries for
        rows that matched at neither offset.
    """
    # Slice the sequence itself rather than the whole record for every row
    sequence = seqobject.seq
    wrong = []
    slip_sites = dict()
    bad = 0
    good = 0
    skipped = 0
    pb = df[pos_base_list].to_numpy()
    for item in pb:
        pos, expected = item[0], item[1]
        base = sequence[pos:pos+l]
        if len(base) != len(expected):
            # Not comparable (e.g. multi-base allele checked with a shorter l)
            skipped += 1
        elif base == expected:
            good += 1
        else:
            # A negative start would wrap around to the end of the sequence,
            # so only look one base to the left when such a base exists.
            slip = sequence[pos-1:pos-1+l]
            if pos > 0 and slip == expected:
                good += 1
                slip_sites[pos] = True
            else:
                bad += 1
                wrong.append([expected,base,pos])
    print ('Total = {}, Wrong = {}, Correct = {}'.format(len(pb),bad,good))
    if skipped:
        print ('Skipped = {} (expected bases not of length {})'.format(skipped,l))
    return(slip_sites,wrong)

In [None]:
# Validate the REF base of every per-sample variant against the genome,
# one base per row (l=1); the cell displays the returned (slip_sites, wrong)
check_against_genome(genome,var,['POS','REF'],1)

In [None]:
# Preview the per-sample table after relabelling and the coordinate shift
var.head()

In [None]:
# Variants by site: second URL of the selected dataset, read as a
# tab-separated table
sites = pd.read_csv(datasets[dataset][1],sep='\t')

In [None]:
# Number of rows in the per-site table as loaded
len(sites)

In [None]:
# Replace FUNCLASS codes with the readable labels from funclass_translation
sites = sites.replace({'FUNCLASS':funclass_translation})

In [None]:
# Changing coordinates to 0-based
# NOTE: not idempotent. Re-running this cell subtracts 1 again, so run it
# exactly once after loading `sites`.
sites['POS'] = sites['POS']-1

In [None]:
# Validate the REF base of every site against the genome, one base per row
# (l=1); the cell displays the returned (slip_sites, wrong)
check_against_genome(genome,sites,['POS','REF'],1)

In [None]:
# Preview the per-site table after relabelling and the coordinate shift
sites.head()

In [None]:
# Table of sites under selection; the join below uses its pos, fel, meme and
# freq columns
sel = pd.read_csv(sel_url,sep='\t')

In [None]:
# Add info about sites under selection to the main variant table
# A site is matched to a selection row when sel.pos <= POS <= sel.pos+2
# (presumably the three positions of a codon starting at sel.pos; TODO confirm
# that sel.pos uses the same 0-based coordinates as the shifted sites.POS).
# Left join: sites without a match are kept with NULL fel/meme/freq.
sites = pysqldf('select sites.*, sel.fel,sel.meme,sel.freq from sites left join sel on sites.POS >= sel.pos and sites.POS <= sel.pos+2 ')

In [None]:
# Preview the per-site table with the added fel, meme and freq columns
sites.head()

In [None]:
# VOC table is read without a header row; column names are assigned here
voc = pd.read_csv(voc_url, sep='\t',names=['voc_set','mut','position'],header=None)

In [None]:
# Preview the VOC table
voc.head()

In [None]:
# Add one column per VOC set (named after the set) holding the VOC mutation
# label for sites whose POS lies in [position, position+3) of a VOC entry.
for item in voc['voc_set'].unique():
    # The set name is embedded as a single-quoted SQL string literal (with
    # embedded quotes doubled). SQLite treats double-quoted text as an
    # identifier first, so a set name that matches a column name would be
    # resolved as that column instead of being compared as a string.
    sites = pysqldf("select sites.*, mut from sites left join voc on (POS >= position and POS < position+3) and voc_set = '{0}'".format(str(item).replace("'", "''")))
    sites = sites.rename(columns={"mut": item})

In [None]:
# Show the sites annotated for the VOC set named P1.
# NOTE(review): assumes `voc` contains a set called 'P1'; otherwise the P1
# column does not exist and the query fails. Confirm against voc.tsv.gz.
pysqldf('select * from sites where P1 is not null')

In [None]:
# Row count after the joins; compare with the count right after loading,
# since a left join can repeat a site that matches several rows
len(sites)

In [None]:
# Assumes df has columns labelled 'ALT' and 'REF'
def chng_type(df):
    """Label every row of `df` as 'SNP' or 'Indel' in its 'type' column.

    A row is an 'SNP' when the REF and ALT strings have the same length and
    an 'Indel' otherwise. `df` is modified in place; nothing is returned.
    """
    same_length = df['REF'].str.len() == df['ALT'].str.len()
    df.loc[same_length, 'type'] = 'SNP'
    df.loc[~same_length, 'type'] = 'Indel'

In [None]:
# Add the 'type' column (SNP / Indel) to both tables in place
chng_type(var)
chng_type(sites)

In [None]:
# Write the gzip-compressed tables directly from pandas. A separate
# `gzip *.tsv` step would compress every .tsv in the working directory and
# would not overwrite an existing .gz when the notebook is re-run, leaving
# stale output behind.
var.to_csv('{}_by_sample_processed.tsv.gz'.format(dataset),sep='\t',index=False,compression='gzip')
sites.to_csv('{}_by_var_processed.tsv.gz'.format(dataset),sep='\t',index=False,compression='gzip')

In [None]:
# List the working directory to confirm the output files were written
!ls

In [None]:
from google.colab import files
# Download the two compressed output tables through the browser (Colab only)
files.download(dataset + '_by_sample_processed.tsv.gz')
files.download(dataset + '_by_var_processed.tsv.gz')