# module import

In [None]:
# general needs
import os  
import glob
import re
import shutil
import gzip
import pandas as pd
from datetime import date
from google.cloud import storage

# biopython
from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord

# date input

In [None]:
# today's date as a YYYY-MM-DD string; used to stamp the output file names
current_date = date.today().isoformat()

# creating working directories and subdirectories 

In [None]:
# define working directory (do NOT change this)
home = '/home/diana_ir/Documents/nextstrain'

# define GCP working bucket without including the gs:// prefix
# NOTE(review): this currently holds the same local path as `home` rather than
# a bucket name -- confirm the intended value
bucket = '/home/diana_ir/Documents/nextstrain'

# define scripts path
scripts = '/home/diana_ir/scripts'

# main directory and the subdirectories the rest of the workflow writes into
main_dir = os.path.join(home, 'covid_sequencing')
assembly_metrics_dir = os.path.join(main_dir, 'assembly_metrics')
concat_metrics_dir = os.path.join(main_dir, 'concat_metrics')
fasta_files_dir = os.path.join(main_dir, 'fasta_files')
out_data_dir = os.path.join(main_dir, 'out_data')

# makedirs(exist_ok=True) creates missing parents and does nothing when the
# directory already exists, so no separate existence check is needed
for directory in (main_dir, assembly_metrics_dir, concat_metrics_dir,
                  fasta_files_dir, out_data_dir):
    os.makedirs(directory, exist_ok=True)

# exclusion sequences

In [None]:
# wastewater sequencing runs
wwt_runs = [
    'COVMIN_0008',
    'COVMIN_0013',
    'Covseq_WW5',
    'COVSEQ_WW6',
    'COVMIN_0054',
    'COVSEQ_CJ2021_WGS',
]

# sequencing runs flagged as bad
bad_seq = [
    # COVARC, test and midnight runs
    'COVARC_001', 'COVARC_001rr', 'COVARC_002', 'COVARC_002rex', 'COVARC_002rr',
    'COVARC_003', 'COVIDSeq_Test2_NextSeq550', 'COVMIN_Midnight3', 'COVMIN_Midnight4',
    'COVMIN_midnight_02',

    # COVMIN_00xx - COVMIN_02xx
    'COVMIN_0028', 'COVMIN_0029', 'COVMIN_0055', 'COVMIN_0093', 'COVMIN_0107',
    'COVMIN_0108', 'COVMIN_0136', 'COVMIN_0154', 'COVMIN_0156', 'COVMIN_0159',
    'COVMIN_0167', 'COVMIN_0167rr', 'COVMIN_0196', 'COVMIN_0197', 'COVMIN_0198',
    'COVMIN_0202', 'COVMIN_0203', 'COVMIN_0218', 'COVMIN_0222', 'COVMIN_0263',
    'COVMIN_0264', 'COVMIN_0294', 'COVMIN_0295',

    # COVMIN_03xx - COVMIN_06xx
    'COVMIN_0312', 'COVMIN_0340', 'COVMIN_0353', 'COVMIN_0363', 'COVMIN_0390',
    'COVMIN_0395', 'COVMIN_0402', 'COVMIN_0413', 'COVMIN_0438', 'COVMIN_0445',
    'COVMIN_0483', 'COVMIN_0486', 'COVMIN_0487', 'COVMIN_0503', 'COVMIN_0572',
    'COVMIN_0609', 'COVMIN_0625', 'COVMIN_0642', 'COVMIN_0647', 'COVMIN_0648',
    'COVMIN_0649', 'COVMIN_0650', 'COVMIN_0653',

    # COVMIN_07xx - COVMIN_08xx
    'COVMIN_0741', 'COVMIN_0759', 'COVMIN_0772', 'COVMIN_0813', 'COVMIN_0827',
    'COVMIN_0828', 'COVMIN_0829', 'COVMIN_0830', 'COVMIN_0831',

    # COVSEQ
    'COVSEQ_0049', 'COVSEQ_0051', 'COVSEQ_0062', 'COVSEQ_0120', 'COVSEQ_0018',
    'COVSEQ_0037', 'COVSEQ_0047', 'COVSEQ_0048',
]

# inclusion sequences

In [None]:
# covmin sequence reruns
covmin_reruns = [
    'COVMIN_0009', 'COVMIN_0010', 'COVMIN_0011', 'COVMIN_0012', 'COVMIN_0014',
    'COVMIN_0015', 'COVMIN_0016', 'COVMIN_0017', 'COVMIN_0018', 'COVMIN_0019',
    'COVMIN_0020', 'COVMIN_0021', 'COVMIN_0022', 'COVMIN_0023', 'COVMIN_0024',
    'COVMIN_0025', 'COVMIN_0026', 'COVMIN_0027', 'COVMIN_0028rr', 'COVMIN_0029rr',

    'COVMIN_0030', 'COVMIN_0031', 'COVMIN_0032', 'COVMIN_0033', 'COVMIN_0034',
    'COVMIN_0035', 'COVMIN_0036', 'COVMIN_0037', 'COVMIN_0038', 'COVMIN_0039',
    'COVMIN_0040', 'COVMIN_0041', 'COVMIN_0042', 'COVMIN_0043', 'COVMIN_0044',
    'COVMIN_0045', 'COVMIN_0046', 'COVMIN_0047', 'COVMIN_0048', 'COVMIN_0049',

    'COVMIN_0050', 'COVMIN_0051', 'COVMIN_0052', 'COVMIN_0053', 'COVMIN_0055rr',
    'COVMIN_0056', 'COVMIN_0057', 'COVMIN_0058', 'COVMIN_0059', 'COVMIN_0060',
    'COVMIN_0061', 'COVMIN_0062', 'COVMIN_0063', 'COVMIN_0064', 'COVMIN_0065',

    'COVMIN_0477', 'COVMIN_0489', 'COVMIN_0490', 'COVMIN_0491', 'COVMIN_0492',
    'COVMIN_0493', 'COVMIN_0495', 'COVMIN_0496', 'COVMIN_0497', 'COVMIN_0498',
    'COVMIN_0499', 'COVMIN_0500',
]

# archived sample runs
covarc_runs = [
    'NEXSEQ_001', 'NEXSEQ_002', 'NEXSEQ_003', 'NEXSEQ_004',
    'NEXSEQ_005', 'NEXSEQ_006', 'NEXSEQ_007', 'NEXSEQ_008',
    'NEXSEQ_009', 'NEXSEQ_010', 'NEXSEQ_011', 'NEXSEQ_013',
    'NEXSEQ_014', 'NEXSEQ_015', 'NEXSEQ_016', 'NEXSEQ_017',
]

# get a list of all the sequence runs in the GCP covid_terra bucket

In [None]:
# get list of seq_runs present in the covid_terra bucket
bucket_name = 'covid_terra'
bucket_prefix = ''  # empty prefix: list every blob in the bucket

client_storage = storage.Client()
blobs = client_storage.list_blobs(bucket_name, prefix = bucket_prefix)

n = 0
seq_runs = []
seen_runs = set()  # same content as seq_runs, kept for constant-time membership tests
regrexs = [re.compile('COVSEQ'), re.compile('COVMIN'), re.compile('Covseq'),
           re.compile('COVARC'), re.compile('NEXSEQ'), re.compile('COVID'), re.compile('NOVASEQ')]

# raw string so that "\-" is a regex escape and not an invalid Python string escape
run_name_pattern = re.compile(r'[0-9a-zA-Z_\-]+')

for blob in blobs:
    # the candidate run name is the first stretch of name characters in the blob name
    found = run_name_pattern.search(blob.name)
    if found is None:
        # nothing usable in this blob name; skip it instead of raising IndexError
        continue
    seq_run = found.group(0)

    if seq_run in seen_runs:
        continue

    # keep names that start with a known prefix and are neither wastewater nor bad runs
    has_known_prefix = any(regrex.match(seq_run) for regrex in regrexs)
    is_excluded = seq_run in wwt_runs or seq_run in bad_seq or 'WW' in seq_run
    if has_known_prefix and not is_excluded:
        n = n + 1
        seen_runs.add(seq_run)
        seq_runs.append(seq_run)
        print(n, "  ", seq_run)
print(len(seq_runs))

# pull and concatenate the assembly metrics

In [None]:
for seq_run in seq_runs:
    if re.search('COVSEQ', seq_run):
        assembly_metrics_terra_bucket = 'covid_terra/%s/terra_outputs/%s_sequence_assembly_metrics.csv' % (seq_run, seq_run)
    
    elif re.search('COVMIN', seq_run) and seq_run in covmin_reruns:
        assembly_metrics_terra_bucket = 'covid_terra/%s/terra_outputs_rr/%s_sequence_assembly_metrics.csv' % (seq_run, seq_run)
        
    elif re.search('COVMIN', seq_run):
        assembly_metrics_terra_bucket = 'covid_terra/%s/terra_outputs/%s_sequence_assembly_metrics.csv' % (seq_run, seq_run)
    
    elif re.search('COVARC', seq_run):
        assembly_metrics_terra_bucket = 'covid_terra/%s/terra_outputs/%s_sequence_assembly_metrics.csv' % (seq_run, seq_run)
    
    elif re.search('NOVASEQ', seq_run):
        assembly_metrics_terra_bucket = 'covid_terra/%s/terra_outputs/%s_sequence_assembly_metrics.csv' % (seq_run, seq_run)
    
    elif re.search('NEXSEQ', seq_run):
        assembly_metrics_terra_bucket = 'covid_terra/%s/terra_outputs/%s_sequence_assembly_metrics.csv' % (seq_run, seq_run)
    
    
  # check to see if that assembly metrics file already exists in the assembly_metrics_dir
    assembly_metrics_path = os.path.join(assembly_metrics_dir, '%s_sequence_assembly_metrics.csv' % seq_run)
    
    if not os.path.exists(assembly_metrics_path):
        print('adding %s_sequence_assembly_metrics.csv to assembly_metrics directory' % seq_run)
        !gsutil -m cp -r gs://{assembly_metrics_terra_bucket} {assembly_metrics_dir}
    
    print('')
    
# concatendate the files into one
os.chdir(assembly_metrics_dir)

data_frame_list = []
for file in glob.glob('*.csv'):
    d = pd.read_csv(file, dtype = {'accession_id' : object})
    data_frame_list.append(d)

df = pd.concat(data_frame_list)

# save it
outfile = os.path.join(concat_metrics_dir, 'concatenated_assembly_metrics_%s.csv' % current_date)
df.to_csv(outfile, index=False)

#check the size
print(df.shape)

# removing specific "bad samples" from the rerun tsv file

In [None]:
# path to the rerun samples spreadsheet 
# https://docs.google.com/spreadsheets/d/1PtaHHa8iuqM6fNGTSkHiinxc5poQnVt1Ti67yLSVQ6c/edit#gid=0
# the columns read below are accession_id and first_run
path = '/home/diana_ir/Documents/nextstrain/bad_noodles/re-run_samples.tsv'
re_runs = pd.read_csv(path, sep = '\t', dtype = {'accession_id' : object})
print(re_runs.shape)

# path to the concatenated dataset (CHANGE)
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/concat_metrics/concatenated_metrics_bad_samples_removed_2022-07-14.csv'
df = pd.read_csv(path, dtype = {'accession_id' : object})
print(df.shape)

# NOTE(review): isin() is given the whole re_runs DataFrame, which looks like it
# compares accession ids against the DataFrame's column labels, so this filter
# probably removes nothing -- confirm the intent before changing it
crit = ~df.accession_id.isin(re_runs)
df = df[crit]
# fresh 0..n-1 index: the row loop further down looks rows up by position number
df = df.reset_index(drop = True)

# remove re-run samples
print('... removing re-run samples')
print('...... num rows before: %d' % df.shape[0])


# create a dictionary of accession id and runs to drop...
# the value is a list of run names when first_run contains "/", the text before
# the first "(" when it contains "(", and first_run unchanged otherwise
re_runs_dict = {}
for row in range(re_runs.shape[0]):
    accession_id = re_runs.accession_id[row]
    first_run = re_runs.first_run[row]
    
    print()
    
    if re.search('/', str(first_run)):
        first_runs = first_run.split('/')
        first_runs_list = []
        for first_run in first_runs:
            first_runs_list.append(first_run)
        re_runs_dict[accession_id] = first_runs_list
    elif re.search('\(', str(first_run)):
        re_runs_dict[accession_id] = first_run.split('(')[0]
    else:
        re_runs_dict[accession_id] = first_run

    print(accession_id, ': ', re_runs_dict[accession_id])
        
# identify indexes (rows) with re-run samples
# NOTE(review): the run is matched as a substring of the stringified dictionary
# value, so e.g. 'COVMIN_0167' also matches an entry 'COVMIN_0167rr' -- confirm
# this is acceptable
indexes_to_drop = []
for row in range(df.shape[0]):
    accession_id = df.accession_id[row]
    if accession_id in re_runs_dict.keys():
        if df.seq_run[row] in str(re_runs_dict[accession_id]):
            indexes_to_drop.append(row)

#drop those rows
df = df[~df.index.isin(indexes_to_drop)]
print('...... num rows after: %d' % df.shape[0])

# # save it
outfile = os.path.join(concat_metrics_dir, 'concatenated_metrics_bad_samples_removed_%s.csv' % current_date)
df.to_csv(outfile, index=False)

#check the size
print(df.shape)

# removing weird characters in the fasta files (Molly had mentioned "-1" or "-2" suffixes in the accession ids), which might affect merging; if the files were already downloaded into the directory previously, there is no need to rerun this

In [None]:
# fix the "-1" samples...
# read concatenated metrics file...
outfile = os.path.join(concat_metrics_dir, 'concatenated_metrics_bad_samples_removed_%s.csv' % current_date)
metrics = pd.read_csv(outfile, dtype = {'accession_id' : object})

# compiled once; raw strings keep "\d" a regex escape, and the dot before "fa" is escaped
suffixed_fasta_pattern = re.compile(r'(\d{10})-\d_consensus_renamed\.fa')
suffixed_accession_pattern = re.compile(r'\d{10}-\d')


def _write_renamed_fasta(source_file, target_dir, accession_id):
    """Write the single record of source_file to <accession_id>_consensus_renamed.fa in target_dir.

    The record keeps its sequence; its id and name become 'CO-CDPHE-<accession_id>'
    and its description is cleared.
    """
    new_file_name = os.path.join(target_dir, '%s_consensus_renamed.fa' % accession_id)

    # correct fasta header...
    record = SeqIO.read(source_file, 'fasta')
    new_id = 'CO-CDPHE-%s' % accession_id
    new_record = SeqRecord(record.seq, id = new_id, name = new_id, description = '')

    with open(new_file_name, 'w') as handle:
        SeqIO.write(new_record, handle, 'fasta')


for seq_run in seq_runs:
    if not (re.search('NEXSEQ', seq_run) or seq_run in covarc_runs):
        continue

    seq_run_subdir = os.path.join(fasta_files_dir, seq_run)
    if not os.path.isdir(seq_run_subdir):
        # the fasta files for this run have not been downloaded yet; nothing to correct
        print('no fasta directory for %s, skipping' % seq_run)
        continue
    os.chdir(seq_run_subdir)

    for file in glob.glob('*'):
        found = suffixed_fasta_pattern.search(file)
        if found is None:
            continue

        print('correcting %s in %s' % (file, seq_run))
        accession_id = found.group(1)

        # check to make sure that the -1 and non -1 accession were not run in the same seq_run
        # otherwise the -1 will overwrite the non -1
        # will keep the one with the higher percent coverage....
        run_metrics = metrics[metrics.seq_run == seq_run]
        og_accessions = run_metrics.OG_accession_id.unique().tolist()

        if accession_id not in og_accessions:
            # only the suffixed sample is on this run: always write the corrected file
            _write_renamed_fasta(file, seq_run_subdir, accession_id)
            continue

        print('"-1" and non "-1" sample on same sequencing run... check for higher percent coverage')

        candidates = run_metrics[run_metrics.accession_id == accession_id].copy()
        # the coverage column can contain text (e.g. 'sample failed assembly'), so compare
        # numerically; values that are not numbers become NaN and sort last
        candidates['_coverage'] = pd.to_numeric(candidates.percent_non_ambigous_bases, errors = 'coerce')
        candidates = candidates.sort_values(by = '_coverage', ascending = False, na_position = 'last')
        candidates = candidates.reset_index(drop = True)

        higher_cvg_sample = candidates.loc[0, 'OG_accession_id']

        print('sample with higher coverage == %s' % higher_cvg_sample)

        # overwrite the non -1 file only when the suffixed sample has the higher coverage
        if suffixed_accession_pattern.search(str(higher_cvg_sample)):
            print('-1 sample will overwrite non -1 sample')
            _write_renamed_fasta(file, seq_run_subdir, accession_id)

# Dropping columns and removing samples and filtering

In [None]:
# reading the file (CHANGED EACH TIME)
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/concat_metrics/concatenated_assembly_metrics_2022-07-14.csv'
df = pd.read_csv(path, dtype = {'accession_id' : object})
print('original', df.shape)

# drop columns from the concat metrics bad samples removed file
drop_col = [
    'plate_name', 'plate_sample_well', 'primer_set', 'omicron_spike_mutations', 'delta_plus_spike_mutations', 
    'total_nucleotide_mutations', 'total_AA_substitutions', 'total_AA_deletions', 'mean_depth', 
    'number_aligned_bases', 'number_non_ambigous_bases', 'number_seqs_in_fasta', 'total_nucleotide_deletions',
    'total_nucleotide_insertions', 'num_reads', 'mean_base_quality', 'mean_map_quality', 
    'number_N_bases', 'tech_platform', 'read_type', 'analysis_date', 'samtools_mean_depth', 
    'samtools_percent_cvg', 'samtools_mean_base_quality', 'samtools_mean_map_quality', 'assembler_version', 
    'mean_map_quality.1', 'mean_depth.1', 'mean_base_quality.1', 'num_reads.1'
        ]

## if you run the previous step, then these variables need to be on the drop_col
# 'fasta_header' 'OG_accession_id' 'patient_state' 'patient_county'\n 'collection_date' 
# 'customer_name' 'first_name' 'last_name' 'dob'\n 'patient_zip' 'receive_date' 
# 'first_name' 'last_name' 'dob'

df = df.drop(columns = drop_col)
print('removing columns', df.shape)

# removing specific "keywords" in the spreadsheet
# a row is dropped when its accession_id contains any of these; rows with a
# missing accession_id are dropped too (na = True), as the chained filters did
exclude_keywords = [
    'POS', 'NTC', 'ntc', 'NC', 'DiaplexPositive', 'ExtractionPositive',
    'Control', 'Contaminated', 'PC', 'Blank',
]
has_keyword = df['accession_id'].str.contains('|'.join(exclude_keywords), na = True)
df = df[~has_keyword].copy()
print('removing specific keywords in the spreadsheet', df.shape)

# renaming specific samples in the spreadsheet: each listed id loses its "-1"/"-2" suffix
suffixed_accessions = [
    '2005230549-1', '2005230619-1', '2102326037-1', '2002280169-1', '2003090165-1',
    '2005190431-1', '2005190442-1', '2005190513-1', '2005190587-1', '2005230137-1',
    '2007090800-1', '2007090806-1', '2007090831-1', '2007090850-1', '2004150183-1',
    '2004180099-1', '2004200476-1', '2004250387-1', '2102455039-1', '2100820125-1',
    '2005160401-1', '2005240111-1', '2102326037-2', '2102455039-2', '2100820125-2',
]
rename_accessions = {suffixed: suffixed.split('-')[0] for suffixed in suffixed_accessions}
df['accession_id'] = df['accession_id'].replace(rename_accessions)
print('renaming samples', df.shape)


# filter out the bad runs and waste water samples...
crit = ~df.seq_run.isin(bad_seq)
crit2 = ~df.seq_run.isin(wwt_runs)
crit3 = df.accession_id.str.contains('^2')
ww = df[crit & crit2 & crit3]
print('removing bad runs, WWT runs', ww.shape)

# filter samples to >90% coverage
crit = ww.percent_non_ambigous_bases != 'sample failed assembly'
# copy so the dtype conversion below is not an assignment into a slice of df
ww = ww[crit].copy()
ww['percent_non_ambigous_bases'] = ww.percent_non_ambigous_bases.astype('float')
print(ww.dtypes)
print(ww.shape)
crit = ww.percent_non_ambigous_bases >= 90
flit_df = ww[crit]
print('filtering based on 90 percent coverage', flit_df.shape)

# drop duplicates
# sort first and de-duplicate the SORTED frame, so that the row kept for each
# accession_id is the one with the highest coverage
drop = flit_df.sort_values(by ='percent_non_ambigous_bases', ascending = False)
drop = drop.drop_duplicates(subset = 'accession_id', keep = 'first')
print('dropping duplicates', drop.shape)


# save filtered metrics
outfile = os.path.join(out_data_dir, 'filtered_assembly_metrics_%s.csv' % current_date)
drop.to_csv(outfile, index = False)

# download fasta files to directory (if the files don't download, delete the folder and rerun this command)

In [None]:
# we are first going to pull all the fasta file directories from the bucket
for seq_run in seq_runs:
    # create seq_run subdir in the fasta's directory
    seq_run_subdir = os.path.join(fasta_files_dir, seq_run)
    
    if not os.path.exists(seq_run_subdir):
        print(seq_run)
        os.mkdir(seq_run_subdir)
        
        if re.search('COVSEQ', seq_run):
            bucket_path = 'covid_terra/%s/terra_outputs/assemblies/*consensus_renamed.fa' % (seq_run)
        elif re.search('COVMIN', seq_run) and seq_run in covmin_reruns:
            bucket_path = 'covid_terra/%s/terra_outputs_rr/assemblies/*consensus_renamed.fa' % (seq_run )
        elif re.search('COVMIN', seq_run):
            bucket_path = 'covid_terra/%s/terra_outputs/assemblies/*consensus_renamed.fa' % (seq_run )
        elif re.search('COVARC', seq_run):
            bucket_path = 'covid_terra/%s/terra_outputs/assemblies/*consensus_renamed.fa' % (seq_run )
        elif re.search('NOVASEQ', seq_run):
            bucket_path = 'covid_terra/%s/terra_outputs/assemblies/*consensus_renamed.fa' % (seq_run )         
        elif re.search('NEXSEQ', seq_run):
            bucket_path = 'covid_terra/%s/terra_outputs/assemblies/*consensus_renamed.fa' % (seq_run )
        
        # download the fasta file to the fasta files tmp directory
        !gsutil -m cp -r gs://{bucket_path} {seq_run_subdir}
    

# filter to 90% cov samples and pull fasta files in together

In [None]:
# pathway to the filtered assembly metrics file (CHANGE)
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/filtered_assembly_metrics_2022-07-14.csv'
metadata = pd.read_csv(path, dtype = {'accession_id' : object})
print (metadata.shape)

# using the filtered assembly metrics we want to pull out the fasta files for those sequences
# start from a clean output file, because records are appended one by one below
out_fasta = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/all_sequence_data.fasta'
if os.path.isfile(out_fasta):
    os.remove(out_fasta)

fasta_path_template = '/home/diana_ir/Documents/nextstrain/covid_sequencing/fasta_files/%s/%s_consensus_renamed.fa'

for accession_id, seq_run in zip(metadata.accession_id, metadata.seq_run):
    fasta_file_path = fasta_path_template % (seq_run, accession_id)

    print('adding %s_consensus_renamed.fa from %s to concatenated fasta file' % (accession_id, seq_run))
    record = SeqIO.read(fasta_file_path, 'fasta')
    with open(out_fasta, 'a') as out_handle:
        SeqIO.write(record, out_handle, 'fasta')


# removing dups from multi-fasta files

In [None]:
from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid

# multi-fasta file that is read and checked for duplicate sequences
path = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/all_sequence_data_dups.fasta"
# multi-fasta file the de-duplicated records are written to
out = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/all_sequence_data.fasta"

def remove_dup_seqs(records):
    """Yield each record whose sequence has not been seen before.

    Sequences are compared through their seguid checksum. A record whose
    checksum was already seen is reported with an "Ignoring <id>" message
    and not yielded.
    """
    seen = set()
    for rec in records:
        digest = seguid(rec.seq)
        if digest not in seen:
            seen.add(digest)
            yield rec
        else:
            print ("Ignoring %s" % rec.id)

# lazily filter the parsed records and write the survivors to the output fasta
records = remove_dup_seqs(SeqIO.parse(path, "fasta"))
with open(out, "w") as out_handle:
    count = SeqIO.write(records, out_handle, "fasta")
print ("Saved %i records" % count)

# clade dataset concatenation

In [None]:
# download files from GCP
download = '/home/diana_ir/Downloads/'
clade_01 = 'diana_sandbox/Nextstrain/nextclade_out/group_1_nextclade.csv'
clade_02 = 'diana_sandbox/Nextstrain/nextclade_out/group_2_nextclade.csv'
clade_03 = 'diana_sandbox/Nextstrain/nextclade_out/group_3_nextclade.csv'
clade_04 = 'diana_sandbox/Nextstrain/nextclade_out/group_4_nextclade.csv'
clade_05 = 'diana_sandbox/Nextstrain/nextclade_out/group_5_nextclade.csv'
clade_06 = 'diana_sandbox/Nextstrain/nextclade_out/group_6_nextclade.csv'

!gsutil -m cp -r gs://{clade_01} {download}
!gsutil -m cp -r gs://{clade_02} {download}
!gsutil -m cp -r gs://{clade_03} {download}
!gsutil -m cp -r gs://{clade_04} {download}
!gsutil -m cp -r gs://{clade_05} {download}
!gsutil -m cp -r gs://{clade_06} {download}


pangolin_01 = 'diana_sandbox/Nextstrain/pangolin_out/group_1_pangolin_lineage_report.csv'
pangolin_02 = 'diana_sandbox/Nextstrain/pangolin_out/group_2_pangolin_lineage_report.csv'
pangolin_03 = 'diana_sandbox/Nextstrain/pangolin_out/group_3_pangolin_lineage_report.csv'
pangolin_04 = 'diana_sandbox/Nextstrain/pangolin_out/group_4_pangolin_lineage_report.csv'
pangolin_05 = 'diana_sandbox/Nextstrain/pangolin_out/group_5_pangolin_lineage_report.csv'
pangolin_06 = 'diana_sandbox/Nextstrain/pangolin_out/group_6_pangolin_lineage_report.csv'


!gsutil -m cp -r gs://{pangolin_01} {download}
!gsutil -m cp -r gs://{pangolin_02} {download}
!gsutil -m cp -r gs://{pangolin_03} {download}
!gsutil -m cp -r gs://{pangolin_04} {download}
!gsutil -m cp -r gs://{pangolin_05} {download}
!gsutil -m cp -r gs://{pangolin_06} {download}



In [None]:
# pathway to the nextclade files (groups 1 to 6)
nextclade_paths = ["/home/diana_ir/Downloads/group_%d_nextclade.csv" % group
                   for group in range(1, 7)]

# read every group file as tab-separated and stack them in group order
clade = pd.concat([pd.read_csv(group_path, sep ='\t', dtype = {'accession_id' : object})
                   for group_path in nextclade_paths])

# save the concatenated nextclade table
outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/clade_%s.csv" % (current_date)
clade.to_csv(outfile,  index = False)


# pangolin dataset concatenation

In [None]:
# pathway to the pangolin files (groups 1 to 6)
pangolin_paths = ["/home/diana_ir/Downloads/group_%d_pangolin_lineage_report.csv" % group
                  for group in range(1, 7)]

# read every group file as tab-separated and stack them in group order
pangolin = pd.concat([pd.read_csv(group_path, sep ='\t', dtype = {'accession_id' : object})
                      for group_path in pangolin_paths])

# save the concatenated pangolin table
outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/pangolin_%s.csv" % (current_date)
pangolin.to_csv(outfile, index = False)

# pangolin and clade dataset merge with cdphe metadata strain dataset and adding the "required columns - strain, region, country, division, location" to the metadata, remove duplication

In [None]:
# read in the data
# pangolin: the lineage column is renamed to "pangolin" and the table is indexed by taxon
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/pangolin_%s.csv' % (current_date)
pangolin = (
    pd.read_csv(path, dtype = {'taxon' : object})
    .rename(columns = {'lineage' : 'pangolin'})
    .set_index('taxon')
)

# nextclade: indexed by seqName
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/clade_%s.csv' % (current_date)
nextclade = pd.read_csv(path, dtype = {'seqName' : object}).set_index('seqName')

# metadata
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/filtered_assembly_metrics_%s.csv' % (current_date)
df = pd.read_csv(path, dtype = {'accession_id' : object})

# renaming the accession_id to "strain"
def create_strain_name(accession_id):
    """Return the strain name for an accession id: 'CO-CDPHE-' followed by the id."""
    prefix = 'CO-CDPHE-'
    return prefix + str(accession_id)

# build the strain name of every row and make it the first column
strain = df.apply(lambda sample: create_strain_name(sample.accession_id), axis = 1)
df.insert(loc = 0, value = strain, column = 'strain')

# adding in the necessary columns
df = df.assign(
    region = 'North America',
    country = 'USA',
    division = 'Colorado',
    location = '',
)

# index by strain (kept as a column too) so the joins below align on it
df = df.set_index('strain', drop=False)
print(df.shape)

# join all pangolin and nextclade
j = df.join(pangolin.pangolin, how = 'left')
k = j.join(nextclade.clade, how = 'left')

# drop duplicates: keep the first row of every strain
k = k.drop_duplicates(subset = 'strain', keep = 'first')
print(k.shape)

# save output file
outfile = os.path.join(out_data_dir, 'cdphe_metadata_%s.tsv' % current_date)
k.to_csv(outfile, index = False, sep = '\t')

# merge horizon data with seq + pangolin + clade dataset

In [None]:
# Path of the merged metadata table. `metadata` must stay a path string: the
# horizon scripts receive it on the command line through `-i {metadata}`.
metadata = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/cdphe_metadata_%s.tsv" % (current_date)
# Load the table under a separate name (it is tab-separated) so the path above
# is not overwritten; this also fails early if the file is missing.
metadata_df = pd.read_csv(metadata, sep = '\t', dtype = {'accession_id' : object})
# horizon exports passed to the scripts as --varseq and --pui
varseq = "/home/diana_ir/Desktop/summary_results/cv_var_seq_bioinformatics 2022-04-14.csv"
bicovid = "/home/diana_ir/Desktop/summary_results/bi_covid_bioinformatics 2021-09-22.csv"
# output directory passed to the scripts as -o
output = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/"

In [None]:
# merge horizon with internal dataset
# runs horizon_internal_2.py from the scripts directory with the metadata (-i),
# varseq (--varseq), bicovid (--pui) and output (-o) variables of this notebook
# NOTE(review): {metadata} has to expand to a file path here -- confirm it is not a DataFrame
!{scripts}/horizon_internal_2.py -i {metadata} --varseq {varseq} --pui {bicovid} -o {output}

In [None]:
# merge horizon with external dataset
# runs horizon_external_2.py from the scripts directory with the metadata (-i),
# varseq (--varseq), bicovid (--pui) and output (-o) variables of this notebook
# NOTE(review): {metadata} has to expand to a file path here -- confirm it is not a DataFrame
!{scripts}/horizon_external_2.py -i {metadata} --varseq {varseq} --pui {bicovid} -o {output}

# filtering concat files to (3 mo prior) and (6 mo prior) date

In [None]:
# samples dated on or after cutoff_date form the recent ("3 mo") set; samples dated on
# or before it form the pool from which the random background sample is drawn
cutoff_date = "2022-02-01"
# maximum number of older samples drawn at random
n_random_samples = 20000

# **************internal build files********************

path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal.tsv'
internal = pd.read_csv(path, sep='\t', dtype = {'accession_id' : object})
print("original", internal.shape)

# drop "TMI" columns
drop_col = ['first_name', 'last_name', 'dob']
internal = internal.drop(columns=drop_col)

#  3 mo prior from "3 mo prior from current date"
internal['date2'] = pd.to_datetime(internal['date'])
mask = (internal['date2'] <= cutoff_date)
before = internal.loc[mask]

# random sample of the older rows, capped at the number of rows available
# (sample() raises when asked for more rows than exist)
before = before.sample(n = min(n_random_samples, len(before)))

# 3 mo prior from current date
mask = (internal['date2'] >= cutoff_date)
three_in = internal.loc[mask]

# merge 3 mo with the random samples
# reset_index() keeps the old index as an "index" column in the merged table
three_in_merged = pd.concat([three_in, before])
three_in_merged = three_in_merged.reset_index()
print("merged", three_in_merged.shape)

# remove duplicates (rows dated exactly cutoff_date can be in both parts)
three_in_merged = three_in_merged.drop_duplicates(subset = 'strain', keep = 'first')
print("duplications", three_in_merged.shape)


# **************internal build final files********************

outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal-before.tsv"
before.to_csv(outfile, sep='\t', index = False)

outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal-3m.tsv"
three_in.to_csv(outfile, sep='\t', index = False)

outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal-3m-merged.tsv"
three_in_merged.to_csv(outfile, sep='\t', index = False)


# **************external build files********************

path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-external.tsv'
external = pd.read_csv(path, sep='\t', dtype = {'accession_id' : object})

drop_col = ['first_name', 'last_name', 'dob']
external = external.drop(columns=drop_col)

#  3 mo prior from "3 mo prior from current date"
external['date2'] = pd.to_datetime(external['date'])
mask = (external['date2'] <= cutoff_date)
prev = external.loc[mask]

# random sample of the older rows, capped at the number of rows available
prev = prev.sample(n = min(n_random_samples, len(prev)))


# 3 mo
mask = (external['date2'] >= cutoff_date)
three_ex = external.loc[mask]

# merge 3 mo with the random samples
# the external set is combined with the EXTERNAL background sample (prev),
# not with the internal one (before)
three_ex_merged = pd.concat([three_ex, prev])
three_ex_merged = three_ex_merged.reset_index()

# remove duplicates
three_ex_merged = three_ex_merged.drop_duplicates(subset = 'strain', keep = 'first')
print(three_ex_merged.shape)


# **************external build final files********************

outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-external-prev.tsv"
prev.to_csv(outfile, sep='\t', index = False)

outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-external-3m.tsv"
three_ex.to_csv(outfile, sep='\t', index = False)

outfile = "/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-external-3m-merged.tsv"
three_ex_merged.to_csv(outfile, sep='\t', index = False)

# filter cov samples to internal 3 months and pull fasta files in together

In [None]:
# pulling metadata-internal-3m-merged fasta files together
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal-3m-merged.tsv'
metadata = pd.read_csv(path, sep='\t', dtype = {'accession_id' : object})
print (metadata.shape)

# filter to samples with >90% coverage
metadata = metadata[metadata.percent_non_ambigous_bases >= 90]
print(metadata.shape)
metadata = metadata.reset_index(drop = True)

# using the filtered assembly metrics we want to pull out the internal fasta files for those sequences
# start from a clean output file, because records are appended one by one below
out_fasta = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal-3m-merged_sequence_data.fasta'
if os.path.isfile(out_fasta):
    os.remove(out_fasta)

for accession_id, seq_run in metadata[['accession_id', 'seq_run']].itertuples(index = False):
    fasta_file_path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/fasta_files/%s/%s_consensus_renamed.fa' % (seq_run, accession_id)

    print('adding %s_consensus_renamed.fa from %s to concatenated fasta file' % (accession_id, seq_run))
    record = SeqIO.read(fasta_file_path, 'fasta')
    with open(out_fasta, 'a') as out_handle:
        SeqIO.write(record, out_handle, 'fasta')


# filter external 3-month samples by coverage and concatenate their fasta files

In [None]:
# Build one multi-FASTA for the external 3-month merged build: read the merged
# metadata table, keep the samples that pass the coverage cutoff, and copy each
# sample's consensus sequence into a single output file.
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-external-3m-merged.tsv'
# accession_id is read as object so the IDs are not parsed as numbers
metadata = pd.read_csv(path, sep='\t', dtype={'accession_id': object})
print(metadata.shape)


# keep samples with at least 90% non-ambiguous bases (the cutoff is inclusive)
crit = metadata.percent_non_ambigous_bases >= 90
metadata = metadata[crit]
print(metadata.shape)
metadata = metadata.reset_index(drop=True)

# using the filtered assembly metrics, pull the fasta file of every remaining sample
out_fasta = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-external-3m-merged_sequence_data.fasta'

# Open the output once in write mode: this truncates whatever a previous run
# left behind and avoids reopening the file for every single record.
with open(out_fasta, 'w') as out_handle:
    for accession_id, seq_run in zip(metadata.accession_id, metadata.seq_run):
        fasta_file_path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/fasta_files/%s/%s_consensus_renamed.fa' % (seq_run, accession_id)

        print('adding %s_consensus_renamed.fa from %s to concatenated fasta file' % (accession_id, seq_run))
        # SeqIO.read expects exactly one record in the file and raises otherwise;
        # a missing or malformed consensus file therefore stops the cell
        record = SeqIO.read(fasta_file_path, 'fasta')
        SeqIO.write(record, out_handle, 'fasta')


# filter samples to only delta samples

In [None]:
# Derive the delta-only internal metadata table from the full internal table.
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal.tsv'
delta = pd.read_csv(path, sep="\t", dtype={'accession_id': object})
print("original", delta.shape)

# Keep rows whose `date` falls on or after the fixed 2021-09-01 cutoff.
# The original note gives the reason: NEXTSTRAIN can't handle >50,000 samples in a build.
# date2 is the parsed copy of `date`; it stays in the table that is saved below.
delta['date2'] = pd.to_datetime(delta['date'])
mask = delta['date2'] >= "2021-09-01"
filtered = delta[mask]
print("filtered date", filtered.shape)

# delta rows are B.1.617.2 itself plus every lineage whose name contains "AY";
# comparing with True turns missing lineage values into False
a01 = filtered.pangolin.eq('B.1.617.2')
a02 = filtered["pangolin"].str.contains("AY").eq(True)
df_filtered = filtered.loc[a01 | a02]
print("delta filtered", df_filtered.shape)

# write the delta subset as a tab-separated file without the row index
outfile = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/delta_metadata-internal.tsv'
df_filtered.to_csv(outfile, sep='\t', index=False)
print(df_filtered.shape)

# filter internal delta-variant samples by coverage and concatenate their fasta files

In [None]:
# Build one multi-FASTA for the delta build: read the delta metadata table,
# drop repeated strains, keep the samples that pass the coverage cutoff, and
# copy each sample's consensus sequence into a single output file.
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/delta_metadata-internal.tsv'
# accession_id is read as object so the IDs are not parsed as numbers
metadata = pd.read_csv(path, sep='\t', dtype={'accession_id': object})

# remove duplicates: only the first row of each strain is kept
metadata = metadata.drop_duplicates(subset='strain', keep='first')
print(metadata.shape)

# keep samples with at least 90% non-ambiguous bases (the cutoff is inclusive)
crit = metadata.percent_non_ambigous_bases >= 90
metadata = metadata[crit]
print(metadata.shape)
metadata = metadata.reset_index(drop=True)

# using the filtered assembly metrics, pull the fasta file of every remaining sample
out_fasta = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/delta_sequence_data.fasta'

# Open the output once in write mode: this truncates whatever a previous run
# left behind and avoids reopening the file for every single record.
with open(out_fasta, 'w') as out_handle:
    for accession_id, seq_run in zip(metadata.accession_id, metadata.seq_run):
        fasta_file_path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/fasta_files/%s/%s_consensus_renamed.fa' % (seq_run, accession_id)

        print('adding %s_consensus_renamed.fa from %s to concatenated fasta file' % (accession_id, seq_run))
        # SeqIO.read expects exactly one record in the file and raises otherwise;
        # a missing or malformed consensus file therefore stops the cell
        record = SeqIO.read(fasta_file_path, 'fasta')
        SeqIO.write(record, out_handle, 'fasta')


# filter samples to only omicron samples

In [None]:
# Derive the omicron-only internal metadata table from the full internal table.
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/metadata-internal.tsv'
omicron = pd.read_csv(path, sep="\t", dtype={'accession_id': object})
print("original", omicron.shape)

# Keep rows whose `date` falls on or after the fixed 2022-01-01 cutoff.
# The original note gives the reason: NEXTSTRAIN can't handle >50,000 samples in a build.
# date2 is the parsed copy of `date`; it stays in the table that is saved below.
omicron['date2'] = pd.to_datetime(omicron['date'])
mask = (omicron['date2'] >= "2022-01-01")
filtered = omicron.loc[mask]
print("filtered date", filtered.shape)

# omicron rows are the lineages whose name contains "BA";
# comparing with True turns missing lineage values into False
# NOTE(review): a sample labelled exactly 'B.1.1.529' does not contain "BA" and
# is therefore excluded, unlike the delta cell which keeps its parent lineage
# 'B.1.617.2' explicitly - confirm this is intended
a01 = filtered["pangolin"].str.contains("BA") == True

df_filtered = filtered[a01]
# report the size only, as the delta cell does, instead of dumping the whole table
print("filtered to omicron", df_filtered.shape)


# write the omicron subset as a tab-separated file without the row index
outfile = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/omicron_metadata-internal.tsv'
df_filtered.to_csv(outfile, index=False, sep='\t')
print(df_filtered.shape)

# filter internal omicron-variant samples by coverage and concatenate their fasta files

In [None]:
# Build one multi-FASTA for the omicron build: read the omicron metadata table,
# drop repeated strains, keep the samples that pass the coverage cutoff, and
# copy each sample's consensus sequence into a single output file.
path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/omicron_metadata-internal.tsv'
# accession_id is read as object so the IDs are not parsed as numbers
metadata = pd.read_csv(path, sep='\t', dtype={'accession_id': object})

# remove duplicates, as the delta cell does: only the first row of each strain
# is kept so that no strain is written to the FASTA more than once
metadata = metadata.drop_duplicates(subset='strain', keep='first')
print(metadata.shape)

# keep samples with at least 90% non-ambiguous bases (the cutoff is inclusive)
crit = metadata.percent_non_ambigous_bases >= 90
metadata = metadata[crit]
print(metadata.shape)
metadata = metadata.reset_index(drop=True)


# using the filtered assembly metrics, pull the fasta file of every remaining sample
out_fasta = '/home/diana_ir/Documents/nextstrain/covid_sequencing/out_data/omicron_sequence_data.fasta'

# Open the output once in write mode: this truncates whatever a previous run
# left behind and avoids reopening the file for every single record.
with open(out_fasta, 'w') as out_handle:
    for accession_id, seq_run in zip(metadata.accession_id, metadata.seq_run):
        fasta_file_path = '/home/diana_ir/Documents/nextstrain/covid_sequencing/fasta_files/%s/%s_consensus_renamed.fa' % (seq_run, accession_id)

        print('adding %s_consensus_renamed.fa from %s to concatenated fasta file' % (accession_id, seq_run))
        # SeqIO.read expects exactly one record in the file and raises otherwise;
        # a missing or malformed consensus file therefore stops the cell
        record = SeqIO.read(fasta_file_path, 'fasta')
        SeqIO.write(record, out_handle, 'fasta')
