In [23]:
import pandas as pd
from collections import Counter
import numpy as np
import json
pd.options.mode.chained_assignment = None  # default='warn'

### Table S3A: Combine CNV data from each strain into one dataframe

In [24]:
# Load the strain -> population mapping. Only strains that appear as keys in
# this mapping are carried through the rest of the notebook.
# The context manager makes sure the file handle is closed after reading.
with open('../isolate_and_pop_info/pop_dict.json') as pop_file:
    pop_dict = json.load(pop_file)

strains = list(pop_dict.keys())

In [25]:
cnv_files = !ls FREEC_output_250bp_window/*_CNVs

df = pd.DataFrame()

for i, f in enumerate(cnv_files):
    
    strain = f.replace('.CpSilv_core_only_bwa.sorted.bam_CNVs', '').replace('FREEC_output_250bp_window/','')
    
    if strain not in strains:
        continue
        
    temp = pd.read_csv(f, sep='\t', header=None,)
    temp.columns=['chrom', 'start','stop', 'CN', 'type']
    temp['strain'] = [strain for s in range(temp.shape[0])]

    df = df.append(temp)

In [27]:
# Write the combined per-strain CNV calls (Table S3A) without the row index.
df.to_csv(path_or_buf='../tables/TableS3A_allCNV.csv', index=False)

### Combine CNV info across strains

In [51]:
# IPython shell capture: list of `*_ratio.txt` paths in the FREEC output directory.
ratio_files = !ls FREEC_output_250bp_window/*_ratio.txt

In [52]:
# Build one wide table: one row per (Chromosome, Start) window and one
# copy-number column per strain.
# A None sentinel marks "no table yet". Testing the enumerate index against 0
# breaks when the first listed file is skipped by the strain filter, because
# `df` would then still hold whatever an earlier cell left in it.
merged_cn = None

for f in ratio_files:

    strain = f.replace('FREEC_output_250bp_window/', '').replace('.CpSilv_core_only_bwa.sorted.bam_ratio.txt', '')
    if strain not in strains:
        continue

    # Keep only the window coordinates and the copy-number call, and label
    # the copy-number column with the strain name.
    temp = pd.read_csv(f, sep='\t')[['Chromosome','Start','CopyNumber']]
    temp = temp.rename(columns={'CopyNumber':strain})

    if merged_cn is None:
        merged_cn = temp
    else:
        # Default inner join: only windows present for every strain are kept.
        merged_cn = pd.merge(merged_cn, temp, on=['Chromosome','Start'])

if merged_cn is None:
    raise ValueError('No *_ratio.txt file matched a strain listed in pop_dict')

df = merged_cn

In [53]:
# Preview the merged table: Chromosome, Start, then one copy-number column per strain.
df.head()

Unnamed: 0,Chromosome,Start,2566_Venezuela,3796_Venezuela,4545-MICE_Venezuela,730334_Guatemala,B10757_Nevada,B5773_Brazil,Coahuila_2,Colorado_Springs_1,...,Tucson_18,Tucson_19,Tucson_21,Tucson_22,Tucson_23,Tucson_3,Tucson_5,Tucson_6,Tucson_8,Tucson_9
0,CP075068.1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,CP075068.1,251,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,CP075068.1,501,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,CP075068.1,751,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,CP075068.1,1001,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Add gene info

In [54]:
# Load the gene annotation table, keeping only the GFF columns used below and
# giving them readable names.
gff_path = '../ref/genbank_files/CpSilv.genes_only.gff3'
gff_fields = {0: 'chrom', 2: 'type', 3: 'start', 4: 'stop', 6: 'direction', 8: 'annotation'}

gff = pd.read_csv(gff_path, sep='\t', header=None)[list(gff_fields)]
gff = gff.rename(columns=gff_fields)

# The gene name is the first `;`-separated attribute with its `ID=gene-` prefix removed.
first_attribute = gff['annotation'].str.split(';',expand=True)[0]
gff['name'] = first_attribute.str.replace('ID=gene-','')

# Restrict the annotation to the five listed sequence accessions.
kept_chroms = ['CP075068.1','CP075069.1','CP075070.1','CP075071.1','CP075072.1']
gff = gff[gff['chrom'].isin(kept_chroms)]

gff.head()

Unnamed: 0,chrom,type,start,stop,direction,annotation,name
0,CP075068.1,gene,57375,57980,-,ID=gene-D8B26_000001;Name=D8B26_000001;gbkey=G...,D8B26_000001
1,CP075068.1,gene,58131,58768,-,ID=gene-D8B26_000002;Name=D8B26_000002;gbkey=G...,D8B26_000002
2,CP075068.1,gene,58628,59933,-,ID=gene-D8B26_000003;Name=D8B26_000003;gbkey=G...,D8B26_000003
3,CP075068.1,gene,60054,62173,-,ID=gene-D8B26_000004;Name=D8B26_000004;gbkey=G...,D8B26_000004
4,CP075068.1,gene,126638,126720,+,ID=gene-D8B26_000005;Name=D8B26_000005;gbkey=G...,D8B26_000005


In [55]:
# Assign genes to copy-number windows: a (window, gene) pair is recorded when
# the two share at least `min_overlap_bp` bases.
window_size = 250
min_overlap_bp = 50   # minimum shared bases for a gene to count as overlapping a window
window_genes = []     # rows of [chrom, window_start, gene_name, gene_start, gene_stop]

for chrom in gff['chrom'].unique():

    print(chrom)

    chrom_genes = gff[gff['chrom'] == chrom]

    # Window coordinates for this chromosome as arrays, so every gene is
    # compared against all windows at once instead of rescanning them from the
    # chromosome start in a Python loop.
    window_starts = df.loc[df['Chromosome'] == chrom, 'Start'].to_numpy()
    # Windows start at 1, 251, 501, ... so a window covers
    # [start, start + window_size - 1] inclusive. Using start + window_size as
    # an inclusive end would count the first base of the next window too.
    window_stops = window_starts + window_size - 1

    for gene in chrom_genes.itertuples(index=False):

        # Inclusive overlap length per window; zero or negative means no overlap,
        # which also covers windows that start after the gene ends.
        overlap_bp = (np.minimum(window_stops, gene.stop)
                      - np.maximum(window_starts, gene.start) + 1)

        for window_start in window_starts[overlap_bp >= min_overlap_bp].tolist():
            window_genes.append([chrom, window_start, gene.name, gene.start, gene.stop])
                

CP075068.1
CP075069.1
CP075070.1
CP075071.1
CP075072.1


In [56]:
# Turn the (window, gene) pairs into a table and attach the per-strain copy
# numbers. The outer join keeps windows that overlap no gene (gene columns NaN).
gene_columns = ['Chromosome', 'Start', 'gene', 'gene_start', 'gene_stop']
window_gene_df = pd.DataFrame(window_genes, columns=gene_columns)

df = pd.merge(window_gene_df, df, how='outer', on=['Chromosome', 'Start'])
df = df.sort_values(['Chromosome','Start'])

In [57]:
# Preview the window table after attaching the gene columns.
df.head()

Unnamed: 0,Chromosome,Start,gene,gene_start,gene_stop,2566_Venezuela,3796_Venezuela,4545-MICE_Venezuela,730334_Guatemala,B10757_Nevada,...,Tucson_18,Tucson_19,Tucson_21,Tucson_22,Tucson_23,Tucson_3,Tucson_5,Tucson_6,Tucson_8,Tucson_9
68273,CP075068.1,1,,,,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
68274,CP075068.1,251,,,,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
68275,CP075068.1,501,,,,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
68276,CP075068.1,751,,,,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
68277,CP075068.1,1001,,,,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### Calculate Vst in each window

In [58]:
# Reload the strain -> population mapping; the context manager closes the
# file handle once the JSON has been read.
with open('../isolate_and_pop_info/pop_dict.json') as pop_file:
    pop_dict = json.load(pop_file)

strains = list(pop_dict.keys())
# Strain names ordered by their population label, so strains of the same
# population end up in adjacent columns.
sorted_strain_cols = sorted(pop_dict, key=pop_dict.get)

In [59]:
# Keep the first five (non-strain) columns in place and put the strain
# columns in population order.
leading_cols = list(df.columns[:5])
df = df[leading_cols + sorted_strain_cols]

In [60]:
# Preview the table after reordering the strain columns by population.
df.head()

Unnamed: 0,Chromosome,Start,gene,gene_start,gene_stop,Tucson_9,Phoenix_7,Tucson_21,Phoenix_2,Tucson_19,...,4545-MICE_Venezuela,3796_Venezuela,Nuevo_Leon_1,730334_Guatemala,Nuevo_Leon_2,San_Antonio_1,GT002_Texas,Coahuila_2,Sonora_1,B5773_Brazil
68273,CP075068.1,1,,,,1,1,1,1,1,...,1,1,1,1,1,2,1,1,1,1
68274,CP075068.1,251,,,,1,1,1,1,1,...,1,1,1,1,1,2,1,1,1,1
68275,CP075068.1,501,,,,1,1,1,1,1,...,1,1,1,1,1,2,1,1,1,1
68276,CP075068.1,751,,,,1,1,1,1,1,...,1,1,1,1,1,2,1,1,1,1
68277,CP075068.1,1001,,,,1,1,1,1,1,...,1,1,1,1,1,2,1,1,1,1


In [61]:
# For each population, calculate the copy-number mean and variance across the
# strains of that population, row by row.
pops = ['AZ','TXMXSA','CB']

# Explicit copy: new columns are added to var_df below, and it must not be a
# view-like slice of df (that would rely on the chained-assignment warning
# being silenced).
var_df = df[list(df.columns[:3])].copy()

for pop in pops:
    pop_strains = [s for s in strains if pop_dict[s]==pop]
    pop_cn = df[pop_strains]

    # Vectorized row-wise statistics. ddof=0 gives the population variance,
    # matching np.var's default.
    var_df['CN_MEAN_{}'.format(pop)] = pop_cn.mean(axis=1)
    var_df['VAR_{}'.format(pop)] = pop_cn.var(axis=1, ddof=0)
    var_df['N_{}'.format(pop)] = len(pop_strains)

In [62]:
#calculate VST
def calc_vst_3pops(v_pop1, v_pop2, v_pop3, n_pop1, n_pop2, n_pop3, v_total):
    """Return Vst for three populations.

    Vst = (v_total - vn) / v_total, where vn is the mean of the three
    within-population variances weighted by population size.

    Parameters
    ----------
    v_pop1, v_pop2, v_pop3 : within-population variances.
    n_pop1, n_pop2, n_pop3 : number of samples in each population.
    v_total : variance across all samples.

    Arguments may be scalars or element-wise compatible objects such as
    pandas Series. A v_total of zero is not guarded against.
    """
    weighted_var_sum = (v_pop1*n_pop1)+(v_pop2*n_pop2)+(v_pop3*n_pop3)
    n_total = n_pop1+n_pop2+n_pop3
    within_pop_var = weighted_var_sum/n_total
    return (v_total-within_pop_var)/v_total

pop1, pop2, pop3 = pops

# Variance across all strains per row. Vectorized, with ddof=0 to match
# np.var's default population variance.
VAR_total = df[strains].var(axis=1, ddof=0)

vst_column = 'VST_{}_{}_{}'.format(pop1, pop2, pop3)
var_df[vst_column] = calc_vst_3pops(
    var_df['VAR_'+pop1], var_df['VAR_'+pop2], var_df['VAR_'+pop3],
    var_df['N_'+pop1], var_df['N_'+pop2], var_df['N_'+pop3],
    VAR_total,
)

In [63]:
# Attach the per-window VST value to the full table.
# Both var_df and df can hold several rows for the same (Chromosome, Start)
# window - one per overlapping gene. Merging them directly on those two keys
# is many-to-many and would emit k*k rows for a window with k genes. VST
# depends only on the window's strain values, so keep one VST row per window
# before merging.
window_vst = (
    var_df[['Chromosome', 'Start', 'VST_AZ_TXMXSA_CB']]
    .drop_duplicates(subset=['Chromosome', 'Start'])
)

# validate= makes pandas raise if the left side is ever non-unique again.
df = window_vst.merge(df, on=['Chromosome', 'Start'], how='outer',
                      validate='one_to_many').sort_values(['Chromosome','Start'])
df = df.drop(columns=['gene_start', 'gene_stop'])

In [64]:
# Write the per-window VST table (Table S3B) without the row index.
df.to_csv(path_or_buf='../tables/TableS3B_VST.csv', index=False)