In [5]:
import os
import pandas as pd
import dendropy
import csv
from Bio import SeqIO
import numpy as np
import json
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

### Split AZ population into strains with the chromosome III locus present or with the locus absent

In [6]:
# Load the strain -> population mapping and pick out the AZ population.
# The context manager closes the JSON file as soon as it has been parsed.
with open('../isolate_and_pop_info/pop_dict.json') as pop_file:
    pop_dict = json.load(pop_file)

# Strains whose population label is exactly 'AZ'.
AZ_strains = [strain for strain, pop in pop_dict.items() if pop == 'AZ']
len(AZ_strains)

29

In [7]:
# Save the AZ strain names to a plain-text list, one name per line.
with open('AZ_strains.txt', 'w') as out_handle:
    out_handle.writelines(name + '\n' for name in AZ_strains)

In [8]:
# AZ strains in which the locus is present.
AZ_with = [
    "Phoenix_1", "Phoenix_3", "Phoenix_6", "Tucson_14", "Phoenix_4",
    "Tucson_12", "Tucson_1", "Tucson_18", "Tucson_17", "Tucson_10",
]

# AZ strains in which the locus is absent.
AZ_without = [
    "GT162_USA", "Colorado_Springs_1", "Tucson_9", "Phoenix_7", "Tucson_21",
    "Phoenix_2", "Tucson_19", "Phoenix_9", "B10757_Nevada", "Tucson_23",
    "Phoenix_5", "Phoenix_8", "Tucson_15", "Tucson_3", "Tucson_6",
    "Tucson_5", "Tucson_8", "Tucson_13", "Tucson_22",
]

# Write each group to its own list file, one strain name per line.
for list_path, group in (
    ('AZ_strains_locus_present.txt', AZ_with),
    ('AZ_strains_locus_absent.txt', AZ_without),
):
    with open(list_path, 'w') as out_handle:
        out_handle.writelines(name + '\n' for name in group)

### Calculate Tajima's D

In [24]:
## Tajima's D for strains with the locus present
#
# vcftools options used below:
#   --TajimaD 250      Tajima's D in 250 bp bins
#   --max-missing 1.0  keep only sites with no missing genotypes
#                      (per the vcftools manual, 1 = no missing data allowed)
#   --keep FILE        restrict the analysis to the strains listed in FILE
#   --out PREFIX       output prefix; the result is read back later from
#                      PREFIX.Tajima.D

vcf = '../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf'

!vcftools --vcf {vcf} --TajimaD 250 --max-missing 1.0 --keep AZ_strains_locus_present.txt --out AZ_strains_locus_present_250bp_window



VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf ../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf
	--keep AZ_strains_locus_present.txt
	--max-missing 1
	--out AZ_strains_locus_present_250bp_window
	--TajimaD 250

Keeping individuals in 'keep' list
After filtering, kept 10 out of 61 Individuals
Outputting Tajima's D Statistic...
	TajimaD: Only using bialleleic sites.
After filtering, kept 404632 out of a possible 1012365 Sites
Run Time = 16.00 seconds


In [25]:
## Tajima's D for strains with the locus absent
#
# Same vcftools call as for the locus-present group, but restricted (--keep)
# to the locus-absent strain list: 250 bp bins (--TajimaD 250) and only sites
# with no missing genotypes (--max-missing 1.0).
# The result is read back later from <--out prefix>.Tajima.D.

vcf = '../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf'

!vcftools --vcf {vcf} --TajimaD 250 --max-missing 1.0 --keep AZ_strains_locus_absent.txt --out AZ_strains_locus_absent_250bp_window



VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf ../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf
	--keep AZ_strains_locus_absent.txt
	--max-missing 1
	--out AZ_strains_locus_absent_250bp_window
	--TajimaD 250

Keeping individuals in 'keep' list
After filtering, kept 19 out of 61 Individuals
Outputting Tajima's D Statistic...
	TajimaD: Only using bialleleic sites.
After filtering, kept 360743 out of a possible 1012365 Sites
Run Time = 16.00 seconds


In [26]:
# Load the tab-separated Tajima's D tables written by vcftools for the two
# groups, discarding the per-bin SNP count column.
td_present, td_absent = (
    pd.read_csv(table_path, sep='\t').drop(columns='N_SNPS')
    for table_path in (
        'AZ_strains_locus_present_250bp_window.Tajima.D',
        'AZ_strains_locus_absent_250bp_window.Tajima.D',
    )
)

In [27]:
# Put both groups' Tajima's D values side by side, keyed on (CHROM, BIN_START).
# The outer join keeps bins that appear in only one of the two tables.
# Adding 1 moves BIN_START onto the same grid as the pi and FST tables
# (e.g. 501), which the later merge on BIN_START relies on.
td = (
    td_present
    .merge(
        td_absent,
        on=['CHROM', 'BIN_START'],
        how='outer',
        suffixes=['_present', '_absent'],
    )
    .assign(BIN_START=lambda frame: frame['BIN_START'] + 1)
)

td.head()

Unnamed: 0,CHROM,BIN_START,TajimaD_present,TajimaD_absent
0,CP075068.1,501,-0.59155,
1,CP075068.1,751,,-0.825509
2,CP075068.1,1001,-0.768572,
3,CP075068.1,1251,,
4,CP075068.1,1501,,-1.0906


### Calculate FST

In [28]:
# Windowed Weir & Cockerham FST between the locus-present and locus-absent
# groups.
#
# vcftools options used below:
#   --fst-window-size 250 / --fst-window-step 250
#                        250 bp windows advanced by 250 bp (non-overlapping)
#   --weir-fst-pop FILE  one strain list per population being compared
#   --max-missing 1.0    keep only sites with no missing genotypes
#   --out PREFIX         the result is read back later from
#                        PREFIX.windowed.weir.fst
vcf = '../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf'

!vcftools --vcf {vcf} --fst-window-size 250 --fst-window-step 250 --max-missing 1.0 --weir-fst-pop  AZ_strains_locus_present.txt --weir-fst-pop  AZ_strains_locus_absent.txt --out AZ_strains_250bp_window


VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf ../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf
	--fst-window-size 250
	--fst-window-step 250
	--weir-fst-pop AZ_strains_locus_present.txt
	--weir-fst-pop AZ_strains_locus_absent.txt
	--keep AZ_strains_locus_present.txt
	--keep AZ_strains_locus_absent.txt
	--max-missing 1
	--out AZ_strains_250bp_window

Keeping individuals in 'keep' list
After filtering, kept 29 out of 61 Individuals
Outputting Windowed Weir and Cockerham Fst estimates.
Weir and Cockerham mean Fst estimate: -0.0040478
Weir and Cockerham weighted Fst estimate: 0.0019009
After filtering, kept 339399 out of a possible 1012365 Sites
Run Time = 17.00 seconds


In [29]:
# Load the windowed FST table and keep only CHROM, BIN_START and WEIGHTED_FST.
unused_fst_columns = ['BIN_END', 'MEAN_FST', 'N_VARIANTS']
fst = pd.read_csv('AZ_strains_250bp_window.windowed.weir.fst', sep='\t')
fst = fst.drop(columns=unused_fst_columns)
fst.head()

Unnamed: 0,CHROM,BIN_START,WEIGHTED_FST
0,CP075068.1,751,-0.038251
1,CP075068.1,1501,-0.038251
2,CP075068.1,2251,0.068627
3,CP075068.1,4501,-0.038251
4,CP075068.1,66251,-0.019324


### Calculate pi

In [30]:
# Windowed nucleotide diversity (pi) for strains with the locus present.
#
# vcftools options used below:
#   --window-pi 250 / --window-pi-step 250
#                      250 bp windows advanced by 250 bp (non-overlapping)
#   --max-missing 1.0  keep only sites with no missing genotypes
#   --keep FILE        restrict the analysis to the strains listed in FILE
#   --out PREFIX       the result is read back later from PREFIX.windowed.pi
vcf = '../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf'

!vcftools --vcf {vcf} --window-pi 250 --window-pi-step 250 --max-missing 1.0 --keep AZ_strains_locus_present.txt --out AZ_strains_locus_present_250bp_window



VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf ../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf
	--keep AZ_strains_locus_present.txt
	--max-missing 1
	--out AZ_strains_locus_present_250bp_window
	--window-pi 250
	--window-pi-step 250

Keeping individuals in 'keep' list
After filtering, kept 10 out of 61 Individuals
Outputting Windowed Nucleotide Diversity Statistics...
After filtering, kept 404632 out of a possible 1012365 Sites
Run Time = 16.00 seconds


In [31]:
# Windowed nucleotide diversity (pi) for strains with the locus absent.
# Same settings as for the locus-present group (250 bp windows, 250 bp step,
# no missing genotypes allowed), restricted via --keep to the locus-absent
# strain list. The result is read back later from <--out prefix>.windowed.pi.
vcf = '../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf'

!vcftools --vcf {vcf} --window-pi 250 --window-pi-step 250 --max-missing 1.0 --keep AZ_strains_locus_absent.txt --out AZ_strains_locus_absent_250bp_window



VCFtools - 0.1.17
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf ../isolate_and_pop_info/all_strains_variant_sites.renamed.vcf
	--keep AZ_strains_locus_absent.txt
	--max-missing 1
	--out AZ_strains_locus_absent_250bp_window
	--window-pi 250
	--window-pi-step 250

Keeping individuals in 'keep' list
After filtering, kept 19 out of 61 Individuals
Outputting Windowed Nucleotide Diversity Statistics...
After filtering, kept 360743 out of a possible 1012365 Sites
Run Time = 16.00 seconds


In [32]:
# Load the tab-separated windowed-pi tables for the two groups, discarding
# the window end and variant count columns.
pi_present, pi_absent = (
    pd.read_csv(table_path, sep='\t').drop(columns=['BIN_END', 'N_VARIANTS'])
    for table_path in (
        'AZ_strains_locus_present_250bp_window.windowed.pi',
        'AZ_strains_locus_absent_250bp_window.windowed.pi',
    )
)


In [33]:
# Put both groups' pi values side by side, keyed on (CHROM, BIN_START).
# The outer join keeps windows that appear in only one of the two tables.
pi = pd.merge(
    pi_present,
    pi_absent,
    on=['CHROM', 'BIN_START'],
    how='outer',
    suffixes=['_present', '_absent'],
)
pi.head()

Unnamed: 0,CHROM,BIN_START,PI_present,PI_absent
0,CP075068.1,501,0.000758,
1,CP075068.1,1001,0.001516,
2,CP075068.1,2251,0.000758,
3,CP075068.1,5251,0.000758,
4,CP075068.1,5501,0.000758,


### Merge all

In [34]:
# Combine Tajima's D, pi and FST into one table with one row per
# (CHROM, BIN_START) window. Outer joins keep windows that are missing from
# any individual statistic; those cells are NaN.
#
# After sorting, the index is reset so that row labels are exactly 0..n-1 in
# genomic order. The resampling cells further down take runs of neighbouring
# rows starting from a sampled index label, which is only meaningful when
# consecutive labels are consecutive windows.
merged = (
    td
    .merge(pi, on=['CHROM', 'BIN_START'], how='outer')
    .merge(fst, on=['CHROM', 'BIN_START'], how='outer')
    .sort_values(['CHROM', 'BIN_START'])
    .reset_index(drop=True)
)
merged

Unnamed: 0,CHROM,BIN_START,TajimaD_present,TajimaD_absent,PI_present,PI_absent,WEIGHTED_FST
0,CP075068.1,501,-0.591550,,0.000758,,
1,CP075068.1,751,,-0.825509,,0.000410,-0.038251
2,CP075068.1,1001,-0.768572,,0.001516,,
3,CP075068.1,1251,,,,,
4,CP075068.1,1501,,-1.090600,,0.000819,-0.038251
...,...,...,...,...,...,...,...
111169,CP075072.1,1488001,,,,,
111170,CP075072.1,1488251,-0.591550,,0.000758,,
111171,CP075072.1,1488501,0.351953,,0.001347,,
111172,CP075072.1,1488751,,-0.694900,,0.001957,0.007223


In [35]:
# Save the merged per-window statistics. With to_csv's defaults the row index
# is written as well, as the first (unnamed) column.
merged.to_csv('../tables/TableS5_present_absent_stats.csv')

In [36]:
# Window coordinates bounding the region of interest on CP075070.1.
# NOTE(review): presumably the BIN_START of the first and last windows of the
# locus -- confirm against the locus annotation.
start = 1237751
stop = 1428001

on_locus_chrom = merged['CHROM'] == 'CP075070.1'
bin_start = merged['BIN_START']

# Upstream flank: windows starting from 5000 bp before `start` up to and
# including the window that begins at `start`.
merged_upstream = merged[on_locus_chrom & bin_start.between(start - 5000, start + 249)]

# Downstream flank: windows starting from 250 bp before `stop` up to (but not
# including) 5000 bp after `stop`.
merged_downstream = merged[on_locus_chrom & bin_start.between(stop - 250, stop + 4999)]


In [38]:
# Empirical resampling test for the flank upstream of the locus.
#
# For each statistic, the most extreme value observed in `merged_upstream` is
# used as the threshold: the maximum for pi and FST, the minimum for
# Tajima's D. A run of neighbouring rows of `merged` is then drawn at random
# `n_draws` times, and the p-value is the fraction of draws that contain at
# least one window at least as extreme as the threshold. NaN cells never
# count as extreme because comparisons with NaN are False.
#
# NOTE(review): a drawn run is simply consecutive rows of `merged`, so it can
# straddle two chromosomes or be cut short at the end of the table -- confirm
# that this is acceptable for the null distribution.

# DataFrame.sample() uses NumPy's global random state when no random_state is
# passed, so this seed fixes every draw below.
np.random.seed(400)

n_draws = 10000
# Twenty 250 bp steps (5000 bp) plus the starting window = 21 rows per draw.
rows_per_draw = 5000 // 250 + 1

p_dict = {}

# (statistic, which extreme of the flank serves as the threshold)
upstream_tests = [
    ('PI_present', 'max'),
    ('PI_absent', 'max'),
    ('WEIGHTED_FST', 'max'),
    ('TajimaD_present', 'min'),
    ('TajimaD_absent', 'min'),
]

for stat, extreme in upstream_tests:
    flank_values = merged_upstream[stat]
    threshold = flank_values.max() if extreme == 'max' else flank_values.min()
    genome_values = merged[stat]

    hits = 0
    for _ in range(n_draws):
        # Exactly one merged.sample() call per draw, in the same order as
        # before, so the random stream (and the p-values) are unchanged.
        # The sampled label is converted to a row position so the run is
        # always made of physically adjacent rows, whatever the index labels.
        first_row = merged.index.get_loc(merged.sample().index[0])
        drawn = genome_values.iloc[first_row:first_row + rows_per_draw]
        if extreme == 'max':
            extreme_found = (drawn >= float(threshold)).any()
        else:
            extreme_found = (drawn <= float(threshold)).any()
        hits += bool(extreme_found)

    p_dict[stat + '_upstream'] = [threshold, hits / n_draws]
    

In [39]:
# Empirical resampling test for the flank downstream of the locus.
#
# For each statistic, the most extreme value observed in `merged_downstream`
# is used as the threshold: the maximum for pi and FST, the minimum for
# Tajima's D. A run of neighbouring rows of `merged` is then drawn at random
# `n_draws` times, and the p-value is the fraction of draws that contain at
# least one window at least as extreme as the threshold. NaN cells never
# count as extreme because comparisons with NaN are False.
# Results are added to the existing `p_dict` under '<stat>_downstream'.
#
# NOTE(review): a drawn run is simply consecutive rows of `merged`, so it can
# straddle two chromosomes or be cut short at the end of the table -- confirm
# that this is acceptable for the null distribution.

# DataFrame.sample() uses NumPy's global random state when no random_state is
# passed, so this seed fixes every draw below.
np.random.seed(500)

n_draws = 10000
# Twenty 250 bp steps (5000 bp) plus the starting window = 21 rows per draw.
rows_per_draw = 5000 // 250 + 1

# (statistic, which extreme of the flank serves as the threshold)
downstream_tests = [
    ('PI_present', 'max'),
    ('PI_absent', 'max'),
    ('WEIGHTED_FST', 'max'),
    ('TajimaD_present', 'min'),
    ('TajimaD_absent', 'min'),
]

for stat, extreme in downstream_tests:
    flank_values = merged_downstream[stat]
    threshold = flank_values.max() if extreme == 'max' else flank_values.min()
    genome_values = merged[stat]

    hits = 0
    for _ in range(n_draws):
        # Exactly one merged.sample() call per draw, in the same order as
        # before, so the random stream (and the p-values) are unchanged.
        # The sampled label is converted to a row position so the run is
        # always made of physically adjacent rows, whatever the index labels.
        first_row = merged.index.get_loc(merged.sample().index[0])
        drawn = genome_values.iloc[first_row:first_row + rows_per_draw]
        if extreme == 'max':
            extreme_found = (drawn >= float(threshold)).any()
        else:
            extreme_found = (drawn <= float(threshold)).any()
        hits += bool(extreme_found)

    p_dict[stat + '_downstream'] = [threshold, hits / n_draws]
    

In [40]:
# Each entry maps '<statistic>_<flank>' to [threshold tested, empirical p-value].
p_dict

{'PI_present_upstream': [0.00614737, 0.2926],
 'PI_absent_upstream': [0.00505263, 0.4055],
 'WEIGHTED_FST_upstream': [0.0420787, 0.7775],
 'TajimaD_present_upstream': [-1.003, 0.0675],
 'TajimaD_absent_upstream': [-1.25802, 0.5038],
 'PI_present_downstream': [0.0048, 0.5239],
 'PI_absent_downstream': [0.00669132, 0.198],
 'WEIGHTED_FST_downstream': [0.0686275, 0.7124],
 'TajimaD_present_downstream': [-0.948945, 0.1454],
 'TajimaD_absent_downstream': [-1.37697, 0.2305]}

In [43]:
# Tabulate the resampling results: one row per '<statistic>_<flank>' key,
# with the threshold that was tested and its empirical p-value.
p_df = pd.DataFrame.from_dict(
    p_dict,
    orient='index',
    columns=['threshold_tested', 'p_val'],
)

In [44]:
# Display the threshold / p-value table (rows are indexed by test name).
p_df

Unnamed: 0,threshold_tested,p_val
PI_present_upstream,0.006147,0.2926
PI_absent_upstream,0.005053,0.4055
WEIGHTED_FST_upstream,0.042079,0.7775
TajimaD_present_upstream,-1.003,0.0675
TajimaD_absent_upstream,-1.25802,0.5038
PI_present_downstream,0.0048,0.5239
PI_absent_downstream,0.006691,0.198
WEIGHTED_FST_downstream,0.068627,0.7124
TajimaD_present_downstream,-0.948945,0.1454
TajimaD_absent_downstream,-1.37697,0.2305


In [45]:
# Save the p-value table. The index (the '<statistic>_<flank>' test names) is
# written as the first column under to_csv's defaults.
p_df.to_csv('../tables/TableS5_p_vals.csv')