# Compare stats of ipa.popgen to vcftools

In [1]:
# conda install vcftools -c conda-forge -c bioconda
# conda install ipcoal -c conda-forge

In [2]:
import toytree
import ipcoal
# Show the installed ipcoal version; this notebook requires 0.1.6 or above.
print(ipcoal.__version__)

0.1.6


### SETUP

In [3]:
# Simulation settings read by the cells below.
NTIPS = 2  # number of tips, passed as ntips= to toytree.rtree.unittree
NDIPLOIDS = 5  # diploid individuals per tip; the model gets twice this as nsamples
TREEHEIGHT = 1e5  # passed as treeheight= to toytree.rtree.unittree
NE = 1e5  # passed as Ne= to ipcoal.Model
NLOCI = 10  # passed as nloci= to mod.sim_loci
NSITES = 500  # passed as nsites= to mod.sim_loci

### 1. Simulate a VCF with no missing data

In [4]:
# Build the tree and the coalescent model, then simulate the loci.
# nsamples is twice NDIPLOIDS so that write_vcf(diploid=True) can report
# NDIPLOIDS genotype columns per tip.
# NOTE(review): no seed is passed anywhere in this cell, so the simulated
# SNPs (and the numbers in every saved output below) presumably differ on
# each run -- consider fixing a seed; confirm the parameter name against
# the installed ipcoal version.
tree = toytree.rtree.unittree(
    ntips=NTIPS,
    treeheight=TREEHEIGHT,
)
mod = ipcoal.Model(
    tree=tree,
    Ne=NE,
    nsamples=2 * NDIPLOIDS,
)
mod.sim_loci(
    nloci=NLOCI,
    nsites=NSITES,
)

# Keep the VCF as a dataframe for inspection, and also write the same
# data to /tmp/test.vcf for vcftools.
vcfdf = mod.write_vcf(diploid=True)
mod.write_vcf(
    name="test",
    outdir="/tmp",
    diploid=True,
)

wrote 100 SNPs across 10 linkage blocks to /tmp/test.vcf


In [5]:
# Preview the first rows of the complete (unmasked) VCF table.
vcfdf.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,r0-0,r0-1,r0-2,r0-3,r0-4,r1-0,r1-1,r1-2,r1-3,r1-4
0,1,55,.,C,T,99,PASS,.,GT,0|0,0|0,0|1,1|1,0|0,0|0,0|0,0|0,0|0,0|0
1,1,59,.,G,T,99,PASS,.,GT,0|0,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0
2,1,99,.,C,G,99,PASS,.,GT,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0
3,1,111,.,T,A,99,PASS,.,GT,1|0,0|0,0|0,0|0,1|1,0|0,0|0,0|0,0|0,0|0
4,1,168,.,C,G,99,PASS,.,GT,0|0,0|0,0|1,0|1,0|0,0|0,0|0,0|0,0|0,0|0


### 2. Simulate VCF with missing data (coverage = 0.25)

In [6]:
# Mask the already-simulated data held by `mod` (coverage=0.25, applied
# per site), so the VCF below comes from the same simulation as above.
# NOTE(review): the saved preview of this table shows more than half of
# the genotypes as missing, which suggests `coverage` is the fraction
# retained rather than the fraction masked -- confirm against the ipcoal
# documentation.
# NOTE(review): the mask is applied to `mod` itself, so re-running the
# earlier write_vcf cell after this one would presumably no longer give
# the unmasked data.
mod.apply_missing_mask(
    coverage=0.25,
    coverage_type="site",
)

# Keep the masked VCF as a dataframe for inspection, and also write it to
# /tmp/test-miss.vcf for vcftools.
mvcfdf = mod.write_vcf(
    diploid=True,
    fill_missing_alleles=True,
)
mod.write_vcf(
    name="test-miss",
    outdir="/tmp",
    diploid=True,
    fill_missing_alleles=True,
)

wrote 53 SNPs across 10 linkage blocks to /tmp/test-miss.vcf


In [7]:
# Preview the first rows of the masked VCF table ('.|.' marks a missing genotype).
mvcfdf.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,r0-0,r0-1,r0-2,r0-3,r0-4,r1-0,r1-1,r1-2,r1-3,r1-4
0,1,99,.,C,G,99,PASS,.,GT,.|.,.|.,0|0,.|.,.|.,0|0,1|0,0|0,.|.,.|.
1,1,111,.,T,A,99,PASS,.,GT,.|.,.|.,.|.,.|.,1|1,0|0,0|0,0|0,.|.,0|0
2,1,216,.,A,C,99,PASS,.,GT,.|.,0|0,0|0,.|.,.|.,1|0,.|.,.|.,0|0,0|0
3,1,340,.,T,A,99,PASS,.,GT,.|.,0|0,.|.,.|.,0|0,1|1,.|.,.|.,.|.,1|1
4,1,486,.,T,C,99,PASS,.,GT,.|.,.|.,.|.,1|1,0|0,.|.,.|.,.|.,0|0,.|.


### Make population files

In [8]:
# Sample names for the first population (r0-0 ... r0-<NDIPLOIDS-1>),
# matching the r0-* genotype columns of the VCF. Written one name per
# line, with no trailing newline, to population-1.txt in the working
# directory.
pop1_names = [f"r0-{idx}" for idx in range(NDIPLOIDS)]
pop1_text = "\n".join(pop1_names)
with open("population-1.txt", "w") as out:
    out.write(pop1_text)

In [9]:
# Sample names for the second population (r1-0 ... r1-<NDIPLOIDS-1>),
# matching the r1-* genotype columns of the VCF. Written one name per
# line, with no trailing newline, to population-2.txt in the working
# directory.
pop2_names = [f"r1-{idx}" for idx in range(NDIPLOIDS)]
pop2_text = "\n".join(pop2_names)
with open("population-2.txt", "w") as out:
    out.write(pop2_text)

### Calculate Fst

In [10]:
%%bash

# Per-site Weir & Cockerham Fst between the two population files, run on
# the complete VCF written earlier to /tmp/test.vcf.
prefix=/tmp/fst

vcftools \
    --vcf /tmp/test.vcf \
    --weir-fst-pop population-1.txt \
    --weir-fst-pop population-2.txt \
    --max-missing 0.1 \
    --maf 0.1 \
    --out "${prefix}"

# Print the per-site table that vcftools wrote next to the prefix.
cat "${prefix}.weir.fst"

CHROM	POS	WEIR_AND_COCKERHAM_FST
1	55	0.166667
1	111	0.166667
1	340	0.166667
1	486	0.3125
2	99	0.25
2	143	0.166667
2	167	0.166667
2	212	0.25
2	220	-0.0416667
3	5	0.25
3	122	0.25
3	483	0.25
4	170	0.541667
4	328	0.541667
4	354	0.541667
4	356	0.541667
4	398	0.541667
5	389	0.375
6	6	-0.05
6	217	-0.0294118
6	224	0.375
6	234	0.166667
6	296	0.375
6	314	-0.05
7	7	0.3125
7	266	0.166667
7	270	-0.0576923
7	284	0.107143
7	476	0.107143
7	490	-0.05
8	94	1
8	140	0.541667
8	174	0.541667
8	184	1
8	192	0.642857
8	290	1
8	333	0.166667
9	259	0.45
9	358	0.166667
9	500	0.4
10	79	0.12037
10	120	0.12037
10	207	0.583333
10	213	0.12037
10	240	0.25
10	294	0.12037



VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf /tmp/test.vcf
	--weir-fst-pop population-1.txt
	--weir-fst-pop population-2.txt
	--keep population-1.txt
	--keep population-2.txt
	--maf 0.1
	--max-missing 0.1
	--out /tmp/fst

Keeping individuals in 'keep' list
After filtering, kept 10 out of 10 Individuals
Outputting Weir and Cockerham Fst estimates.
Weir and Cockerham mean Fst estimate: 0.30511
Weir and Cockerham weighted Fst estimate: 0.38479
After filtering, kept 46 out of a possible 100 Sites
Run Time = 0.00 seconds


In [11]:
%%bash

# Same Fst calculation as for the complete VCF, run on the masked VCF
# written earlier to /tmp/test-miss.vcf, with identical filters.
prefix=/tmp/fst-miss

vcftools \
    --vcf /tmp/test-miss.vcf \
    --weir-fst-pop population-1.txt \
    --weir-fst-pop population-2.txt \
    --max-missing 0.1 \
    --maf 0.1 \
    --out "${prefix}"

# Print the per-site table that vcftools wrote next to the prefix.
cat "${prefix}.weir.fst"

CHROM	POS	WEIR_AND_COCKERHAM_FST
1	99	-0.4
1	111	1
1	340	1
1	486	-1
2	99	8.32667e-17
2	111	-1
2	143	-0.2
2	167	-1
2	212	1
2	220	0.204082
3	5	0.368421
3	64	-nan
3	165	-1
3	310	8.32667e-17
3	455	0
4	170	0.368421
4	328	0.25
4	354	0
4	356	0.25
4	398	-0.5
4	474	0
5	289	0
5	356	8.32667e-17
6	6	-nan
6	29	-1
6	217	0
6	234	0
6	296	-1
6	314	-0.153846
7	452	0.6
7	476	-0.387755
7	490	-0.333333
8	94	1
8	140	-1
8	174	-nan
8	184	1
8	192	1
8	208	-1
8	290	1
8	404	-1
9	259	-nan
9	358	1
9	395	0
9	500	0.25
10	79	-0.440443
10	120	1
10	213	0.333333
10	240	0.578947
10	294	-1



VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf /tmp/test-miss.vcf
	--weir-fst-pop population-1.txt
	--weir-fst-pop population-2.txt
	--keep population-1.txt
	--keep population-2.txt
	--maf 0.1
	--max-missing 0.1
	--out /tmp/fst-miss

Keeping individuals in 'keep' list
After filtering, kept 10 out of 10 Individuals
Outputting Weir and Cockerham Fst estimates.
Weir and Cockerham mean Fst estimate: -0.004715
Weir and Cockerham weighted Fst estimate: 0.26394
After filtering, kept 49 out of a possible 53 Sites
Run Time = 0.00 seconds
