# Creating and Testing Simulations

In [28]:
import toytree
import ipcoal
import numpy as np
import pandas as pd

## Random tree

In [16]:
tree = toytree.rtree.imbtree(ntips=10, treeheight=1e5)
tree.draw(ts='p')

(<toyplot.canvas.Canvas at 0x7ff9b595c760>,
 <toyplot.coordinates.Cartesian at 0x7ff9b5bb9ca0>,
 <toytree.Render.ToytreeMark at 0x7ffb0c48e340>)

## Simulating Introgression

In [17]:
tree.draw(ts='p', admixture_edges=[(3,8)])

(<toyplot.canvas.Canvas at 0x7ff9b82b1310>,
 <toyplot.coordinates.Cartesian at 0x7ff9b829b850>,
 <toytree.Render.ToytreeMark at 0x7ff9b5b0a2e0>)

In [18]:
mod_introgress =  ipcoal.Model(tree=tree, Ne=1e5, admixture_edges=[(3, 8, 0.5, 0.5)], nsamples=1)
mod_introgress.sim_loci(nloci=1, nsites=100000) #1 haploid chromosome
genos_introgress=mod_introgress.write_vcf().iloc[:, 9:].T #i need a matrix of 1's and 0's
genos_introgress

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1872,1873,1874,1875,1876,1877,1878,1879,1880,1881
r0,0,0,1,0,0,1,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
r1,0,0,0,0,0,0,1,1,0,0,...,0,1,0,1,0,0,0,0,1,0
r2,0,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
r3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
r4,0,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
r5,0,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
r6,1,0,0,1,1,0,0,0,0,1,...,0,0,1,0,1,1,1,1,0,1
r7,1,0,0,0,0,0,0,0,0,1,...,1,1,0,1,0,0,0,0,1,0
r8,1,1,0,0,0,0,0,0,1,1,...,0,1,0,1,0,0,0,0,1,0
r9,1,0,0,0,0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,1,0


## Simulating High ILS

I want to include high ILS as an option for null comparison

In [19]:
mod_highILS =  ipcoal.Model(tree=tree, Ne=1e8, nsamples=1)
mod_highILS.sim_loci(nloci=1, nsites=1000) #1 haploid chromosome
genos_highILS=mod_highILS.write_vcf()
data_highILS = genos_highILS
genos_highILS

#getting 2's and 3's

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9
0,0,1,.,T,"A,C,G",99,PASS,.,GT,3,0,0,2,0,2,0,1,0,0
1,0,2,.,A,C,99,PASS,.,GT,0,1,0,1,1,0,0,1,0,1
2,0,3,.,G,"A,C,T",99,PASS,.,GT,0,3,2,2,3,1,2,1,1,3
3,0,4,.,T,"A,C,G",99,PASS,.,GT,2,2,1,0,2,3,3,3,3,2
4,0,5,.,T,"A,C,G",99,PASS,.,GT,2,2,3,2,2,1,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,0,995,.,A,"C,G,T",99,PASS,.,GT,2,1,2,2,3,1,0,1,2,2
981,0,996,.,G,"C,T",99,PASS,.,GT,0,2,0,1,0,2,2,2,1,1
982,0,997,.,A,T,99,PASS,.,GT,1,0,1,0,1,0,0,0,0,0
983,0,998,.,C,"A,G,T",99,PASS,.,GT,3,2,3,0,3,2,1,2,0,0


In [20]:
mod_highILS.df

Unnamed: 0,locus,start,end,nbps,nsnps,tidx,genealogy
0,0,0,1,1,1,0,"(r0:3.89915e+08,(r2:2.94..."
1,0,1,3,2,2,1,"(r0:3.89915e+08,(r2:2.94..."
2,0,3,4,1,1,2,"(r0:3.28997e+08,(r2:2.94..."
3,0,4,5,1,1,3,"(r0:3.28997e+08,(r2:2.51..."
4,0,5,7,2,2,4,"(r0:3.28997e+08,(r2:2.51..."
...,...,...,...,...,...,...,...
522,0,990,993,3,2,522,"(((r4:5.53426e+07,(r0:5...."
523,0,993,994,1,1,523,"(((r4:5.53426e+07,(r0:5...."
524,0,994,995,1,1,524,"(r6:1.48641e+08,((r7:2.9..."
525,0,995,997,2,2,525,"(r6:1.48641e+08,((r7:2.9..."


In [21]:
# a dictionary of arguments to style the drawings
kwargs = {
    "ts": "c",
    "tip_labels": True,
    "shared_axis": True,
    "width": 600,
    "height": 200,
    "node_sizes": 6,
}
toytree.mtree(mod_highILS.df.genealogy).draw()

(<toyplot.canvas.Canvas at 0x7ff9b42d7dc0>,
 [<toyplot.coordinates.Cartesian at 0x7ff9b42d7d60>,
  <toyplot.coordinates.Cartesian at 0x7ff9b42f3880>,
  <toyplot.coordinates.Cartesian at 0x7ff9b42f61f0>,
  <toyplot.coordinates.Cartesian at 0x7ff9b42f6b20>],
 [<toytree.Render.ToytreeMark at 0x7ff9b427c490>,
  <toytree.Render.ToytreeMark at 0x7ff9b427c550>,
  <toytree.Render.ToytreeMark at 0x7ff9b427c580>,
  <toytree.Render.ToytreeMark at 0x7ff9b427c5b0>])

In [37]:
genealogies = np.array(mod_highILS.df['genealogy'])
unique_genos = np.unique(genealogies)
len(unique_genos)
len(genealogies)

527

In [40]:
trees=[]
for geno in unique_genos:
    tree = toytree.tree(geno, tree_format=0)
    trees.append(tree)

In [42]:
trees

[<toytree.Toytree.ToyTree at 0x7ff9b438e880>,
 <toytree.Toytree.ToyTree at 0x7ff9b613e250>,
 <toytree.Toytree.ToyTree at 0x7ff9b5f0ab20>,
 <toytree.Toytree.ToyTree at 0x7ff9b4245a90>,
 <toytree.Toytree.ToyTree at 0x7ff9b6015460>,
 <toytree.Toytree.ToyTree at 0x7ff9b5faffd0>,
 <toytree.Toytree.ToyTree at 0x7ffb0c440220>,
 <toytree.Toytree.ToyTree at 0x7ff9b438e940>,
 <toytree.Toytree.ToyTree at 0x7ffb0c440310>,
 <toytree.Toytree.ToyTree at 0x7ff9b5f81610>,
 <toytree.Toytree.ToyTree at 0x7ff9b5fa1cd0>,
 <toytree.Toytree.ToyTree at 0x7ff9b5fa1be0>,
 <toytree.Toytree.ToyTree at 0x7ff9b5fa1070>,
 <toytree.Toytree.ToyTree at 0x7ff9b5fb35e0>,
 <toytree.Toytree.ToyTree at 0x7ff9b5fb3c70>,
 <toytree.Toytree.ToyTree at 0x7ff9b5fb3880>,
 <toytree.Toytree.ToyTree at 0x7ff9b46981f0>,
 <toytree.Toytree.ToyTree at 0x7ff9b5f82f10>,
 <toytree.Toytree.ToyTree at 0x7ff9b5f82820>,
 <toytree.Toytree.ToyTree at 0x7ff9b5f82d90>,
 <toytree.Toytree.ToyTree at 0x7ff9b5f88bb0>,
 <toytree.Toytree.ToyTree at 0x7ff

## Slow mutation rate

In [24]:
mod_slowmut =  ipcoal.Model(tree=tree, Ne=1e6, nsamples=1)
mod_slowmut.sim_loci(nloci=1, nsites=100000) #1 haploid chromosome
genos_slowmut=mod_slowmut.write_vcf().iloc[:, 9:].T
genos_slowmut

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11221,11222,11223,11224,11225,11226,11227,11228,11229,11230
r0,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
r1,1,0,0,0,1,0,1,0,0,0,...,0,1,0,1,0,1,0,1,0,1
r2,1,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,1,0,1,0,0
r3,1,0,1,1,0,0,1,0,0,0,...,1,0,1,0,0,0,0,0,1,0
r4,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,1,0,0
r5,1,0,1,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,1,0,0
r6,1,0,1,1,0,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,0
r7,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,1,0,0,1,0,0
r8,1,0,0,0,1,1,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
r9,1,0,1,1,0,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [26]:
for col in genos_slowmut:
    genos_slowmut[col] = genos_slowmut[col].replace([2,3],1)

## Trying to simulate SNPs

In [31]:
model = ipcoal.Model(tree, Ne=1e5, nsamples=1) #initialize model

In [32]:
model.sim_snps(nsnps=100) #simulate snps

In [33]:
model.df #look at dataframe

Unnamed: 0,locus,start,end,nbps,nsnps,tidx,genealogy
0,0,0,1,1,1,0,"(r6:367457,(r8:171024,((..."
1,1,0,1,1,1,0,"((r5:116213,(r8:113972,r..."
2,2,0,1,1,1,0,"(((r0:55561.9,r5:55561.9..."
3,3,0,1,1,1,0,"((r1:176125,(r9:113921,(..."
4,4,0,1,1,1,0,"((r3:141210,(r8:123326,(..."
...,...,...,...,...,...,...,...
95,95,0,1,1,1,0,"((r6:93426,(r2:55068.3,r..."
96,96,0,1,1,1,0,"(((r6:126447,(r4:107894,..."
97,97,0,1,1,1,0,"(r7:190087,((r0:76920.3,..."
98,98,0,1,1,1,0,"((r9:138946,(r3:105167,(..."


In [34]:
kwargs = {
    "ts": "c",
    "tip_labels": True,
    "width": 600,
    "height": 200,
    "node_sizes": 6,
}
toytree.mtree(model.df.genealogy).draw(**kwargs); #draw genealogies

In [35]:
snps=model.write_vcf()
snp_data = snps.iloc[:, 9:].T
snps

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9
0,0,1,.,C,A,99,PASS,.,GT,0,0,1,0,0,0,0,0,0,0
1,1,1,.,G,T,99,PASS,.,GT,0,0,0,0,0,0,0,0,0,1
2,2,1,.,C,A,99,PASS,.,GT,1,1,0,0,0,1,1,0,1,0
3,3,1,.,T,A,99,PASS,.,GT,1,0,0,0,0,0,0,0,0,0
4,4,1,.,T,G,99,PASS,.,GT,0,1,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,1,.,T,A,99,PASS,.,GT,0,0,1,1,0,0,1,0,0,0
96,96,1,.,T,A,99,PASS,.,GT,0,0,0,0,0,1,0,0,0,0
97,97,1,.,T,C,99,PASS,.,GT,0,0,0,0,0,0,0,0,1,0
98,98,1,.,T,G,99,PASS,.,GT,0,0,0,0,0,0,0,1,0,0


In [45]:
csv = snp_data.to_csv(file1)

In [6]:
mydata = pd.read_csv(file1, index_col=0)
#mydata.drop(["Unnamed: 0"], axis=1)
mydata

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
r0,0,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
r1,0,0,1,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
r2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
r3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
r4,0,0,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
r5,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
r6,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
r7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
r8,0,0,1,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
r9,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
model.df

Unnamed: 0,locus,start,end,nbps,nsnps,tidx,genealogy
0,0,0,1,1,1,0,"((r6:130956,(r7:130407,(..."
1,1,0,1,1,1,0,"((r4:125522,(r8:103182,(..."
2,2,0,1,1,1,0,"(r9:356573,(r1:210923,(r..."
3,3,0,1,1,1,0,"((r9:118795,(r0:96411,(r..."
4,4,0,1,1,1,0,"(((r4:62954,(r0:44648.2,..."
...,...,...,...,...,...,...,...
95,95,0,1,1,1,0,"((r9:148718,(r0:97078,r8..."
96,96,0,1,1,1,0,"(r8:968471,((r5:131232,(..."
97,97,0,1,1,1,0,"(r7:265789,(((r0:86569.4..."
98,98,0,1,1,1,0,"((r6:103972,(r8:98686.5,..."


## Simulating SNPs in the Presence of Introgression

I want to simulate an admixture event close to recent between divergent lineages (mimic horizontal gene transfer). I want a simulated dataset that I can run through Hogtie as a test.

In [2]:
hgt_tree = toytree.rtree.imbtree(ntips=10, treeheight=1e10)
hgt_tree.draw(ts='p', admixture_edges=(2,8))

(<toyplot.canvas.Canvas at 0x7ffb1c660820>,
 <toyplot.coordinates.Cartesian at 0x7ff9ba22ab50>,
 <toytree.Render.ToytreeMark at 0x7ff9b9760a90>)

In [3]:
snp_introgression_model =  ipcoal.Model(tree=hgt_tree, Ne=1e8, admixture_edges=[(2, 8, 0.6, 0.9)], nsamples=1)

In [4]:
snp_introgression_model.sim_snps(nsnps=10000, repeat_on_trees=1)

In [5]:
snp_introgression_model.df

Unnamed: 0,locus,start,end,nbps,nsnps,tidx,genealogy
0,0,0,1,1,1,0,"(r9:1.01568e+10,((r2:1.3..."
1,1,0,1,1,1,0,"(r9:1.01025e+10,((r2:1.4..."
2,2,0,1,1,1,0,"(r9:1.02837e+10,((r2:1.4..."
3,3,0,1,1,1,0,"(r9:1.00285e+10,((r2:1.3..."
4,4,0,1,1,1,0,"(r9:1.00535e+10,((r2:1.6..."
...,...,...,...,...,...,...,...
9995,9995,0,1,1,1,0,"(r9:1.02572e+10,((r2:1.6..."
9996,9996,0,1,1,1,0,"(r9:1.01631e+10,((r2:1.5..."
9997,9997,0,1,1,1,0,"(r9:1.00469e+10,((r2:1.4..."
9998,9998,0,1,1,1,0,"(r9:1.03693e+10,((r2:1.6..."


In [6]:
data = snp_introgression_model.write_vcf().iloc[:, 9:].T

[1, 1, 2, 2, 0, 0, 0, 1, 0, 1]

In [7]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
r0,1,2,1,3,0,1,2,2,1,1,...,2,1,1,2,0,0,0,2,2,1
r1,1,2,2,3,2,0,0,1,1,2,...,1,0,2,1,2,0,3,1,2,0
r2,2,0,3,2,1,0,1,1,2,3,...,2,0,3,2,2,1,3,1,0,3
r3,2,2,0,3,0,1,0,0,0,3,...,1,2,2,1,1,0,1,2,2,1
r4,0,0,0,0,2,2,0,1,2,1,...,3,2,2,2,3,0,3,0,0,1
r5,0,2,2,2,2,1,1,1,0,3,...,1,2,2,1,0,0,2,1,1,1
r6,0,0,1,2,0,0,3,0,0,1,...,1,0,1,1,1,2,3,3,1,2
r7,1,1,0,1,2,0,3,1,2,3,...,2,1,3,0,2,1,3,2,0,0
r8,0,1,1,1,1,1,2,3,0,1,...,3,1,3,2,2,1,2,0,0,2
r9,1,0,3,2,0,3,0,0,2,0,...,1,2,0,1,1,1,1,1,1,1


In [8]:
for col in data:
    data[col] = data[col].replace([2,3],1)

In [10]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
r0,1,1,1,1,0,1,1,1,1,1,...,1,1,1,1,0,0,0,1,1,1
r1,1,1,1,1,1,0,0,1,1,1,...,1,0,1,1,1,0,1,1,1,0
r2,1,0,1,1,1,0,1,1,1,1,...,1,0,1,1,1,1,1,1,0,1
r3,1,1,0,1,0,1,0,0,0,1,...,1,1,1,1,1,0,1,1,1,1
r4,0,0,0,0,1,1,0,1,1,1,...,1,1,1,1,1,0,1,0,0,1
r5,0,1,1,1,1,1,1,1,0,1,...,1,1,1,1,0,0,1,1,1,1
r6,0,0,1,1,0,0,1,0,0,1,...,1,0,1,1,1,1,1,1,1,1
r7,1,1,0,1,1,0,1,1,1,1,...,1,1,1,0,1,1,1,1,0,0
r8,0,1,1,1,1,1,1,1,0,1,...,1,1,1,1,1,1,1,0,0,1
r9,1,0,1,1,0,1,0,0,1,0,...,1,1,0,1,1,1,1,1,1,1


In [13]:
data.to_csv(file1)

In [3]:
import numpy as np
import pandas as pd

In [15]:
for column in data:
    data[column] = data[column].replace([2,3],1)

In [11]:
import os
HOGTIEDIR = os.path.dirname(os.getcwd())
file1 = os.path.join(HOGTIEDIR, "sampledata", "testmatrix.csv")
file2 = os.path.join(HOGTIEDIR, "sampledata", "testtree.txt")