## Load packages

In [157]:
import subprocess, msprime, tskit, os, statistics, pyslim
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import math
import os
import pickle

## Write input files

In [158]:
def WriteInput(ts, Nrun, SampleSize, Ne, Stop, SeqLen, MutRate, RecRate, seed1, seed2, filename):
    """Write the two input files read by the C sampling program.

    ``filename + "1"`` receives the run parameters.  ``filename + "2"``
    receives the data: the distinct haplotypes with their multiplicities,
    the mutation transition line and the site positions.  Every haplotype is
    prefixed with one extra locus fixed at allele 0 (position 0.0), so the
    number of loci written is ``L + 1``.

    Parameters
    ----------
    ts : tree sequence
        Object providing ``genotype_matrix()`` (indexed ``[site][sample]``),
        ``num_samples`` and ``sites()``.
    Nrun, Ne, Stop, SeqLen, MutRate, RecRate, seed1, seed2
        Values copied into the parameter file.
    SampleSize : int
        Kept for backward compatibility only; the value actually used is
        always ``ts.num_samples``.
    filename : str
        Common prefix of the two output files.

    Returns
    -------
    order : list of int
        Sample indices sorted (stably) by the index of their distinct
        haplotype, i.e. grouped in the order the haplotypes are written.
    L : int
        Number of sites in ``ts`` (the extra leading locus is not counted).
    stationary : float
        One minus the mean allele value over all sites and samples, the same
        value written on the ``Stationary`` line (``nan`` if it is undefined
        because there are no sites or no samples).
    """
    # --- parameter file -------------------------------------------------
    with open(filename + "1", "w") as f1:
        f1.write("Number of runs = %d\n" % Nrun)
        f1.write("Effective population size = %d\n" % Ne)
        f1.write("Recombination = %.15f\n" % RecRate)
        f1.write("Mutation = %.15f\n" % MutRate)
        f1.write("Length = %f\n" % SeqLen)
        f1.write("Stop = %d\n" % Stop)
        f1.write("Fix seed = %d, %d\n" % (seed1, seed2))
        f1.write("#")

    geno = ts.genotype_matrix()
    L = len(geno)
    SampleSize = ts.num_samples

    # --- collapse the samples into distinct haplotypes -------------------
    # type_of maps a haplotype to its index in order of first appearance
    # (dicts preserve insertion order); counts[k] is its multiplicity.
    type_of = {}
    counts = []
    sample_type = []
    for j in range(SampleSize):
        hap = (0,) + tuple(int(geno[i][j]) for i in range(L))
        k = type_of.setdefault(hap, len(counts))
        if k == len(counts):
            counts.append(0)
        counts[k] += 1
        # Record the haplotype index for BOTH new and repeated haplotypes;
        # recording the sample index for repeats would corrupt the ordering.
        sample_type.append(k)
    haplotypes = list(type_of)

    tot = sum(sum(hap) * cnt for hap, cnt in zip(haplotypes, counts))
    cells = L * SampleSize
    stationary = tot / cells if cells else float("nan")

    # --- data file --------------------------------------------------------
    positions = [0.0] + [s.position for s in ts.sites()]
    with open(filename + "2", "w") as f2:
        f2.write("Loci = %d\n" % (L + 1))
        f2.write("Genes = %d\n" % SampleSize)
        f2.write("Distinct Haplotypes = %d\n" % len(haplotypes))
        f2.write("Stationary = %.5f\n" % (1 - stationary))

        f2.write("Haplotypes\n")
        for hap, cnt in zip(haplotypes, counts):
            # alleles (leading dummy locus first), then the multiplicity
            f2.write(" ".join("%d" % a for a in hap) + " %d\n" % cnt)

        # Mutation transition rates
        f2.write("\nMutation\n0.0 1.0 0.0 1.0\n")

        # Genetic positions (0.0 belongs to the leading dummy locus)
        f2.write("\nPositions\n")
        f2.write("".join("%.2f " % p for p in positions))

        f2.write("\n#")

    # Stable argsort of the per-sample haplotype indices.
    order = sorted(range(len(sample_type)), key=sample_type.__getitem__)
    return order, L, 1 - stationary

## Set parameters

In [180]:
# Global settings shared by every cell below.
Ne = 775 # population size: written to the C input file and used as ancestral_Ne for recapitation
SampleSize = 20 # total number of sampled sequences (cases + controls)
Nrun = 5000 # number of SIS runs
Stop = 11 # the stopping criteria
SeqLen = 16569 # sequence length
MutRate = 2.6e-7 # mutation rate
RecRate = 0 # recombination rate
seed1 = 1 # seed1 and seed2 are passed to the C program (the "Fix seed" line of the input file)
seed2 = 10
seed = 123 # seed for recapitation and for simulating mutations
# prefix of the C program's input files; WriteInput appends "1" and "2"
filename = "C:\\Users\\blabl\\Desktop\\git repos\\Sampling_Wu\\data8_infile"

## Sample case and control sequences
To sample case sequences, the simplest approach is to find nodes that are ancestral to exactly ten present-day sequences; those ten sequences are then used as the case sequences. If there are not enough such nodes, I look for nodes that are ancestral to more than ten present-day sequences and randomly select ten sequences from each node's present-day descendants. In this example, eight nodes are ancestral to exactly ten present-day sequences.

In [196]:
# Load the SLiM tree sequence and, for every candidate ancestor node
# 0 .. 2*Ne-1, count how many of the nodes 2*Ne .. 4*Ne-1 (treated here as
# the present-day sequences) descend from it in the local tree at position 100.
tree = pyslim.load("C:\\Users\\blabl\\Dropbox\\RA_project\\SliM\\recipe_new.trees")
ts = tree.at(100)  # loop-invariant: one local tree serves every candidate node
tot = 0
clade_MRCA1 = [] # store nodes that contribute to exactly ten samples at present
clade_MRCA2 = [] # store nodes that contribute to more than ten samples
for c in range(Ne*2):
    count = sum(1 for s in range(Ne*2, Ne*4) if ts.is_descendant(s, c))
    tot += count
    # the two categories are mutually exclusive
    if count == 10:
        clade_MRCA1.append(c)
        print("clade %d contribute to %d samples" %(c, count))
    elif count > 10:
        clade_MRCA2.append(c)
        print("clade %d contribute to %d samples" %(c, count))
# sanity check: sum of the descendant counts over all candidate nodes
print("check:%d samples in total" % tot)

clade 27 contribute to 11 samples
clade 38 contribute to 13 samples
clade 68 contribute to 12 samples
clade 126 contribute to 10 samples
clade 129 contribute to 13 samples
clade 162 contribute to 12 samples
clade 210 contribute to 10 samples
clade 285 contribute to 10 samples
clade 294 contribute to 10 samples
clade 340 contribute to 15 samples
clade 408 contribute to 20 samples
clade 417 contribute to 10 samples
clade 440 contribute to 11 samples
clade 448 contribute to 10 samples
clade 607 contribute to 12 samples
clade 697 contribute to 11 samples
clade 711 contribute to 11 samples
clade 779 contribute to 10 samples
clade 830 contribute to 14 samples
clade 855 contribute to 16 samples
clade 1004 contribute to 20 samples
clade 1019 contribute to 19 samples
clade 1067 contribute to 13 samples
clade 1146 contribute to 10 samples
clade 1169 contribute to 12 samples
clade 1197 contribute to 13 samples
clade 1234 contribute to 11 samples
clade 1260 contribute to 14 samples
clade 1272 cont

In [183]:
# Cases: every node in 2*Ne .. 4*Ne-1 that descends, in the local tree at
# position 100, from the first node found to contribute to exactly ten sequences.
ts = tree.at(100)
cases = [
    node
    for node in range(Ne*2, Ne*4)
    if ts.is_descendant(node, clade_MRCA1[0])
]
cases

[1707, 2040, 2067, 2199, 2395, 2458, 2543, 2568, 2809, 3095]

In [184]:
# Controls: drawn without replacement from the nodes that are not cases,
# so that cases + controls total SampleSize sequences.
np.random.seed(18)
case_set = set(cases)
rest = [node for node in range(Ne*2, Ne*4) if node not in case_set]
n_control = SampleSize - len(cases)
control = np.random.choice(rest, size=n_control, replace=False)
control

array([2929, 2455, 2006, 2625, 1679, 2577, 3081, 2630, 2853, 2438])

## Get the tree for the selected sample sequences
1. Recapitate the original SLiM tree
2. Simplify the recapitated tree so that it contains only the history of the selected sample sequences
3. Add mutations to the simplified tree
4. Get the haplotypes that will be used to write the input files for the C program

In [185]:
# 1. recapitate the SLiM tree sequence
tree_re = pyslim.recapitate(
    tree,
    ancestral_Ne=Ne,
    recombination_rate=0,
    random_seed=seed,
)
# 2. keep only the history of the selected sequences (cases first, then controls);
#    simplify returns the new tree sequence together with the node-ID map
com = np.hstack([cases, control])
simplified_ts, map_node = tree_re.simplify(samples=com, keep_unary=True, map_nodes=True)
# 3. add mutations, keeping any that are already present
tree_com = msprime.sim_mutations(simplified_ts, rate=MutRate, random_seed=seed, keep=True)
# 4. genotype matrix used to write the C program's input
geno_com = tree_com.genotype_matrix()

In [186]:
np.shape(geno_com) # (segregating sites, sequences); the recorded run shows 17 sites for 20 sequences

(17, 20)

In [187]:
# One tab-separated line per variant site: position, alleles, genotype vector.
for variant in tree_com.variants():
    fields = (variant.site.position, variant.alleles, variant.genotypes)
    print(*fields, sep="\t")

68.0	('C', 'T')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
712.0	('A', 'G')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
955.0	('C', 'T')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
1378.0	('G', 'A')	[0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0]
2536.0	('G', 'C')	[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
2987.0	('A', 'C')	[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
3832.0	('A', 'T')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
4482.0	('C', 'G')	[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0]
5281.0	('G', 'A')	[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
6299.0	('T', 'A')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
6510.0	('G', 'T')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
7904.0	('T', 'C')	[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
9099.0	('C', 'T')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
9224.0	('C', 'T')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
10392.0	('C', 'T')	[1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 0]
10571.0	('A', 'G')	[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
14498.0	('T', 'A')	[1 1 1 1 1 1 1 1 1 1 1 

In [188]:
# store the ID of the MRCA of case clade in the simplified tree
# (map_node is the node map returned by simplify(..., map_nodes=True);
# it is indexed by the node's ID before simplification)
new_case_MRCA = map_node[clade_MRCA1[0]]

In [189]:
# check that EVERY case sample descends from the case clade's MRCA.
# The cases were listed first in `com`, and simplify renumbers the requested
# samples 0, 1, 2, ... in the order given, so the cases are nodes 0 .. len(cases)-1.
tree_com_at_100 = tree_com.at(100)
all(tree_com_at_100.is_descendant(i, new_case_MRCA) for i in range(len(cases)))

True

## Write the input files for the C program

In [190]:
# WriteInput returns a 3-tuple (order, L, 1 - stationary); despite its name,
# order_com holds the whole tuple, not just the ordering.
order_com = WriteInput(tree_com, Nrun, SampleSize, Ne, Stop, SeqLen, MutRate, RecRate, seed1, seed2, filename)

## Count the number of mutations before MRCA of case clade is found

In [191]:
# get the node time of the case clade's MRCA
# (looked up in the simplified, mutated tree sequence by its new node ID;
# presumably measured as time before the present — confirm units)
case_MRCA_time = tree_com.tables.nodes[new_case_MRCA].time

In [192]:
# number of mutations (anywhere in the tree sequence) whose time is strictly
# smaller than the node time of the case clade's MRCA
# NOTE(review): the recorded result is 0 here and for every data set generated
# further down — confirm that the mutation times are populated as expected.
sum(tree_com.tables.mutations.time < case_MRCA_time)

0

In [226]:
# For nodes that contribute to exactly ten sequences

# Seed BOTH generators: the cases are drawn with the stdlib `random` module
# and the controls with NumPy, so seeding NumPy alone is not reproducible.
random.seed(18)
np.random.seed(18)

present = range(Ne*2, Ne*4)
ts = tree.at(100)  # loop-invariant local tree
# Recapitation depends only on `tree`, `Ne` and `seed`, so do it once.
tree_re = pyslim.recapitate(tree, ancestral_Ne=Ne, recombination_rate=0, random_seed=seed)

for c, clade in enumerate(clade_MRCA1):
    # select cases
    samples = [s for s in present if ts.is_descendant(s, clade)]
    cases = random.sample(samples, SampleSize // 2)

    # sample controls from the sequences outside the clade
    in_clade = set(samples)
    rest = [x for x in present if x not in in_clade]
    control = np.random.choice(rest, size=(SampleSize - len(cases)), replace=False)

    # tree structure for cases + controls
    com = np.hstack([cases, control])
    simplified_ts, map_node = tree_re.simplify(samples=com, keep_unary=True, map_nodes=True)
    tree_com = msprime.sim_mutations(simplified_ts, rate=MutRate, random_seed=seed, keep=True)
    geno_com = tree_com.genotype_matrix()

    # count number of mutations before cases' MRCA
    new_case_MRCA = map_node[clade]
    tables = tree_com.tables
    case_MRCA_time = tables.nodes[new_case_MRCA].time
    # the cases come first in `com`, so they are samples 0 .. len(cases)-1
    tree_com_at_100 = tree_com.at(100)
    for i in range(len(cases)):
        if not tree_com_at_100.is_descendant(i, new_case_MRCA):
            print("Wrong mapping of node id.")
    mut_num = int(np.sum(tables.mutations.time < case_MRCA_time))
    print("data %d, case MRCA: node %d, number of mutations %d" % (c+1, clade, mut_num))

    # write input for C program; the file index matches the "data %d" label
    # printed above (data 1 .. len(clade_MRCA1)), so these files no longer
    # collide with the ones written for the clade_MRCA2 data sets
    filename = "C:\\Users\\blabl\\Desktop\\git repos\\Sampling_Wu\\data" + str(c+1) + "_infile"
    order_com = WriteInput(tree_com, Nrun, SampleSize, Ne, Stop, SeqLen, MutRate, RecRate, seed1, seed2, filename)

data 1, case MRCA: node 126, number of mutations 0
data 2, case MRCA: node 210, number of mutations 0
data 3, case MRCA: node 285, number of mutations 0
data 4, case MRCA: node 294, number of mutations 0
data 5, case MRCA: node 417, number of mutations 0
data 6, case MRCA: node 448, number of mutations 0
data 7, case MRCA: node 779, number of mutations 0
data 8, case MRCA: node 1146, number of mutations 0


## Generate more data sets

In [225]:
# For nodes that contribute to more than ten sequences

# Seed BOTH generators: the cases are drawn with the stdlib `random` module
# and the controls with NumPy, so seeding NumPy alone would leave the choice
# of cases different on every run.
random.seed(18)
np.random.seed(18)

present = range(Ne*2, Ne*4)
ts = tree.at(100)  # loop-invariant local tree
# Recapitation depends only on `tree`, `Ne` and `seed`, so do it once.
tree_re = pyslim.recapitate(tree, ancestral_Ne=Ne, recombination_rate=0, random_seed=seed)

for c, clade in enumerate(clade_MRCA2):
    # select cases: a random half-sample of the clade's present-day sequences
    samples = [s for s in present if ts.is_descendant(s, clade)]
    cases = random.sample(samples, SampleSize // 2)

    # sample controls from the sequences outside the clade
    in_clade = set(samples)
    rest = [x for x in present if x not in in_clade]
    control = np.random.choice(rest, size=(SampleSize - len(cases)), replace=False)

    # tree structure for cases + controls
    com = np.hstack([cases, control])
    simplified_ts, map_node = tree_re.simplify(samples=com, keep_unary=True, map_nodes=True)
    tree_com = msprime.sim_mutations(simplified_ts, rate=MutRate, random_seed=seed, keep=True)
    geno_com = tree_com.genotype_matrix()

    # count number of mutations before cases' MRCA
    new_case_MRCA = map_node[clade]
    tables = tree_com.tables
    case_MRCA_time = tables.nodes[new_case_MRCA].time
    # the cases come first in `com`, so they are samples 0 .. len(cases)-1
    tree_com_at_100 = tree_com.at(100)
    for i in range(len(cases)):
        if not tree_com_at_100.is_descendant(i, new_case_MRCA):
            print("Wrong mapping of node id.")
    mut_num = int(np.sum(tables.mutations.time < case_MRCA_time))
    print("data %d, case MRCA: node %d, number of mutations %d" % (c+1+len(clade_MRCA1), clade, mut_num))

    # write input for C program; numbering continues after the clade_MRCA1 data sets
    filename = "C:\\Users\\blabl\\Desktop\\git repos\\Sampling_Wu\\data" + str(c+1+len(clade_MRCA1)) + "_infile"
    order_com = WriteInput(tree_com, Nrun, SampleSize, Ne, Stop, SeqLen, MutRate, RecRate, seed1, seed2, filename)


data 9, case MRCA: node 27, number of mutations 0
data 10, case MRCA: node 38, number of mutations 0
data 11, case MRCA: node 68, number of mutations 0
data 12, case MRCA: node 129, number of mutations 0
data 13, case MRCA: node 162, number of mutations 0
data 14, case MRCA: node 340, number of mutations 0
data 15, case MRCA: node 408, number of mutations 0
data 16, case MRCA: node 440, number of mutations 0
data 17, case MRCA: node 607, number of mutations 0
data 18, case MRCA: node 697, number of mutations 0
data 19, case MRCA: node 711, number of mutations 0
data 20, case MRCA: node 830, number of mutations 0
data 21, case MRCA: node 855, number of mutations 0
data 22, case MRCA: node 1004, number of mutations 0
data 23, case MRCA: node 1019, number of mutations 0
data 24, case MRCA: node 1067, number of mutations 0
data 25, case MRCA: node 1169, number of mutations 0
data 26, case MRCA: node 1197, number of mutations 0
data 27, case MRCA: node 1234, number of mutations 0
data 28, c