-
Notifications
You must be signed in to change notification settings - Fork 1
/
laaaf.py
118 lines (98 loc) · 6.5 KB
/
laaaf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
laaaf.py - estimate ancestral allele frequencies
Copyright (C) 2013 Giulio Genovese
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Written by Giulio Genovese <giulio.genovese@gmail.com>
"""
import os, argparse, sys, ctypes, gzip, numpy as np, vcf, collections, re, functools, scipy.optimize, math
import libla
def laaaf(argv):
libpath = os.path.abspath(os.path.dirname(sys.argv[0]) + os.sep + 'latools.so')
parser = argparse.ArgumentParser(description='latools.py map: admixture-map SNPs', add_help=False, usage='latools.py map [options] <la.bg> [in.vcf]')
parser.add_argument('la', metavar='la.bg', type=str, help='local ancestry file in bedgraph format (can be gzipped)')
parser.add_argument('i', metavar='in.vcf', nargs='?', type=str, default='/dev/stdin', help='input VCF file (can be gzipped) [stdin]')
parser.add_argument('-o', metavar='FILE', type=argparse.FileType('w'), default=sys.stdout, help='output VCF file [stdout]')
parser.add_argument('-e', metavar='FILE', type=argparse.FileType('r'), help='list of training samples of European ancestry')
parser.add_argument('-a', metavar='FILE', type=argparse.FileType('r'), help='list of training samples of African ancestry')
parser.add_argument('-n', metavar='FILE', type=argparse.FileType('r'), help='list of training samples of Native American ancestry')
parser.add_argument('-s', action='store_true', help='output VCF with no samples genotypes')
parser.add_argument('-g', metavar='INT', type=int, default=20, help='minimum number of genotypes [20]')
parser.add_argument('-p', metavar='FILE', type=str, default=libpath, help='library for likelihood computations [' + libpath + ']')
try:
parser.error = parser.exit
args = parser.parse_args(argv)
except SystemExit:
parser.print_help()
exit(2)
clib = ctypes.CDLL(args.p)
# open the BedGraph file and stores it in a numpy array
la_samples, chrom, start, end, la = libla.getla(args.la,clib)
# compute ancestry proportions
la_ancXX = np.sum(la>0,axis=0).astype(ctypes.c_double)
la_ancEE = np.sum(la==1,axis=0).astype(ctypes.c_double) / la_ancXX
la_ancEA = np.sum(la==2,axis=0).astype(ctypes.c_double) / la_ancXX
la_ancAA = np.sum(la==3,axis=0).astype(ctypes.c_double) / la_ancXX
la_ancEN = np.sum(la==4,axis=0).astype(ctypes.c_double) / la_ancXX
la_ancAN = np.sum(la==5,axis=0).astype(ctypes.c_double) / la_ancXX
la_ancNN = np.sum(la==6,axis=0).astype(ctypes.c_double) / la_ancXX
# add training samples from ancestral populations
tr_samples = list(la_samples)
tr_ancEE, tr_ancEA, tr_ancAA, tr_ancEN, tr_ancAN, tr_ancNN = la_ancEE, la_ancEA, la_ancAA, la_ancEN, la_ancAN, la_ancNN
# tr_ancE, tr_ancA, tr_ancN = la_ancE, la_ancA, la_ancN
trfile = [args.e, args.a, args.n]
for i in range(3):
if trfile[i] != None:
tr_samples += [sample.rstrip('\r\n') for sample in trfile[i].readlines()]
trfile[i].close()
tr_ancEE = np.concatenate((tr_ancEE,np.tile(1.0 if i==0 else 0.0,len(tr_samples)-len(tr_ancEE))))
tr_ancEA = np.concatenate((tr_ancEA,np.tile(0.0,len(tr_samples)-len(tr_ancEA))))
tr_ancAA = np.concatenate((tr_ancAA,np.tile(1.0 if i==1 else 0.0,len(tr_samples)-len(tr_ancAA))))
tr_ancEN = np.concatenate((tr_ancEN,np.tile(0.0,len(tr_samples)-len(tr_ancEN))))
tr_ancAN = np.concatenate((tr_ancAN,np.tile(0.0,len(tr_samples)-len(tr_ancAN))))
tr_ancNN = np.concatenate((tr_ancNN,np.tile(1.0 if i==2 else 0.0,len(tr_samples)-len(tr_ancNN))))
# open the input VCF file
vcf_reader = vcf.Reader(filename=args.i)
vcf_samples = vcf_reader.samples
# add INFO fields to the VCF structure
_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc'])
vcf_reader.infos['EUR_AAF'] = _Info('EUR_AAF','1','Float','Estimated Ancestral Allele Frequency for EUR Samples')
vcf_reader.infos['AFR_AAF'] = _Info('AFR_AAF','1','Float','Estimated Ancestral Allele Frequency for AFR Samples')
vcf_reader.infos['NAT_AAF'] = _Info('NAT_AAF','1','Float','Estimated Ancestral Allele Frequency for AMR Samples')
# this code is extremely slow, but does the job
tr2vcf = [(x, y) for x in range(len(tr_samples)) for y in range(len(vcf_samples)) if tr_samples[x] == vcf_samples[y]]
tr2vcf_loc = [x[0] for x in tr2vcf]
vcf2tr_loc = [x[1] for x in tr2vcf]
la2vcf = [(x, y) for x in range(len(la_samples)) for y in range(len(vcf_samples)) if la_samples[x] == vcf_samples[y]]
la2vcf_loc = [x[0] for x in la2vcf]
vcf2la_loc = [x[1] for x in la2vcf]
# iterate over the lines in the VCF file
for record in vcf_reader:
gl0, gl1, gl2, ind = libla.getGL(record,vcf2tr_loc,clib)
n = len(ind)
if n < args.g: vcf_writer.write_record(record); continue
frqE = frqA = frqN = (sum(gl1)/2+sum(gl2))/len(gl0)
tr_ind = [tr2vcf_loc[i] for i in ind]
ancEE = (ctypes.c_double * n)(); ancEE[:] = tr_ancEE[tr_ind]
ancEA = (ctypes.c_double * n)(); ancEA[:] = tr_ancEA[tr_ind]
ancAA = (ctypes.c_double * n)(); ancAA[:] = tr_ancAA[tr_ind]
ancEN = (ctypes.c_double * n)(); ancEN[:] = tr_ancEN[tr_ind]
ancAN = (ctypes.c_double * n)(); ancAN[:] = tr_ancAN[tr_ind]
ancNN = (ctypes.c_double * n)(); ancNN[:] = tr_ancNN[tr_ind]
def f(x,y,z): return -libla.sumlog(libla.lkl3(n,x,y,z,ancEE,ancEA,ancAA,ancEN,ancAN,ancNN,gl0,gl1,gl2,clib),clib)
# estimate ancestral allele frequencies
for i in range(3):
frqE = scipy.optimize.fminbound(functools.partial(f,y=frqA,z=frqN),args.m,1-args.m)
frqA = scipy.optimize.fminbound(functools.partial(f,frqE,z=frqN),args.m,1-args.m)
frqN = scipy.optimize.fminbound(functools.partial(f,frqE,frqA),args.m,1-args.m)
record.add_info('EUR_AAF',round(1e4*frqE)/1e4)
record.add_info('AFR_AAF',round(1e4*frqA)/1e4)
record.add_info('NAT_AAF',round(1e4*frqN)/1e4)