/
build_chrom_db.py
82 lines (73 loc) · 2.57 KB
/
build_chrom_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
"""
Connects to a UCSC table browser and scrapes chrominfo for every build
specified by an input file (such as one output by parse_builds.py).
If no input file is specified, it will connect using parse_builds.py to
retrieve a list of available builds.
All chromInfo is placed in a path with the convention
{dbpath}/buildname.len
Usage:
python build_chrom_db.py dbpath/ [builds_file]
"""
from __future__ import print_function
import fileinput
import os
import sys
import requests
from six.moves.urllib.parse import urlencode
import parse_builds
def getchrominfo(url, db):
    """
    Yield ``[name, length]`` pairs from the chromInfo table of build ``db``.

    Queries an hgTables CGI for the ``chromInfo`` table of ``db`` and parses
    the tab-separated response. This is a generator, so the HTTP request is
    only issued once iteration starts.

    url -- base URL of the hgTables CGI. If empty, the genome-test server
           is used.
    db  -- build name to query (sent as both ``db`` and ``hgta_track``).

    Each yielded item is a two-element list of strings: the first column of
    the row and the (positive integer) second column, left as text.

    Raises Exception if a non-comment, non-blank line does not have at
    least two tab-separated fields, has an empty first field, or has a
    second field that is not a positive integer. Errors raised by
    ``requests`` (connection failure, timeout, HTTP error status)
    propagate unchanged.
    """
    # Honour the caller-supplied endpoint; keep the historical default as a
    # fallback so an empty value still works.
    tableURL = url or "http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?"
    if not tableURL.endswith(("?", "&")):
        # Make sure the query string can be appended directly.
        tableURL += "&" if "?" in tableURL else "?"
    URL = tableURL + urlencode({
        "clade": "",
        "org": "",
        "db": db,
        "hgta_outputType": "primaryTable",
        "hgta_group": "allTables",
        "hgta_table": "chromInfo",
        "hgta_track": db,
        "hgta_regionType": "",
        "position": "",
        "hgta_doTopSubmit": "get info"})
    # A timeout keeps one unresponsive server from hanging the whole run.
    response = requests.get(URL, timeout=60)
    # Fail with a clear HTTP error rather than trying to parse an error page.
    response.raise_for_status()
    for line in response.text.splitlines():
        # Blank lines (e.g. the one produced by a trailing newline) and
        # comment lines carry no data.
        if not line.strip() or line.startswith("#"):
            continue
        fields = line.split("\t")
        try:
            valid = len(fields) > 1 and len(fields[0]) > 0 and int(fields[1]) > 0
        except ValueError:
            # Non-numeric length column: report it as a parse problem.
            valid = False
        if not valid:
            raise Exception("Problem parsing line '%s'" % line)
        yield [fields[0], fields[1]]
if __name__ == "__main__":
    # Script entry point: argv[1] is the output directory, optional argv[2]
    # is a tab-separated builds file whose first column is the build name.
    if len(sys.argv) == 1:
        sys.exit("Path to place chromInfo tables must be specified.")
    dbpath = sys.argv[1]
    builds = []
    if len(sys.argv) > 2:
        try:
            buildfile = fileinput.FileInput(sys.argv[2])
            try:
                for line in buildfile:
                    if line.startswith("#"):
                        continue
                    # Strip so a line without a tab does not keep its
                    # newline, and skip blank lines entirely.
                    build = line.split("\t")[0].strip()
                    if build:
                        builds.append(build)
            finally:
                buildfile.close()
        except Exception:
            # Not a bare except: Ctrl-C and sys.exit must still propagate.
            sys.exit("Bad input file.")
    else:
        try:
            for build in parse_builds.getbuilds("http://genome.cse.ucsc.edu/cgi-bin/das/dsn"):
                builds.append(build[0])
        except Exception:
            sys.exit("Unable to retrieve builds.")
    for build in builds:
        if build == "?":
            continue  # no lengths for unspecified chrom
        print("Retrieving " + build)
        # os.path.join works whether or not dbpath ends with a separator.
        outfile_name = os.path.join(dbpath, build + ".len")
        try:
            with open(outfile_name, "w") as outfile:
                for chrominfo in getchrominfo("http://genome-test.cse.ucsc.edu/cgi-bin/hgTables?", build):
                    print("\t".join(chrominfo), file=outfile)
        except Exception as e:
            print("Failed to retrieve %s: %s" % (build, e))
            # Drop the partial file, but only if it was actually created
            # (open() itself may have been what failed).
            if os.path.exists(outfile_name):
                os.remove(outfile_name)