forked from Sage-Bionetworks/tcgaImport
-
Notifications
You must be signed in to change notification settings - Fork 0
/
whitelist.py
94 lines (81 loc) · 3.89 KB
/
whitelist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import synapseclient
from synapseclient import File
import synapseHelpers
import pandas as pd
import hashlib
from synapseHelpers import query2df, thisCodeInSynapse
#from multiprocessing.dummy import Pool
# Synapse query selecting the files to process; its result becomes `inputFiles`.
QUERY_STR = "select * from file where benefactorId=='syn2812961' and acronym=='PANCAN'"
# Synapse id of the whitelist entity: fetched with syn.get() and used to pick
# the whitelist reference out of a stored file's provenance.
WHITELISTID = 'syn4551248'
def getFileIdFromName(name, parentId='syn4557014'):
    """Return the Synapse id of the file called `name` inside `parentId`.

    Arguments:
    - `name`: exact file name to look up.
    - `parentId`: Synapse id of the containing folder; defaults to the folder
      this script stores its whitelisted output files in.

    Raises StopIteration when the query yields no rows; isUptodate relies on
    that to detect output files that have not been stored yet.
    """
    query = "select id from file where name=='%s' and parentId=='%s'" % (name, parentId)
    results = syn.chunkedQuery(query)
    # next() rather than the Python-2-only .next() method; only the first
    # match is used.
    return next(results)['file.id']
def isUptodate(name, files, toRemove, platform):
    """Report whether the stored output file `name` still reflects its inputs.

    Arguments:
    - `name`: name of the previously stored output file.
    - `files`: entities (with `id` and `versionNumber`) the output is built from.
    - `toRemove`: set of aliquot barcodes the current whitelist excludes for
      `platform`.
    - `platform`: platform value used to filter the whitelist.

    Returns True only when the output exists, its provenance lists exactly
    the current versions of `files`, and the whitelist version recorded in
    that provenance excluded the same barcodes for `platform`.
    """
    try:
        fileId = getFileIdFromName(name)
    except StopIteration:
        # The lookup query returned no rows: the output was never stored.
        print('File not found')
        return False
    activity = syn.getProvenance(fileId)

    # (id, version) of everything the previous run used as data, skipping the
    # entries recorded as executed code.
    usedRefs = set((x['reference']['targetId'], x['reference']['targetVersionNumber'])
                   for x in activity['used'] if not x['wasExecuted'])

    whitelistVersions = [ver for (entityId, ver) in usedRefs if entityId == WHITELISTID]
    if not whitelistVersions:
        # The recorded provenance does not reference the whitelist, so there
        # is no old version to compare against; report the output as stale
        # instead of failing with an IndexError.
        return False

    usedVersions = set('%s.%s' % ref for ref in usedRefs if ref[0] != WHITELISTID)
    currentVersions = set('%s.%s' % (x.id, x.versionNumber) for x in files)
    # If the upstream data files changed, the output is stale.
    if currentVersions != usedVersions:
        return False
    # Otherwise check whether the whitelisting differs for this platform.
    return getChangeSet(whitelistVersions[0], platform) == toRemove
def getChangeSet(version, platform):
    """Return the barcodes a given whitelist version excluded for one platform.

    Arguments:
    - `version`: version number of the whitelist entity (WHITELISTID) to load.
    - `platform`: value matched against the whitelist's `platform` column.

    Returns the set of `aliquot_barcode` values taken from the rows flagged
    `Do_not_use` for that platform.
    """
    oldWhitelistEntity = syn.get(WHITELISTID, version=version)
    # Read the file belonging to the requested version. Reading the
    # module-level whitelistEntity here would compare the current whitelist
    # with itself and hide every whitelist change.
    oldWhitelist = pd.read_csv(oldWhitelistEntity.path, sep='\t')
    flagged = oldWhitelist.Do_not_use & (oldWhitelist.platform == platform)
    return set(oldWhitelist.loc[flagged, 'aliquot_barcode'])
#mp = Pool(8)
# Log in once; the resulting `syn` client is the module-level object used by
# the helper functions above and by the processing loop below.
syn = synapseclient.login(silent=True)
# Fetch the whitelist entity (no explicit version requested) and load its
# tab-separated file into a DataFrame.
whitelistEntity = syn.get(WHITELISTID)
whitelist = pd.read_csv(whitelistEntity.path, sep='\t')
# Result of QUERY_STR as a DataFrame; the loop below iterates it row by row
# and reads the id, name, platform and fileType fields.
inputFiles = synapseHelpers.query2df(syn.chunkedQuery(QUERY_STR))
# Passed as `executed=` when storing results.
# NOTE(review): presumably a reference to this script stored in Synapse under
# 'syn1774100' - confirm against synapseHelpers.thisCodeInSynapse.
code=synapseHelpers.thisCodeInSynapse(parentId='syn1774100')
for i, row in inputFiles.iterrows():
print row.id, row['name'],
inputFileEntity = syn.get(row.id)
outFileName = row['name'][:-4]+'_whitelisted'+row['name'][-4:]
toRemove = set(whitelist.ix[whitelist.Do_not_use & (whitelist.platform == row['platform']),
'aliquot_barcode'])
if isUptodate(outFileName, [inputFileEntity], toRemove, row['platform']):
print ' is up to date - but update provenance'
e = syn.get(getFileIdFromName(outFileName), downloadFile=False)
syn.store(e, used=[inputFileEntity, whitelistEntity], executed=code)
continue
if row.fileType =='bed5': #Do the filtering for bed files
df = pd.read_csv(inputFileEntity.path, sep='\t')
print df.shape,
idx = ~df.Sample.isin(toRemove)
df = df[idx]
print '->', df.shape
df.to_csv('/gluster/home/lomberg/tcgaImport/out/'+outFileName, sep='\t', index=False)
nFeatures = 0
nSamples = len(set(df.Sample))
else: #All other fileTypes
df = pd.read_csv(inputFileEntity.path, sep='\t', index_col=0)
print df.shape,
colsToKeep = [col for col in df.columns if (col.startswith('TCGA') and
(col.split('.')[0] not in toRemove) and
('.' not in col))]
df = df.ix[:, colsToKeep]
print '->', df.shape
df.to_csv('/gluster/home/lomberg/tcgaImport/out/'+outFileName, sep='\t')
nFeatures , nSamples = df.shape
annots = syn.getAnnotations(row.id)
del annots['etag']
del annots['uri']
del annots['id']
annots['nSamples'] = nSamples
annots['nFeatures'] = nFeatures
f = File('/gluster/home/lomberg/tcgaImport/out/'+outFileName, parent='syn4557014', annotations=annots)
f = syn.store(f, used=[inputFileEntity, whitelistEntity], executed=code)