-
Notifications
You must be signed in to change notification settings - Fork 1
/
DIPscripts.py
118 lines (105 loc) · 4.95 KB
/
DIPscripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def DIPgetproteinIDs(DIPintfile):
    """Extract the protein identifiers found in a DIP interaction file.

    The file named by ``DIPintfile`` is read as tab-delimited text.  The
    first row is skipped as a header and the first two columns of every
    following row are collected.  The collected identifier strings are
    de-duplicated (first occurrence wins, order preserved), split on ``|``
    and each piece is sorted by the substring it contains: ``DIP``,
    ``uniprot`` or ``refseq`` (tested in that order; other pieces are
    dropped).

    Three files are written to the current working directory, one
    identifier per line:

    * ``IDs.DIP.txt``     -- the DIP pieces, unchanged
    * ``IDs.uniprot.txt`` -- the uniprot pieces, reduced to the text after
      the first ``:``
    * ``IDs.refseq.txt``  -- the refseq pieces, reduced the same way

    Blank rows are ignored and an empty input file yields empty outputs.

    Returns:
        list: the uniprot accessions, in the order written to
        ``IDs.uniprot.txt`` (needed by the next processing step).

    Raises:
        IndexError: if a uniprot or refseq piece contains no ``:``.
    """
    import csv
    from collections import OrderedDict

    # Read the two interactor columns; ``with`` guarantees the handle is
    # closed even if parsing fails.
    interactors = []
    with open(DIPintfile) as handle:
        rows = csv.reader(handle, delimiter="\t")
        # skip first line (header); the default keeps an empty file from
        # raising StopIteration
        next(rows, None)
        for row in rows:
            # row[:2] instead of row[0], row[1] so blank/short rows do not
            # raise IndexError
            interactors.extend(row[:2])

    # Remove duplicate entries while preserving first-seen order.
    IDstrings = list(OrderedDict.fromkeys(interactors))
    print("Number of proteins in DIP dataset is %i" % len(IDstrings))
    print("Example entries:")
    print(IDstrings[0:10])

    # Each entry may bundle several identifier types separated by `|`.
    # Split them up and sort the pieces into one list per identifier type;
    # only the strings themselves are available to decide the type.
    DIPIDs = []
    uniprotIDs = []
    refseqIDs = []
    for entry in IDstrings:
        for piece in entry.split("|"):
            if "DIP" in piece:
                DIPIDs.append(piece)
            elif "uniprot" in piece:
                uniprotIDs.append(piece)
            elif "refseq" in piece:
                refseqIDs.append(piece)

    # Drop the database prefix, keeping what follows the first colon.
    uniprotIDs = [piece.split(":")[1] for piece in uniprotIDs]
    refseqIDs = [piece.split(":")[1] for piece in refseqIDs]

    # Write the three files, one identifier per line with a uniform "\n"
    # terminator, closing each handle as soon as it is written.
    outputs = (
        ("IDs.DIP.txt", DIPIDs),
        ("IDs.uniprot.txt", uniprotIDs),
        ("IDs.refseq.txt", refseqIDs),
    )
    for path, identifiers in outputs:
        with open(path, "w") as out:
            out.writelines(identifier + "\n" for identifier in identifiers)

    return uniprotIDs
def gettotaluniprot(DIPmap, refseqmap, uniprotIDs):
    """Combine every known uniprot accession into one de-duplicated list.

    ``DIPmap`` and ``refseqmap`` are filenames of tab-delimited mapping
    tables (as downloaded from the uniprot ID-mapping service) whose first
    row is a header and whose second column holds the mapped uniprot
    accession.  ``uniprotIDs`` is an iterable of accessions that were
    already known.

    The accessions are concatenated in the order DIP-mapped, ``uniprotIDs``,
    refseq-mapped.  Anything from the first ``-`` onwards is sliced off each
    accession and duplicates are then removed, keeping the first occurrence.
    The result is written to ``total.uniprot.txt`` in the current working
    directory, one accession per line.

    Returns:
        list: the combined, de-duplicated accessions.
    """
    from collections import OrderedDict

    # The mapping tables can be longer than their de-duplicated contents:
    # several DIP IDs may map to the same uniprot ID, and one refseq ID may
    # map onto multiple uniprot IDs.
    dip_mapped = _unique_mapped_uniprot(DIPmap)
    refseq_mapped = _unique_mapped_uniprot(refseqmap)

    combined = dip_mapped + list(uniprotIDs) + refseq_mapped

    # Slice off anything extra (the "-N" suffix) *before* removing
    # duplicates; doing it afterwards lets "X" and "X-2" both survive as
    # two copies of "X".
    totaluniprot = list(
        OrderedDict.fromkeys(accession.split("-")[0] for accession in combined)
    )
    print("Length before removing duplicates was %i and after is now %i."
          % (len(combined), len(totaluniprot)))

    # Write the file, closing the handle deterministically.
    with open("total.uniprot.txt", "w") as out:
        out.writelines(accession + "\n" for accession in totaluniprot)

    return totaluniprot


def _unique_mapped_uniprot(map_path):
    """Return the de-duplicated second column of a tab-delimited mapping table."""
    import csv
    from collections import OrderedDict

    with open(map_path) as handle:
        rows = csv.reader(handle, delimiter="\t")
        # ignore first row (header); tolerate an empty file
        next(rows, None)
        # skip blank/short rows that have no second column
        targets = [row[1] for row in rows if len(row) > 1]

    print("Before removing duplicates length of uniprot is %i" % len(targets))
    unique = list(OrderedDict.fromkeys(targets))
    print("After removing duplicates length of uniprot is %i" % len(unique))
    return unique