In [62]:
import pandas as pd
import os
import glob
import numpy as np
import statistics as stats
import scipy
import src.mpra_tools.predicted_occupancy as po
import src.mpra_tools.fasta_utils as fu
import math
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import random
from Ensembl_converter import EnsemblConverter
import requests, sys
from gseapy import Biomart

In [63]:
activity_df = pd.read_csv("Data/activity.csv", index_col=0)
retinopathy_df = pd.read_csv("Data/retinopathy.csv", index_col=0)
L = 164

In [101]:
motif_df = pd.read_parquet("Data/Motifs/summarized_all_motifs.parquet")
TFs = list(set(motif_df['motif']))

In [64]:
rna_seq = pd.read_csv("Data/RNA/WT2_run1/quant.sf", sep='\t', index_col=0)

In [65]:
rna_seq

Unnamed: 0_level_0,Length,EffectiveLength,TPM,NumReads
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUST00000193812.2,1070,896.000,0.000000,0.000
ENSMUST00000082908.3,110,15.000,0.000000,0.000
ENSMUST00000162897.2,4153,3914.249,0.381653,29.154
ENSMUST00000159265.2,2989,2762.517,0.272171,14.673
ENSMUST00000070533.5,3634,3350.511,0.736722,48.172
...,...,...,...,...
ENSMUST00000082419.1,519,338.164,2533.277917,16718.354
ENSMUST00000082420.1,69,10.000,25.620428,5.000
ENSMUST00000082421.1,1144,958.849,4937.545937,92394.303
ENSMUST00000082422.1,67,10.000,0.000000,0.000


In [111]:
expressed = rna_seq[rna_seq['TPM'] > 0.0]
expressed

Unnamed: 0_level_0,Length,EffectiveLength,TPM,NumReads
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUST00000162897.2,4153,3914.249,0.381653,29.154
ENSMUST00000159265.2,2989,2762.517,0.272171,14.673
ENSMUST00000070533.5,3634,3350.511,0.736722,48.172
ENSMUST00000194099.2,2309,2062.051,0.024849,1.000
ENSMUST00000192183.2,926,755.359,0.067836,1.000
...,...,...,...,...
ENSMUST00000082418.1,1824,1623.975,1714.731816,54344.945
ENSMUST00000082419.1,519,338.164,2533.277917,16718.354
ENSMUST00000082420.1,69,10.000,25.620428,5.000
ENSMUST00000082421.1,1144,958.849,4937.545937,92394.303


In [112]:
converter = EnsemblConverter(use_progress_bar=True)

In [113]:
ensembl_ids = [id.split('.')[0] for id in expressed.index.to_list()[:100]]
result = converter.convert_ids(ensembl_ids)

100%|██████████| 100/100 [01:10<00:00,  1.43it/s]


In [114]:
result

Unnamed: 0,ENSG,Symbol
0,ENSMUST00000162897,Xkr4-203
1,ENSMUST00000159265,Xkr4-202
2,ENSMUST00000070533,Xkr4-201
3,ENSMUST00000194099,Gm37686-201
4,ENSMUST00000192183,Gm7341-201
...,...,...
95,ENSMUST00000052843,Mcmdc2-201
96,ENSMUST00000171802,Mcmdc2-205
97,ENSMUST00000140948,Mcmdc2-204
98,ENSMUST00000182580,Snhg6-203


In [115]:
batch_size = 128
ensembl_ids = [id.split('.')[0] for id in expressed.index.to_list()]
batched_ids = []

while len(ensembl_ids) >= batch_size:
    batched_ids.append(ensembl_ids[:batch_size])
    del ensembl_ids[:batch_size]
batched_ids.append(ensembl_ids)


In [116]:

batched_jsons = [
    '{ "ids" : ["' + '", "'.join(id_batch) + '" ] }' for id_batch in batched_ids
]

In [117]:
batched_jsons[0]

'{ "ids" : ["ENSMUST00000162897", "ENSMUST00000159265", "ENSMUST00000070533", "ENSMUST00000194099", "ENSMUST00000192183", "ENSMUST00000192692", "ENSMUST00000192427", "ENSMUST00000208660", "ENSMUST00000208793", "ENSMUST00000027032", "ENSMUST00000194382", "ENSMUST00000182774", "ENSMUST00000193443", "ENSMUST00000193658", "ENSMUST00000130201", "ENSMUST00000156816", "ENSMUST00000045689", "ENSMUST00000115538", "ENSMUST00000192286", "ENSMUST00000146665", "ENSMUST00000195445", "ENSMUST00000134384", "ENSMUST00000027036", "ENSMUST00000115529", "ENSMUST00000141278", "ENSMUST00000155020", "ENSMUST00000081551", "ENSMUST00000165720", "ENSMUST00000078030", "ENSMUST00000194052", "ENSMUST00000002533", "ENSMUST00000118000", "ENSMUST00000147158", "ENSMUST00000156289", "ENSMUST00000133521", "ENSMUST00000168963", "ENSMUST00000194114", "ENSMUST00000144339", "ENSMUST00000169520", "ENSMUST00000192847", "ENSMUST00000044369", "ENSMUST00000194676", "ENSMUST00000194301", "ENSMUST00000194978", "ENSMUST00000192698"

In [118]:

server = "https://rest.ensembl.org/lookup/id"
headers={ "Content-Type" : "application/json", "Accept" : "application/json"}


batched_geneids = []

r = requests.post(server, headers=headers, data=batched_jsons[0])

if not r.ok:
  r.raise_for_status()
  sys.exit()
 
decoded = r.json()



In [119]:
len(batched_jsons)

512

In [120]:
gene_info = []
counter = 0
for batch in batched_jsons:
    r = requests.post(server, headers=headers, data=batch)

    if not r.ok:
        r.raise_for_status()
        sys.exit()
    
    decoded = r.json()
    
    gene_info.append(pd.DataFrame(decoded).T)
    counter +=1 
    print(counter)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [121]:
gene_info = pd.concat(gene_info)

In [110]:
gene_info = gene_info.dropna()
gene_info

Unnamed: 0,species,start,display_name,version,assembly_name,seq_region_name,object_type,end,length,is_canonical,db_type,id,source,biotype,Parent,strand,logic_name,TF_name
ENSMUST00000002533,mus_musculus,4979799,Rgs20-201,15,GRCm39,1,Transcript,5089762,1778,0,core,ENSMUST00000002533,ensembl_havana,protein_coding,ENSMUSG00000002459,-1,ensembl_havana_transcript_mus_musculus,Rgs20
ENSMUST00000027053,mus_musculus,16175998,Rdh10-201,8,GRCm39,1,Transcript,16203958,3587,1,core,ENSMUST00000027053,ensembl_havana,protein_coding,ENSMUSG00000025921,1,ensembl_havana_transcript_mus_musculus,Rdh10
ENSMUST00000186528,mus_musculus,10097423,Cops5-204,7,GRCm39,1,Transcript,10108166,962,0,core,ENSMUST00000186528,havana,nonsense_mediated_decay,ENSMUSG00000025917,-1,havana_mus_musculus,Cops5
ENSMUST00000128164,mus_musculus,10196209,Cspp1-207,2,GRCm39,1,Transcript,10200243,670,0,core,ENSMUST00000128164,havana,retained_intron,ENSMUSG00000056763,1,havana_mus_musculus,Cspp1
ENSMUST00000123261,mus_musculus,10157972,Cspp1-206,2,GRCm39,1,Transcript,10160542,647,0,core,ENSMUST00000123261,havana,retained_intron,ENSMUSG00000056763,1,havana_mus_musculus,Cspp1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUST00000091190,mus_musculus,1260771,Ddx3y-201,12,GRCm39,Y,Transcript,1286629,4600,1,core,ENSMUST00000091190,ensembl_havana,protein_coding,ENSMUSG00000069045,-1,ensembl_havana_transcript_mus_musculus,Ddx3y
ENSMUST00000167967,mus_musculus,90764326,Mid1-ps1-201,4,GRCm39,Y,Transcript,90774754,1248,1,core,ENSMUST00000167967,havana,unprocessed_pseudogene,ENSMUSG00000095134,1,havana_mus_musculus,Mid1-ps1
ENSMUST00000069309,mus_musculus,1097144,Uty-201,14,GRCm39,Y,Transcript,1245718,5208,1,core,ENSMUST00000069309,ensembl_havana,protein_coding,ENSMUSG00000068457,-1,ensembl_havana_transcript_mus_musculus,Uty
ENSMUST00000082418,mus_musculus,11742,mt-Nd5-201,1,GRCm39,MT,Transcript,13565,1824,1,core,ENSMUST00000082418,RefSeq,protein_coding,ENSMUSG00000064367,1,mt_genbank_import_mus_musculus,mt-Nd5


In [103]:
gene_info['TF_name'] = gene_info['display_name'].map(lambda x: '-'.join(x.split('-')[:-1]))





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_info['TF_name'] = gene_info['display_name'].map(lambda x: '-'.join(x.split('-')[:-1]))


In [109]:
genes = set(gene_info['TF_name'].values)

for g in genes:
    if 'AP2B' in g:
        print('NRL')


