In [310]:
from collections import Counter
from itertools import combinations, count, chain

import networkx
from networkx import connected_components
import numpy as np
import pandas as pd

In [21]:
ens_main = pd.read_csv('data/ens_main_int.csv', dtype={'chromosome_or_scaffold':'O', '5utr_start': 'Int64', '5utr_end': 'Int64', '3utr_start': 'Int64', '3utr_end': 'Int64', 'cds_start': 'Int64', 'cds_end': 'Int64'})
ens_exons = pd.read_csv('data/ens_exons_int.csv', dtype={'5utr_start': 'Int64', '5utr_end': 'Int64', '3utr_start': 'Int64', '3utr_end': 'Int64', 'cds_start': 'Int64', 'cds_end': 'Int64'})

In [22]:
ens_main = ens_main[ens_main['chromosome_or_scaffold'].isin([str(i) for i in list(range(1,23))] + ['X', 'Y'])]
ens_main = ens_main[ens_main['gene_type'] == 'protein_coding']
ens_main = ens_main[~ens_main['ensembl_gene_name'].isna()]
gene_size = ens_main.groupby('ensembl_gene_name').size()
gene_size = gene_size[gene_size > 1].index.to_list()
ens_main = ens_main[ens_main['ensembl_gene_name'].isin(gene_size)]
ens_min = ens_main[['ensembl_gene_name', 'ensembl_trs_id', 'trs_type', 'uniprot_isoform_id', 'uniprot_trembl_id', 'cds_start', 'cds_end']]

In [31]:
exons_int_safe = ens_exons.drop(['5utr_start', '5utr_end', '3utr_start', '3utr_end'], axis=1).dropna(subset=['cds_start', 'cds_end'])
exons_int_safe.loc[:, ['cds_start', 'cds_end']] = exons_int_safe.loc[:, ['cds_start', 'cds_end']].astype(int)

In [26]:
genes = ens_min['ensembl_gene_name'].unique()
len(genes)

16846

In [38]:
def connect_dups(dup_list):
    G = networkx.Graph()
    
    G.add_edges_from(dup_list)
    
    return [list(i) for i in connected_components(G)]

In [39]:
def find_dups(gene):
    v = ens_min[(ens_min['ensembl_gene_name'] == gene) & (ens_min['trs_type'] == 'protein_coding')]['ensembl_trs_id'].values
    cdsrg = []
    
    for trs in v:
        cdsrg.append(np.sort(exons_int_safe[exons_int_safe['ensembl_trs_id'] == trs].iloc[:, -2:].to_numpy().ravel()))
    
    hits = []
    
    for i in combinations(range(len(cdsrg)), 2):
        if np.array_equal(cdsrg[i[0]], cdsrg[i[1]]):
            hits.append(list(v[list(i)]))
            
    hits = connect_dups(hits)
    
    return hits

In [45]:
all_hits = pd.Series(index=genes, dtype='O')

for x, g in enumerate(genes):
    all_hits[g] = find_dups(g)
    print(f'Done {x+1} out of {len(genes)}        ', end='\r')

Done 16846 out of 16846        

In [47]:
all_hits = all_hits.explode()

In [58]:
all_hits = all_hits.dropna()

In [59]:
all_hits

USP9Y                     [ENST00000338981, ENST00000651177]
BPY2                      [ENST00000382585, ENST00000331070]
PCDH11Y                   [ENST00000333703, ENST00000622698]
BPY2B                     [ENST00000615850, ENST00000382392]
TGIF2LY                   [ENST00000559055, ENST00000321217]
                                 ...                        
CTPS1      [ENST00000649124, ENST00000470271, ENST0000037...
CTPS1                     [ENST00000696111, ENST00000696110]
AGT        [ENST00000681514, ENST00000366667, ENST0000068...
EIF3I      [ENST00000677701, ENST00000677711, ENST0000067...
UBAP2L                    [ENST00000361546, ENST00000428931]
Length: 7376, dtype: object

In [56]:
ens_min['_protein_id'] = ens_min['uniprot_isoform_id'].combine_first(ens_min['uniprot_trembl_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ens_min['_protein_id'] = ens_min['uniprot_isoform_id'].combine_first(ens_min['uniprot_trembl_id'])


In [120]:
ens_min.loc[0, '_id_warning'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [151]:
xc = count()

for j, i in enumerate(all_hits.values):
    v = ens_min.loc[i, :]
    try:
        if not np.all((v.iloc[:, -2].to_numpy()) == v.iloc[0,-2]):
            cons_id = pd.concat([v['uniprot_isoform_id'], v['uniprot_trembl_id']]).dropna().to_numpy()

            if len(cons_id) != 0:
                ens_min.loc[i, '_protein_id'] = cons_id[0]
                ens_min.loc[i, '_id_warning'] = '!conf'
            else:
                if v.iloc[:, -2].isna().all():
                    ens_min.loc[i, '_protein_id'] = '_' + str(next(xc))
                    ens_min.loc[i, '_id_warning'] = '!nan'
                else:
                    raise

        else:
            ens_min.loc[i, '_id_warning'] = 'dup'
    except TypeError:
        print(v)
        raise
    print(f'{j+1} / {len(all_hits)}         ', end='\r')
    #if ens_min[ens_min['ensembl_trs_id'] == i[0]]['_protein_id'].item() != np.nan:

7376 / 7376         

In [115]:
ens_min[ens_min['ensembl_trs_id'].isin(['ENST00000380152', 'ENST00000680887', 'ENST00000544455'])]

Unnamed: 0,ensembl_gene_name,ensembl_trs_id,trs_type,uniprot_isoform_id,uniprot_trembl_id,cds_start,cds_end,_protein_id,_id_warning
37721,BRCA2,ENST00000544455,protein_coding,P51587-1,,32316461,32398770,P51587-1,
37737,BRCA2,ENST00000380152,protein_coding,P51587-1,,32316461,32398770,P51587-1,
37742,BRCA2,ENST00000680887,protein_coding,,A0A7P0T9D7,32316461,32398770,A0A7P0T9D7,


In [152]:
ens_min[ens_min['ensembl_gene_name'] == 'OXGR1']

Unnamed: 0_level_0,ensembl_gene_name,trs_type,uniprot_isoform_id,uniprot_trembl_id,cds_start,cds_end,_protein_id,_id_warning
ensembl_trs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENST00000543457,OXGR1,protein_coding,Q96P68-1,B2R986,96986746,96987759,Q96P68-1,!conf
ENST00000541038,OXGR1,protein_coding,,"B2R986, F5H6U5",96986746,96987759,Q96P68-1,!conf
ENST00000541518,OXGR1,protein_coding,,"B2R986, F5H3P1",96986746,96987759,Q96P68-1,!conf
ENST00000298440,OXGR1,protein_coding,Q96P68-1,B2R986,96986746,96987759,Q96P68-1,!conf


In [156]:
ens_min['idd'] = np.arange(len(ens_min)) + 1000

In [157]:
ens_min['idd'] = ens_min['idd'].map(lambda x: '_' + str(x))

In [158]:
ens_min['pr2'] = ens_min['uniprot_isoform_id'].combine_first(ens_min['uniprot_trembl_id']).combine_first(ens_min['idd'])

In [138]:
v = ens_min.loc[['ENST00000541518', 'ENST00000543457', 'ENST00000541038', 'ENST00000298440'], :]

In [140]:
if not np.all((v.iloc[:, -2].to_numpy()) == v.iloc[0,-2]):
    cons_id = pd.concat([v['uniprot_isoform_id'], v['uniprot_trembl_id']]).dropna().to_numpy()

In [141]:
cons_id

array(['Q96P68-1', 'Q96P68-1', 'B2R986, F5H3P1', 'B2R986',
       'B2R986, F5H6U5', 'B2R986'], dtype=object)

In [104]:
next(xc)

1

In [137]:
all_hits.loc['OXGR1']

['ENST00000541518', 'ENST00000543457', 'ENST00000541038', 'ENST00000298440']

In [124]:
ens_min = ens_min.dropna(subset=['ensembl_trs_id'])

In [122]:
ens_min[ens_min['ensembl_trs_id'].isin(['ENST00000303728'])]['_id_warning'] = 'check'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [126]:
ens_min = ens_min.set_index('ensembl_trs_id', drop=True)

In [148]:
v.loc[:, '_protein_id'] = 'x'

In [160]:
ens_min['_protein_id'] = ens_min['_protein_id'].combine_first(ens_min['pr2'])

In [150]:
ens_min.loc[['ENST00000541518', 'ENST00000543457', 'ENST00000541038', 'ENST00000298440'], :]

Unnamed: 0_level_0,ensembl_gene_name,trs_type,uniprot_isoform_id,uniprot_trembl_id,cds_start,cds_end,_protein_id,_id_warning
ensembl_trs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENST00000541518,OXGR1,protein_coding,,"B2R986, F5H3P1",96986746,96987759,"B2R986, F5H3P1",
ENST00000543457,OXGR1,protein_coding,Q96P68-1,B2R986,96986746,96987759,Q96P68-1,
ENST00000541038,OXGR1,protein_coding,,"B2R986, F5H6U5",96986746,96987759,"B2R986, F5H6U5",
ENST00000298440,OXGR1,protein_coding,Q96P68-1,B2R986,96986746,96987759,Q96P68-1,


In [163]:
ens_min = ens_min.drop(['idd', 'pr2'], axis=1)

In [164]:
ens_min = ens_min.reset_index()

In [168]:
ens_min = ens_min.drop_duplicates('_protein_id')

In [167]:
all_hits

USP9Y                     [ENST00000338981, ENST00000651177]
BPY2                      [ENST00000382585, ENST00000331070]
PCDH11Y                   [ENST00000333703, ENST00000622698]
BPY2B                     [ENST00000615850, ENST00000382392]
TGIF2LY                   [ENST00000559055, ENST00000321217]
                                 ...                        
CTPS1      [ENST00000649124, ENST00000470271, ENST0000037...
CTPS1                     [ENST00000696111, ENST00000696110]
AGT        [ENST00000681514, ENST00000366667, ENST0000068...
EIF3I      [ENST00000677701, ENST00000677711, ENST0000067...
UBAP2L                    [ENST00000361546, ENST00000428931]
Length: 7376, dtype: object

In [174]:
ens_min[ens_min['ensembl_gene_name'] == 'UGP2']

Unnamed: 0,ensembl_trs_id,ensembl_gene_name,trs_type,uniprot_isoform_id,uniprot_trembl_id,cds_start,cds_end,_protein_id,_id_warning
110808,ENST00000466642,UGP2,nonsense_mediated_decay,,F8WC70,63856320.0,63857594.0,F8WC70,
110822,ENST00000484142,UGP2,protein_coding,,C9J6Q0,63842186.0,63857863.0,C9J6Q0,
110899,ENST00000482668,UGP2,protein_coding,,C9JTZ5,63856320.0,63882557.0,C9JTZ5,
110959,ENST00000467648,UGP2,protein_coding,Q16851-2,A0A140VKE1,63856320.0,63891227.0,Q16851-2,dup
110963,ENST00000497510,UGP2,nonsense_mediated_decay,,F2Z3H1,63856320.0,63862849.0,F2Z3H1,
110966,ENST00000483461,UGP2,retained_intron,,,,,_111966,
110967,ENST00000480679,UGP2,protein_coding,,C9J3M0,63856320.0,63856433.0,C9J3M0,
110969,ENST00000613823,UGP2,protein_coding,,A0A087WYS1,63856320.0,63891160.0,A0A087WYS1,
110971,ENST00000483108,UGP2,nonsense_mediated_decay,,F2Z3P4,63856320.0,63856820.0,F2Z3P4,
110977,ENST00000487469,UGP2,processed_transcript,,,,,_111977,


In [171]:
gene_size2 = ens_main[ens_main['trs_type'] == 'protein_coding'].groupby('ensembl_gene_name').size()
gene_size2 = gene_size2[gene_size2 > 1].index.to_list()

In [173]:
len(gene_size2)

15129

In [176]:
ens_min.to_csv('data/interim/ens_min_040722.csv', index=False)

In [180]:
gene_cds = pd.DataFrame(index=gene_size2, columns=['cds_arr', 'cds_unique', 'cds_mtx', 'cds_good', 'exons_mtx'])

In [181]:
gene_cds

Unnamed: 0,cds_arr,cds_unique,cds_mtx,cds_good,exons_mtx
A1BG,,,,,
A1CF,,,,,
A2M,,,,,
A2ML1,,,,,
A4GALT,,,,,
...,...,...,...,...,...
ZYG11A,,,,,
ZYG11B,,,,,
ZYX,,,,,
ZZEF1,,,,,


In [182]:
def pop_cds_arr(gene):
    v = ens_min[ens_min['ensembl_gene_name'] == gene]
    v = v[v['trs_type'] == 'protein_coding']

    return v.loc[:, ['cds_start', 'cds_end']].to_numpy()

In [184]:
def pop_cds_unique(cds_arr):
    return np.sort(pd.unique(cds_arr.ravel()))

In [192]:
def pop_cds_mtx(cds_arr):
    u = np.sort(pd.unique(cds_arr.ravel()))
    
    return (u == cds_arr[..., None]).any(1).astype(int)

In [186]:
gene_cds['cds_arr'] = gene_cds.index.map(pop_cds_arr)

In [188]:
gene_cds['cds_unique'] = gene_cds['cds_arr'].map(pop_cds_unique)

In [227]:
gene_cds['cds_mtx'] = gene_cds['cds_arr'].map(pop_cds_mtx)

In [198]:
x = gene_cds.loc['TEF', 'cds_mtx'].copy()

In [226]:
def pop_cds_good(cds_mtx):
    """At least two trs must have one 0 after substracting sum"""
    cds_mtx2 = cds_mtx - cds_mtx.sum(0)
    zeros = np.count_nonzero(cds_mtx2==0, axis=1)
    if len(zeros[zeros == 1]) >=2:
        return True
    else:
        return False

In [199]:
x -= x.sum(0)

In [210]:
np.unique(x, return_counts=True)

(array([-2, -1,  0]), array([ 1, 10,  4], dtype=int64))

In [216]:
np.count_nonzero(x==0, axis=1)

array([1, 1, 2], dtype=int64)

In [228]:
gene_cds['cds_good'] = gene_cds['cds_mtx'].map(pop_cds_good)

In [230]:
gene_cds2 = gene_cds[gene_cds['cds_good']]

In [231]:
gene_cds2

Unnamed: 0,cds_arr,cds_unique,cds_mtx,cds_good,exons_mtx
A2M,"[[9115764, 9067839], [9115764, 9110334]]","[9067839, 9110334, 9115764]","[[1, 0, 1], [0, 1, 1]]",True,
A2ML1,"[[8822652, 8875011], [8843236, 8875011], [8845...","[8822652, 8843236, 8845172, 8845300, 8846124, ...","[[1, 0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0, 1],...",True,
AAAS,"[[53314751, 53307713], [53321343, 53307713], [...","[53307713, 53308133, 53308960, 53314751, 53315...","[[1, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 1], [1, 0...",True,
AADAC,"[[151814163, 151828172], [151814163, 151824846]]","[151814163, 151824846, 151828172]","[[1, 0, 1], [1, 1, 0]]",True,
AAGAB,"[[67254559, 67202898], [67236408, 67202898], [...","[67202898, 67231897, 67236408, 67254559]","[[1, 0, 0, 1], [1, 0, 1, 0], [0, 1, 1, 0]]",True,
...,...,...,...,...,...
ZXDC,"[[126474959, 126438461], [126474959, 126462220]]","[126438461, 126462220, 126474959]","[[1, 0, 1], [0, 1, 1]]",True,
ZYG11A,"[[52860749, 52892957], [52842884, 52892957]]","[52842884, 52860749, 52892957]","[[0, 1, 1], [1, 0, 1]]",True,
ZYG11B,"[[52726654, 52821629], [52726654, 52805476]]","[52726654, 52805476, 52821629]","[[1, 0, 1], [1, 1, 0]]",True,
ZYX,"[[143381295, 143382925], [143381572, 143390682...","[143381295, 143381572, 143381575, 143382656, 1...","[[1, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, ...",True,


In [285]:
def pop_exons_mtx(gene, cds_unique):
    x = (ens_exons[ens_exons['ensembl_gene_name'] == gene]
         .groupby('ensembl_trs_id')[['exon_start', 'exon_end']]
         .apply(lambda z: [np.arange(a, b+1) for a, b in zip(z['exon_start'], z['exon_end'])]).apply(np.hstack))

    return np.vstack([np.isin(cds_unique, xi).astype(int) for xi in x]).sum(axis=0)

In [289]:
exons_mtx_l = []
for i,j in enumerate(gene_cds2.index.to_list()):
    exons_mtx_l.append(pop_exons_mtx(j, gene_cds2.loc[j, 'cds_unique']))
    print(f'{i+1} / {len(gene_cds2.index)}          ', end='\r')

9477 / 9477          

In [291]:
gene_cds2['exons_mtx'] = exons_mtx_l

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_cds2['exons_mtx'] = exons_mtx_l


In [294]:
#exons_mtx_l

In [295]:
def final(exons_mtx):
    if np.count_nonzero(exons_mtx == 1) > 1:
        return True
    else:
        return False

In [296]:
gene_cds2['f'] = gene_cds2['exons_mtx'].map(final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_cds2['f'] = gene_cds2['exons_mtx'].map(final)


In [299]:
gene_cds3 = gene_cds2[gene_cds2['f']]

In [342]:
def superfinal(gene):
    
    cds_mtx = gene_cds3.loc[gene, 'cds_mtx']
    cds_mtx2 = cds_mtx - cds_mtx.sum(0)
    zeros = np.count_nonzero(cds_mtx2==0, axis=1)
    
    s = list(np.where(zeros == 1)[0])
    solutions = []

    for indices in sorted(list(chain.from_iterable(combinations(s, r) for r in range(2, len(s)+1))), key=len, reverse=True):
        cds_sum_loc = cds_mtx[indices, :].sum(axis=0)
        c = Counter(cds_sum_loc)
        if c[len(indices)] == 1 and c[1] == len(indices):
            mask = np.where(cds_sum_loc >= 1)[0]
            if not (cds_sum_loc[mask] - gene_cds3.loc[gene, 'exons_mtx'][mask]).any():
                solutions.append(indices)
                
    solutions_ids = []
    if solutions:
        for i in solutions:
            v = ens_min[ens_min['ensembl_gene_name'] == gene]
            solutions_ids.append(v.iloc[[*i],:]['ensembl_trs_id'].to_list())
        return solutions_ids
    else:
        return np.nan

In [343]:
zz = []
for i, gene in enumerate(gene_cds3.index):
    zz.append(superfinal(gene))
    print(f'{i} / {len(gene_cds3.index)}       ', end='\r')

1584 / 1585       

In [344]:
gene_cds3['sol'] = zz

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene_cds3['sol'] = zz


In [346]:
gene_cds4 = gene_cds3[~gene_cds3['sol'].isna()]

In [352]:
gene_cds4.loc['ASB5']['cds_mtx']

array([[1, 0, 0, 1, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 1]])

In [353]:
gene_cds4.loc['ASB5']['exons_mtx']

array([3, 1, 1, 1, 2, 2])

In [355]:
ens_min[ens_min['ensembl_gene_name'] == 'ASB5']

Unnamed: 0,ensembl_trs_id,ensembl_gene_name,trs_type,uniprot_isoform_id,uniprot_trembl_id,cds_start,cds_end,_protein_id,_id_warning
60430,ENST00000296525,ASB5,protein_coding,Q8WWX0-1,Q5HYF3,176268913.0,176215727.0,Q8WWX0-1,
60434,ENST00000672074,ASB5,protein_coding,,A0A5F9ZHS2,176231163.0,176215727.0,A0A5F9ZHS2,
60439,ENST00000512254,ASB5,protein_coding,Q8WWX0-2,,176241462.0,176215727.0,Q8WWX0-2,
60444,ENST00000511879,ASB5,processed_transcript,,,,,_61444,
60446,ENST00000510578,ASB5,processed_transcript,,,,,_61446,
60453,ENST00000505299,ASB5,protein_coding,,D6R9Q2,176269049.0,176269108.0,D6R9Q2,


In [357]:
ens_exons[ens_exons['ensembl_trs_id'] == 'ENST00000296525']

Unnamed: 0,ensembl_gene_name,ensembl_trs_id,ensembl_exon_id,5utr_start,5utr_end,3utr_start,3utr_end,exon_start,exon_end,cds_start,cds_end
721031,ASB5,ENST00000296525,ENSE00001081670,176269109.0,176269222.0,,,176268913,176269222,176268913,176269108
721032,ASB5,ENST00000296525,ENSE00003603238,,,,,176225262,176225341,176225262,176225341
721033,ASB5,ENST00000296525,ENSE00003502184,,,,,176222313,176222420,176222313,176222420
721034,ASB5,ENST00000296525,ENSE00001081665,,,,,176221450,176221600,176221450,176221600
721035,ASB5,ENST00000296525,ENSE00001081666,,,,,176221155,176221289,176221155,176221289
721036,ASB5,ENST00000296525,ENSE00001254223,,,,,176216818,176217009,176216818,176217009
721037,ASB5,ENST00000296525,ENSE00001254259,,,176213673.0,176215599.0,176213673,176215727,176215600,176215727


In [300]:
cds_mtx = gene_cds2.loc['TEF', 'cds_mtx']
cds_mtx2 = cds_mtx - cds_mtx.sum(0)
zeros = np.count_nonzero(cds_mtx2==0, axis=1)

In [302]:
np.where(zeros == 1)

(array([0, 1], dtype=int64),)

In [311]:
list(chain.from_iterable(combinations([4,5], r) for r in range(2, len([4,5])+1)))

[(4, 5)]

In [263]:
pop_exons_mtx('TEF', gene_cds2.loc['TEF', 'cds_unique'])

array([1, 1, 1, 3, 2])

In [284]:
gene = 'A2M'

x = (ens_exons[ens_exons['ensembl_gene_name'] == gene]
.groupby('ensembl_trs_id')[['exon_start', 'exon_end']]
.apply(lambda x: [np.arange(a, b+1) for a, b in zip(x['exon_start'], x['exon_end'])]).apply(np.hstack))

np.vstack([np.isin(gene_cds2.loc['A2M', 'cds_unique'], xi).astype(int) for xi in x]).sum(axis=0)

array([3, 3, 5])

In [283]:
x

ensembl_trs_id
ENST00000318602    [9115764, 9115765, 9115766, 9115767, 9115768, ...
ENST00000404455    [9113360, 9113361, 9113362, 9113363, 9113364, ...
ENST00000462568    [9090356, 9090357, 9090358, 9090359, 9090360, ...
ENST00000467091    [9115764, 9115765, 9115766, 9115767, 9115768, ...
ENST00000472360    [9101447, 9101448, 9101449, 9101450, 9101451, ...
ENST00000495442    [9068740, 9068741, 9068742, 9068743, 9068744, ...
ENST00000495709    [9068740, 9068741, 9068742, 9068743, 9068744, ...
ENST00000497324    [9115764, 9115765, 9115766, 9115767, 9115768, ...
ENST00000539638    [9113360, 9113361, 9113362, 9113363, 9113364, ...
ENST00000542567    [9080274, 9080275, 9080276, 9080277, 9080278, ...
ENST00000543436    [9091201, 9091202, 9091203, 9091204, 9091205, ...
ENST00000545828    [9106236, 9106237, 9106238, 9106239, 9106240, ...
ENST00000546069    [9101144, 9101145, 9101146, 9101147, 9101148, ...
dtype: object

In [237]:
y

array([41367533, 41382045, 41382935, 41394274, 41395960], dtype=object)

In [256]:
np.vstack([np.isin(y, xi).astype(int) for xi in x]).sum(axis=0)

array([1, 1, 1, 3, 2])

In [255]:
[np.isin(y, xi).astype(int) for xi in x]

[array([0, 1, 0, 1, 1]), array([1, 0, 0, 1, 1]), array([0, 0, 1, 1, 0])]

In [246]:
y[..., None]

array([[41367533],
       [41382045],
       [41382935],
       [41394274],
       [41395960]], dtype=object)

In [251]:
np.isin(y, x[0]).astype(int)

array([0, 1, 0, 1, 1])

In [248]:
x[0]

array([41387351, 41387352, 41387353, ..., 41399324, 41399325, 41399326])

In [312]:
ens_min

Unnamed: 0,ensembl_trs_id,ensembl_gene_name,trs_type,uniprot_isoform_id,uniprot_trembl_id,cds_start,cds_end,_protein_id,_id_warning
0,ENST00000303728,PRY,protein_coding,O14603-1,A0A384MTZ8,22501565,22514070,O14603-1,
1,ENST00000477123,PRY,nonsense_mediated_decay,O14603-2,,22501565,22512668,O14603-2,
2,ENST00000457658,USP9Y,processed_transcript,,,,,_1002,
3,ENST00000440408,USP9Y,processed_transcript,,,,,_1003,
4,ENST00000651177,USP9Y,protein_coding,O00507-1,,12709448,12859416,O00507-1,dup
...,...,...,...,...,...,...,...,...,...
162358,ENST00000435087,HAX1,protein_coding,,Q5VYD6,154272724,154275701,Q5VYD6,
162359,ENST00000532105,HAX1,protein_coding,,E9PIQ7,154273842,154275701,E9PIQ7,
162360,ENST00000471326,HAX1,retained_intron,,,,,_163360,
162361,ENST00000492550,HAX1,retained_intron,,,,,_163361,
