In [None]:
import os
import re
import io
import sys
import glob
import json
import dask
import requests
import datetime
import urllib
import psycopg2
import numpy as np
import pandas as pd

import dask.diagnostics
import sqlalchemy as db
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 1

sys.path.append('../..')
%aimport opencell.database.operations
%aimport opencell.database.uniprot_utils
from opencell.database import models, operations, ms_utils, uniprot_utils
from opencell.database import utils as db_utils

In [None]:
# Credentials files for each of the available databases.
# Only one database is used per session; select it with DATABASE_NAME
# instead of overwriting `url` repeatedly (which also read every file needlessly).
CREDENTIALS_FILEPATHS = {
    'test': '../../db-credentials-test.json',
    'dev': '../../db-credentials-dev.json',
    'cap': '../../db-credentials-cap.json',
}

# 'cap' is the database that the original cell ended up connecting to
DATABASE_NAME = 'cap'
url = db_utils.url_from_credentials(CREDENTIALS_FILEPATHS[DATABASE_NAME])

engine = db.create_engine(url)
session_factory = db.orm.sessionmaker(bind=engine)
Session = db.orm.scoped_session(session_factory)

# display the engine's URL object rather than the raw `url`:
# its repr masks the password, so credentials do not end up in the saved cell output
engine.url

In [None]:
# spot-check the mygene-based uniprot_id -> ENSG ID helper on a single uniprot_id
# (the helper is defined in opencell.database.uniprot_utils; its return shape is not shown here)
uniprot_utils.map_uniprot_to_ensg_using_mygene('A0A0G2JHL0')

In [None]:
# spot-check the uniprot-based uniprot_id -> ENSG ID helper on a single uniprot_id
# (the helper is defined in opencell.database.uniprot_utils; its return shape is not shown here)
uniprot_utils.map_uniprot_to_ensg_using_uniprot('Q5T1J5')

### Map uniprot_ids to group_ids offline but using consensus uniprot metadata

In [None]:
# load the uniprot metadata using the project helper;
# the cells below rely on its `uniprot_id`, `ensg_id` and `is_reference` columns
metadata = uniprot_utils.export_uniprot_metadata(engine)

In [None]:
# one row per (protein group, uniprot_id) pair:
# `unnest` flattens the array of uniprot_ids of each protein group
group_uniprot_ids_query = '''
    select id as protein_group_id, unnest(uniprot_ids) as uniprot_id
    from mass_spec_protein_group;
    '''
group_uniprot_ids = pd.read_sql(group_uniprot_ids_query, engine)

In [None]:
# drop isoform-specific uniprot_ids (e.g., 'P12345-2' becomes 'P12345')
# using the vectorized string accessor, which, unlike a per-element lambda,
# passes missing values through instead of raising
group_uniprot_ids['uniprot_id'] = group_uniprot_ids.uniprot_id.str.split('-').str[0]

In [None]:
# group_id - ensg_id associations:
# attach the metadata to each (group, uniprot_id) pair, then keep one row
# for each distinct (protein_group_id, ensg_id) combination
group_ensg_ids = (
    metadata.merge(group_uniprot_ids, how='inner', on='uniprot_id')
    .groupby(['protein_group_id', 'ensg_id'])
    .first()
    .reset_index()
)

In [None]:
# group_id - consensus_uniprot_id associations:
# for each (group, ensg_id) pair, look up the metadata rows flagged as the reference for that ensg_id
group_consensus_ids = group_ensg_ids[['protein_group_id', 'ensg_id']].merge(
    metadata.loc[metadata.is_reference], how='inner', on='ensg_id'
)

In [None]:
# compare the sizes of the (group, ensg_id) and (group, reference uniprot metadata) tables
group_ensg_ids.shape, group_consensus_ids.shape

### Inspect a given hit

In [None]:
# look up a single hit by its id; .one() raises unless exactly one row matches
hit_query = Session.query(models.MassSpecHit).filter(models.MassSpecHit.id == 890085)
hit = hit_query.one()
hit.as_dict()

In [None]:
# the crispr design of the cell line of the pulldown in which this hit was found
hit.pulldown.cell_line.crispr_design

In [None]:
# the uniprot_ids of the hit's protein group
hit.protein_group.uniprot_ids

In [None]:
# the uniprot metadata rows associated with the hit's protein group, as dicts
[d.as_dict() for d in hit.protein_group.uniprot_metadata]

### Inspecting the protein_group - crispr_design mapping

This mapping is done using ENSG IDs in the CLI method `database_cli.populate_protein_group_crispr_design_associations`

Here, we explore and sanity-check the results of this mapping. 

In [None]:
# all uniprot_ids from all groups (one row per group and uniprot_id)
d = pd.read_sql(
    'select id, unnest(uniprot_ids) as uniprot_id from mass_spec_protein_group',
    engine,
)

# unique uniprot_ids, after stripping the isoform suffix (the part after the first '-')
uniprot_ids_wo_isoform = d.uniprot_id.map(lambda uniprot_id: uniprot_id.split('-')[0])
all_uniprot_ids = uniprot_ids_wo_isoform.unique()
len(all_uniprot_ids)

In [None]:
# the uniprot_ids that already have a row in the uniprot_metadata table
# (query only the uniprot_id column, rather than loading a full ORM object
# for every row just to read one attribute)
existing_uniprot_ids = [
    row.uniprot_id for row in Session.query(models.UniprotMetadata.uniprot_id).all()
]

# the uniprot_ids that appear in protein groups but do not yet have metadata
new_uniprot_ids = set(all_uniprot_ids).difference(existing_uniprot_ids)
len(existing_uniprot_ids), len(new_uniprot_ids)

In [None]:
# all crispr designs whose uniprot_id has a row in uniprot_metadata, with that metadata attached
crispr_designs_query = 'select * from crispr_design inner join uniprot_metadata using (uniprot_id)'
crispr_designs = pd.read_sql(crispr_designs_query, engine)

In [None]:
# load all protein groups, eagerly joining the two relationships inspected below
# so that accessing them does not emit an additional query for each group
eager_loading_options = (
    db.orm.joinedload(models.MassSpecProteinGroup.uniprot_metadata),
    db.orm.joinedload(models.MassSpecProteinGroup.crispr_designs),
)
groups = Session.query(models.MassSpecProteinGroup).options(*eager_loading_options).all()
len(groups)

In [None]:
# the number of groups with a given number of crispr_designs
# (index: number of designs mapped to a group; value: number of such groups)
counts = list(map(lambda group: len(group.crispr_designs), groups))
num_groups_by_design_count = pd.Series(counts).value_counts()
num_groups_by_design_count

### Totally offline mapping of protein groups to targets

This is a hackish/offline implementation of this mapping, using ENSG IDs, with various sanity checks (e.g., protein groups with more than one ENSG ID, targets that don't map to any groups, targets without pulldowns, etc). 

In [None]:
def get_targets():
    '''
    Load all crispr designs (targets) together with their uniprot metadata.

    Uses the notebook-level `engine`. Because of the inner join, designs whose
    uniprot_id has no row in uniprot_metadata are not included.

    Returns a dataframe with the columns of both tables.
    '''
    query = 'select * from crispr_design inner join uniprot_metadata using (uniprot_id)'
    return pd.read_sql(query, engine)

In [None]:
def get_lines():
    '''
    Load all polyclonal cell lines with their crispr design and uniprot metadata.

    Uses the notebook-level `engine`. Both joins are left joins, so lines without
    a crispr design (or without uniprot metadata) are kept, with nulls in those columns.

    Returns a dataframe with the columns of all three tables.
    '''
    query = '''
        select * from cell_line 
        left join crispr_design on crispr_design.id = cell_line.crispr_design_id
        left join uniprot_metadata using (uniprot_id)
        where line_type = 'POLYCLONAL'
        '''
    return pd.read_sql(query, engine)

In [None]:
# load the full uniprot_metadata table once, at the notebook level;
# get_groups reads this dataframe as a global instead of querying it on every call
umd = pd.read_sql('select * from uniprot_metadata', engine)
def get_groups():
    '''
    Load all mass-spec protein groups and append an `ensg_ids` column.

    For each group, `ensg_ids` is the list of unique ENSG IDs of the uniprot metadata
    rows whose uniprot_id matches one of the group's uniprot_ids, after dropping
    the isoform suffix (the part after the first '-'). The list is empty
    if none of the group's uniprot_ids has metadata with an ENSG ID.

    Uses the notebook-level `umd` dataframe and `engine`.
    '''
    # only the metadata rows that have an ENSG ID can contribute to the mapping
    metadata_with_ensg = umd.loc[umd.ensg_id.notna()]

    def lookup_ensg_ids(uniprot_ids):
        # the isoform-independent uniprot_ids of one protein group
        base_uniprot_ids = {uniprot_id.split('-')[0] for uniprot_id in uniprot_ids}
        is_in_group = metadata_with_ensg.uniprot_id.isin(base_uniprot_ids)
        return list(metadata_with_ensg.loc[is_in_group].ensg_id.unique())

    groups = pd.read_sql('''select * from mass_spec_protein_group''', engine)

    # append the list of ENSG IDs to each protein group using the uniprot metadata
    groups['ensg_ids'] = groups.uniprot_ids.apply(lookup_ensg_ids)
    return groups

In [None]:
# load the three tables used by the offline mapping below
# (note: `groups` is now a dataframe, replacing the list of ORM objects
# that was bound to the same name in the previous section)
targets = get_targets()
lines = get_lines()
groups = get_groups()

In [None]:
# the number of targets whose uniprot_ids did not map to an ensg_id
int(targets.ensg_id.isna().sum())

In [None]:
# split the space-separated string of gene names of each target into a list
# (note that, despite its name, the `gene_name` column holds lists of names)
targets['gene_name'] = targets.gene_names.map(lambda gene_names: gene_names.split(' '))

# the unique target names
target_names = set(targets.target_name.values)
targets.shape, lines.shape

In [None]:
# shapes of: all groups, groups without any ENSG IDs, and groups with more than one ENSG ID
num_ensg_ids_per_group = groups.ensg_ids.apply(len)
(
    groups.shape,
    groups.loc[num_ensg_ids_per_group == 0].shape,
    groups.loc[num_ensg_ids_per_group > 1].shape,
)

In [None]:
# unique ensg_ids from all protein_groups
# (collected in a set, so that duplicates are actually dropped
# and the membership tests on it are constant-time)
all_ensg_ids = set()
for ensg_ids in groups.ensg_ids:
    if ensg_ids is not None:
        all_ensg_ids.update(ensg_ids)

In [None]:
# find targets that do not appear in any protein groups:
# flag each target by whether its ensg_id is among the ensg_ids of the protein groups
# (vectorized, instead of a row-by-row loop with one membership test per target;
# targets without an ensg_id are flagged False)
targets['in_ms'] = targets.ensg_id.isin(all_ensg_ids)

In [None]:
# the number of targets that do, and do not, appear in any protein group
targets.in_ms.sum(), (~targets.in_ms).sum()

In [None]:
# targets that do not have any pulldowns
#
# This returns one row per crispr design that has no pulldown for any of its cell lines.
# Joining crispr_design to cell_line and filtering with `cell_line.id not in (...)` would instead
# return one row per cell line, so it would (1) report a design as soon as any one of its lines
# lacks a pulldown, even if another line has one, (2) drop designs without any cell lines,
# and (3) return no rows at all if mass_spec_pulldown contained a null cell_line_id.
d = pd.read_sql(
    '''
    select cd.id as crispr_design_id, cd.target_name from crispr_design cd
    where not exists (
        select 1 from cell_line
        inner join mass_spec_pulldown on mass_spec_pulldown.cell_line_id = cell_line.id
        where cell_line.crispr_design_id = cd.id
    )
    ''',
    engine
)

# the ids of the crispr designs (targets) without pulldowns
targets_wo_ms = d.crispr_design_id.values
len(targets_wo_ms)

In [None]:
# the sorted ids of the targets that do not appear in any protein groups
ids_not_in_ms = targets.loc[~targets.in_ms].id.values
targets_wo_groups = np.array(sorted(ids_not_in_ms))
len(set(targets_wo_groups))

In [None]:
# the number of targets that are neither in any protein group nor have any pulldowns
len(set(targets_wo_groups).intersection(targets_wo_ms))

In [None]:
# targets not in any protein groups but that do have pulldowns
# (that is, the targets without groups, minus the targets without pulldowns)
targets_wo_groups_w_ms = set(targets_wo_groups) - set(targets_wo_ms)
len(targets_wo_groups_w_ms)

In [None]:
# the sorted names of these targets (those with pulldowns that are not in any protein groups)
has_ms_but_no_groups = targets.id.isin(targets_wo_groups_w_ms)
targets.loc[has_ms_but_no_groups].target_name.sort_values().values

In [None]:
# the ids of all targets without an ensg_id
targets_wo_ensg = targets.id[targets.ensg_id.isna()].values

# targets not in any protein groups that do have pulldowns but do not have ensg_ids
targets_wo_groups_w_ms_wo_ensg = set(targets_wo_groups_w_ms) & set(targets_wo_ensg)
len(targets_wo_ensg), len(targets_wo_groups_w_ms_wo_ensg)

In [None]:
# for reference:
# how to retrieve protein groups whose list of gene_names intersects a given list of names
gene_names = ['CAD', 'CALD1']

# the names as a postgres array literal (here, '{CAD,CALD1}')
gene_names_array_literal = '{%s}' % ','.join(gene_names)

# the result is deliberately not assigned to `groups`: that name is bound to the dataframe
# of protein groups returned by get_groups(), which the cells below still need
# (rebinding it to a list of ORM objects breaks them when the notebook is run top to bottom)
groups_matching_gene_names = (
    Session.query(models.MassSpecProteinGroup)
    .filter(models.MassSpecProteinGroup.gene_names.overlap(gene_names_array_literal))
    .all()
)

In [None]:
# map each group to one or more targets:
# a target matches a group if the target's ensg_id is among the group's ensg_ids;
# the two columns are left as None for the groups without any matching targets
groups['target_ids_by_ensg_id'] = None
groups['target_names_by_ensg_id'] = None

for group_ind, group_ensg_ids in groups.ensg_ids.items():
    if group_ensg_ids is None:
        continue

    targets_in_group = targets.loc[targets.ensg_id.isin(group_ensg_ids)]
    if len(targets_in_group) == 0:
        continue

    groups.at[group_ind, 'target_ids_by_ensg_id'] = list(targets_in_group.id.unique())
    groups.at[group_ind, 'target_names_by_ensg_id'] = list(targets_in_group.target_name.unique())

# the number of groups that were mapped to at least one target
groups.target_ids_by_ensg_id.notna().sum()

In [None]:
# groups that match to more than one target (there were 264 of these when this was written)
# (unmapped groups have None in this column and are excluded)
maps_to_multiple_targets = groups.target_ids_by_ensg_id.apply(
    lambda target_ids: bool(target_ids) and len(target_ids) > 1
)
groups.loc[maps_to_multiple_targets]

### Deprecated: Looking up uniprot gene names from ENST IDs using Uniprot mapper API

This requires two steps. For some reason, retrieving the gene name directly from the ENST ID (using `to=GENENAME`) does not work - no names are returned. Instead, we first look up the uniprot_id (what Uniprot calls 'ACC') and then from these ids, look up the gene name.

In [None]:
# reload the targets, discarding the columns added to `targets` in the previous section
targets = get_targets()

In [None]:
# all unique enst_ids (the non-null transcript_ids of the targets, in arbitrary order)
has_transcript_id = targets.transcript_id.notna()
all_enst_ids = list(set(targets.loc[has_transcript_id, 'transcript_id'].tolist()))
len(all_enst_ids), targets.shape

In [None]:
# first lookup: map the ENST IDs to uniprot_ids (which Uniprot calls 'ACC');
# the cells below rely on the result having 'ACC' and 'ENSEMBL_TRS_ID' columns
enst_to_uniprot = uniprot_utils.uniprot_id_mapper(
    all_enst_ids, input_type='ENSEMBL_TRS_ID', output_type='ACC'
)

In [None]:
# second lookup: look up uniprot gene names for all of the uniprot_ids
# (note: this rebinds `all_uniprot_ids`, which held the uniprot_ids
# of the protein groups earlier in the notebook)
all_uniprot_ids = enst_to_uniprot['ACC'].unique()
uniprot_to_name = uniprot_utils.uniprot_id_mapper(
    all_uniprot_ids, input_type='ACC', output_type='GENENAME'
)

In [None]:
# chain the two lookups on the uniprot_id ('ACC') to associate each ENST ID with a gene name
enst_to_name = enst_to_uniprot.merge(
    uniprot_to_name, how='inner', left_on='ACC', right_on='ACC'
)

In [None]:
# merge the uniprot gene names with the targets
# (a left merge, so the targets without a mapped ENST ID are kept, with missing values)
# NOTE(review): this cell rebinds `targets` to the merged result, so it is not idempotent;
# re-run the cell that reloads `targets` before running this one again
targets = targets.merge(enst_to_name, how='left', left_on='transcript_id', right_on='ENSEMBL_TRS_ID')

In [None]:
targets['GENENAME'] = targets.GENENAME.apply(lambda s: s.upper() if not pd.isna(s) else None)

In [None]:
# targets whose target names do not match the gene name extracted from the uniprotKB IDs
# (compared case-insensitively; the targets without a gene name are ignored)
has_gene_name = targets.GENENAME.notna()
name_differs = targets.GENENAME != targets.target_name.map(str.upper)

mismatched_targets = targets.loc[has_gene_name & name_differs, ['target_name', 'GENENAME', 'ACC']]
mismatched_targets.sort_values(by='target_name')