browser.get('...')

### NCG6 (http://ncg.kcl.ac.uk/download.php)
NCG6_cancergenes (supporting literature)<br>
NCG6_strong_candidates_more_than_one_mutational_screening<br>
NCG6_strong_candidates_with_140_donors<br>
NCG6_systemslevelproperties<br>
NCG6_systemslevelproperties_including_expression<br>
NCG6_tsgoncogene<br>
 
 
### COSMIC (https://cancer.sanger.ac.uk/cosmic/download)
CosmicStructExport<br>
CosmicBreakpointsExport<br>
CosmicCompleteCNA<br>
CosmicCompleteDifferentialMethylation<br>
CosmicCompleteGeneExpression<br>
CosmicCompleteTargetedScreensMutantExport<br>
CosmicFusionExport<br>
CosmicGenomeScreensMutantExport<br>
CosmicMutantExport<br>
CosmicMutantExportCensus<br>
CosmicMutationTracking<br>
CosmicNCV<br>
CosmicSample<br>
classification<br>
cancer_gene_census<br>
<br>





In [1]:
import pandas as pd
import numpy as np
import os
import glob
import psycopg2
from io import StringIO
import urllib.parse
import urllib.request


In [2]:
class Browser:
    
    def __init__(self):
        pass
        
    def get(self, filename, names=None, dtype=None):
        self.data = {}
        data_folders = glob.glob('data/*')
        # iterating over all folders
        for data_folder in data_folders:
            # iterating over all files
            for file in os.listdir(data_folder):
                name = file.split('.')[0]
                # file is file we are searching for
                if name == filename:
                    file_path = os.path.join(data_folder, file)
                    file_path_abs = os.path.abspath(file_path)

                    # comma sep is default
                    sep = ','

                    # all tab separated files
                    if file[-3:] == 'tsv' or file[-7:] == 'tab.txt' or file[-3:] == 'tab':
                        sep = '\t'

                    return pd.read_csv(file_path_abs, sep=sep, names=names, dtype=dtype)
                    
                    

In [3]:
browser = Browser()

In [4]:
browser.get('NCG6_cancergenes').head()


Unnamed: 0,entrez,symbol,pubmed_id,type,primary_site,cancer_type,method
0,25,ABL1,29625053,WES,multiple,pan-cancer_adult,PanSoftWare
1,25,ABL1,14993899,Known Cancer,multiple,-,-
2,25,ABL1,29489755,WGS-WES-RNAseq,multiple,pan-cancer_paediatric,"GRIN, MutSig"
3,25,ABL1,23539594,Known Cancer,multiple,-,-
4,27,ABL2,14993899,Known Cancer,multiple,-,-


first we work only with the NCG6 dataset
we want to find a general format which can later be picked up by a dataloader to dynmaically fill the database

generic file format:
- gene_name
- cancer type

ADDITIONAL: Dataset of origin needs top be passed as an argument

## PROCESS NCG6 DATA

NOTE: 2372 unique cancer-genes are in total in the dataset, only 2088 have a cancer type assigned

In [5]:
"""
df = browser.get('NCG6_cancergenes')[['symbol', 'cancer_type', 'entrez']]
print(len(df['symbol'].unique()))
print(len(df[df.cancer_type != '-']['symbol'].unique()))

df = df.drop_duplicates()

# add the complete NCG6 dataset
dbc.add_gene_dataset(
    dataset_name='NCG6', 
    cancer_genes=df['symbol'].to_list(), 
    cancer_types=df['cancer_type'].to_list(), 
    entrez_ids=df['entrez'].to_list()
)
"""

"\ndf = browser.get('NCG6_cancergenes')[['symbol', 'cancer_type', 'entrez']]\nprint(len(df['symbol'].unique()))\nprint(len(df[df.cancer_type != '-']['symbol'].unique()))\n\ndf = df.drop_duplicates()\n\n# add the complete NCG6 dataset\ndbc.add_gene_dataset(\n    dataset_name='NCG6', \n    cancer_genes=df['symbol'].to_list(), \n    cancer_types=df['cancer_type'].to_list(), \n    entrez_ids=df['entrez'].to_list()\n)\n"

In [6]:
df = browser.get('NCG6_cancergenes')
df[df['symbol'] == 'RALGDS']

Unnamed: 0,entrez,symbol,pubmed_id,type,primary_site,cancer_type,method
3090,5900,RALGDS,14993899,Known Cancer,multiple,-,-


## PROCESS COSMIC DATA

In [7]:

df = browser.get('cancer_gene_census')
df['Tumour Types'] = df['Tumour Types(Somatic)'].astype(str) + ', ' + df['Tumour Types(Germline)'].astype(str)
# convert 'Tumour Types(Somatic)' into list
df['Tumour Types'] = df['Tumour Types'].astype(str)

df['Tumour Types'] = df['Tumour Types'].apply(lambda x: x.split(', '))

df = df.explode('Tumour Types')
df['Tumour Types'] = df['Tumour Types'].astype(str).replace('nan', np.nan)

df = df.dropna(subset=['Tumour Types'])
#df['Tumour Types'] = df['Tumour Types'].apply(lambda x: x.replace(' ', '_'))

# MALAT1: 378938
# DUX4L1: 22947
# HMGN2P46: 283651
# MDS2: 259283

fill_in_gene_ids = [
    ('MALAT1', 378938), 
    ('DUX4L1', 22947), 
    ('HMGN2P46', 283651), 
    ('MDS2', 259283),
    ('EIF1AX', 1964)]

for gene, entrez_id in fill_in_gene_ids:
    index = df[df['Gene Symbol'] == gene].index[0]
    df.at[index, 'Entrez GeneId'] = entrez_id


df = df[['Gene Symbol', 'Entrez GeneId',  'Tumour Types', 'Entrez GeneId']]
df 




Unnamed: 0,Gene Symbol,Entrez GeneId,Tumour Types,Entrez GeneId.1
0,A1CF,29974.0,melanoma,29974.0
1,ABI1,10006.0,AML,10006.0
2,ABL1,25.0,CML,25.0
2,ABL1,25.0,ALL,25.0
2,ABL1,25.0,T-ALL,25.0
...,...,...,...,...
721,ZNRF3,84133.0,colorectal cancer,84133.0
721,ZNRF3,84133.0,adrenocortical carcinoma,84133.0
721,ZNRF3,84133.0,gastric cancer,84133.0
722,ZRSR2,8233.0,MDS,8233.0


In [8]:
df.drop_duplicates()

Unnamed: 0,Gene Symbol,Entrez GeneId,Tumour Types,Entrez GeneId.1
0,A1CF,29974.0,melanoma,29974.0
1,ABI1,10006.0,AML,10006.0
2,ABL1,25.0,CML,25.0
2,ABL1,25.0,ALL,25.0
2,ABL1,25.0,T-ALL,25.0
...,...,...,...,...
721,ZNRF3,84133.0,colorectal cancer,84133.0
721,ZNRF3,84133.0,adrenocortical carcinoma,84133.0
721,ZNRF3,84133.0,gastric cancer,84133.0
722,ZRSR2,8233.0,MDS,8233.0


In [9]:
df[df['Gene Symbol'] == 'EIF1AX']

Unnamed: 0,Gene Symbol,Entrez GeneId,Tumour Types,Entrez GeneId.1
191,EIF1AX,1964.0,uveal melanoma,1964.0
191,EIF1AX,1964.0,thyroid cancer (PDTC and ATC),1964.0
191,EIF1AX,1964.0,low grade serous ovarian cancer,1964.0


## PROCESS BIOGRID DATA

### Drugs

In [10]:

df = browser.get('BIOGRID-CHEMICALS-3').drop_duplicates(subset=['Chemical Name'])

df = df[
    [
    'Chemical Name', 
     'Action',
     'Chemical Synonyms', 
     'Molecular Formula', 
     'Chemical Type'
    ]
]

df = df.rename(columns={'Chemical Name': 'name',
                        'Action': 'action', 
                        'Chemical Synonyms': 'synonym', 
                        'Molecular Formula': 'molecular_formula', 
                        'Chemical Type': 'type'})

df = df.replace('unknown', None)
drugs = df
df


Unnamed: 0,name,action,synonym,molecular_formula,type
0,Lepirudin,inhibitor,Lepirudin recombinant|Hirudin variant-1,C287H440N80O110S6,biologic
6,Cetuximab,inhibitor,Immunoglobulin G 1 (human-mouse monoclonal C 2...,C6484H10042N1732O2023S36,biologic
34,Denileukin diftitox,agonist,Diphtheria toxin precursor|DT|NAD(+--diphthami...,C2560H4042N678O799S17,biologic
49,Etanercept,agonist,p75|CD120b|Tumor necrosis factor receptor 2|TN...,C2224H3475N621O698S36,biologic
85,Bivalirudin,inhibitor,-,C98H138N24O33,small molecule
...,...,...,...,...,...
28087,Tubastatin A,inhibitor,"N-hydroxy-4-((2-methyl-3,4-dihydro-1H-pyrido[4...",C20H21N3O2,small molecule
28088,Marizomib,inhibitor,"(1R,4R,5S)-4-(2-chloroethyl)-1-[(S)-(1S)-cyclo...",C15H20ClNO4,small molecule
28089,Ixazomib,inhibitor,-,C14H19BCl2N2O4,small molecule
28090,Oprozomib,inhibitor,ONX 0912|MZ37792Y8J|9597|Oprozomib|D-erythro-3...,C25H32N4O7S,small molecule


In [11]:
len(set(df['name'].to_list()))

5173

### TARGET TARGET INTERACTIONS

In [12]:
# target_target_interactions

df = browser.get('BIOGRID-ORGANISM-Homo_sapiens-3')[
    ['#ID Interactor A', 
     'ID Interactor B', 
     'Interaction Detection Method', 
     'Publication Identifiers', 
     'Interaction Types', 
     'Confidence Values']]

def parse_interactor(x):
    # format entrez gene/locuslink:6416
    # wanted: 6416
    return x.split(':')[-1]

def parse_publication_identifiers(x):
    # format: pubmed:9006895
    # wanted: 9006895
    return x.split(':')[-1]

def parse_interaction_detection_method_get_id(x):
    # format: psi-mi:"MI:0018"(two hybrid)
    # wanted: MI:0018
    return x.split('"')[1]

def parse_interaction_detection_method_get_name(x):
    # format: psi-mi:"MI:0018"(two hybrid)
    # wanted: two hybrid
    return x.split('(')[1][:-1]

def parse_interaction_types_get_id(x):
    # format: psi-mi:"MI:0407"(direct interaction)
    # wanted: MI:0407
    return x.split('"')[1]

def parse_interaction_types_get_name(x):
    # format: psi-mi:"MI:0407"(direct interaction)
    # wanted: direct interaction
    return x.split('(')[1][:-1]

def parse_confidence_value(x):
    # format: score:7.732982515 or '-'
    # wanted: 7.732982515 or None
    if x == '-':
        return '-'
    else:
        return x.split(':')[1]

df['#ID Interactor A'] = df['#ID Interactor A'].map(parse_interactor)
df['ID Interactor B'] = df['ID Interactor B'].map(parse_interactor)

df['Interaction Detection Method ID'] = df['Interaction Detection Method'].map(parse_interaction_detection_method_get_id)
df['Interaction Detection Method Name'] = df['Interaction Detection Method'].map(parse_interaction_detection_method_get_name)
df = df.drop('Interaction Detection Method', axis=1)

df['Publication Identifiers'] = df['Publication Identifiers'].map(parse_publication_identifiers)

df['Interaction Types ID'] = df['Interaction Types'].map(parse_interaction_types_get_id)
df['Interaction Types Name'] = df['Interaction Types'].map(parse_interaction_types_get_name)
df = df.drop('Interaction Types', axis=1)

df['Confidence Values'] = df['Confidence Values'].map(parse_confidence_value)

df = df.rename(
    columns=
    {'#ID Interactor A': 'entrez_a', 
     'ID Interactor B': 'entrez_b', 
     'Publication Identifiers': 'pubmed_id', 
     'Confidence Values': 'confidence_value', 
     'Interaction Detection Method ID': 'detection_method_psi_mi', 
     'Interaction Detection Method Name': 'detection_method_name', 
     'Interaction Types ID': 'type_psi_mi', 
     'Interaction Types Name': 'type_name'})

df.head()

Unnamed: 0,entrez_a,entrez_b,pubmed_id,confidence_value,detection_method_psi_mi,detection_method_name,type_psi_mi,type_name
0,6416,2318,9006895,-,MI:0018,two hybrid,MI:0407,direct interaction
1,84665,88,11309420,-,MI:0018,two hybrid,MI:0407,direct interaction
2,90,2339,8599089,-,MI:0018,two hybrid,MI:0407,direct interaction
3,2624,5371,10938104,-,MI:0018,two hybrid,MI:0407,direct interaction
4,6118,6774,10875894,-,MI:0018,two hybrid,MI:0407,direct interaction


In [13]:
df['detection_method_psi_mi'].apply(len).max()

7

In [14]:
to_remove = ['P0DTC1', 'P0DTD2', 'Q7TLC7']
df[~df['entrez_a'].isin(to_remove)]['entrez_a'].astype(int)
df['entrez_b'].astype(int)

0          2318
1            88
2          2339
3          5371
4          6774
          ...  
613863     3827
613864      462
613865     5624
613866    10280
613867     4988
Name: entrez_b, Length: 613868, dtype: int32

### DRUG TARGET INTERACTIONS

In [15]:
df = browser.get('BIOGRID-CHEMICALS-3')

df = df[['Chemical Name', 'Entrez Gene ID', 'Pubmed ID']]
df = df.drop_duplicates()

df.head()

Unnamed: 0,Chemical Name,Entrez Gene ID,Pubmed ID
0,Lepirudin,2147,11055889
1,Lepirudin,2147,10912644
2,Lepirudin,2147,11807012
3,Lepirudin,2147,11467439
4,Lepirudin,2147,10505536


In [16]:
df.columns

Index(['Chemical Name', 'Entrez Gene ID', 'Pubmed ID'], dtype='object')

In [17]:
df

Unnamed: 0,Chemical Name,Entrez Gene ID,Pubmed ID
0,Lepirudin,2147,11055889
1,Lepirudin,2147,10912644
2,Lepirudin,2147,11807012
3,Lepirudin,2147,11467439
4,Lepirudin,2147,10505536
...,...,...,...
28088,Marizomib,5696,15916417
28089,Ixazomib,5696,20160034
28090,Oprozomib,5696,20110419
28091,Oprozomib,5693,20110419


### All Genes

In [18]:
df = browser.get('BIOGRID-ORGANISM-Homo_sapiens-3')[
            ['#ID Interactor A', 'ID Interactor B', 'Alt IDs Interactor A', 'Alt IDs Interactor B',
             'Aliases Interactor A', 'Aliases Interactor B']]


def parse_interactor(x):
    # format entrez gene/locuslink:6416
    # wanted: 6416
    return x.split(':')[-1]

    
def parse_gene_name(x):
    #'format: biogrid:112315|entrez gene/locuslink:MAP2K4
    # |uniprot/swiss-prot:P45985|refseq:NP_003001|refseq:NP_001268364'
    
    # wanted: MAP2K4
    
    #x1 = entrez gene/locuslink:MAP2K4
    x1 = x.split('|')[1]
    
    return x1.split(':')[1]
    
def parse_aliase_interactors(x):
    # format: entrez gene/locuslink:JNKK(gene name synonym)|
    # entrez gene/locuslink:JNKK1(gene name synonym)|
    # entrez gene/locuslink:MAPKK4(gene name synonym)|
    # entrez gene/locuslink:MEK4(gene name synonym)|
    # entrez gene/locuslink:MKK4(gene name synonym)|
    # entrez gene/locuslink:PRKMK4(gene name synonym)|
    # entrez gene/locuslink:SAPKK-1(gene name synonym)|
    # entrez gene/locuslink:SAPKK1(gene name synonym)|
    # entrez gene/locuslink:SEK1(gene name synonym)|
    # entrez gene/locuslink:SERK1(gene name synonym)|
    # entrez gene/locuslink:SKK1(gene name synonym)
    
    # wanted: JNKK,JNKK1,MAPKK4,...
    
    x_cut1 = x.replace('entrez gene/locuslink:', '')
    x_cut2 = x_cut1.replace('(gene name synonym)', '')
    x_cut3 = x_cut2.replace(' ', '')
    x_cut4 = x_cut3.replace('-', '')
    
    names = x_cut4.split('|')

    return ','.join(names)
    

df['#ID Interactor A'] = df['#ID Interactor A'].map(parse_interactor)
df['ID Interactor B'] = df['ID Interactor B'].map(parse_interactor)

df['Alt IDs Interactor A'] = df['Alt IDs Interactor A'].map(parse_gene_name)
df['Alt IDs Interactor B'] = df['Alt IDs Interactor B'].map(parse_gene_name)

df['Aliases Interactor A'] = df['Aliases Interactor A'].map(parse_aliase_interactors)
df['Aliases Interactor B'] = df['Aliases Interactor B'].map(parse_aliase_interactors)

df = df.rename(
    columns=
    {'#ID Interactor A': 'entrez_a',
     'ID Interactor B': 'entrez_b',
     'Alt IDs Interactor A': 'gene_name_a',
     'Alt IDs Interactor B': 'gene_name_b',
     'Aliases Interactor A': 'alias_gene_a',
     'Aliases Interactor B': 'alias_gene_b'
})

df_a = df[['entrez_a', 'gene_name_a', 'alias_gene_a']]
df_b = df[['entrez_b', 'gene_name_b', 'alias_gene_b']]

df_a = df_a.rename(
    columns=
        {'entrez_a': 'entrez_id',
        'gene_name_a': 'name',
        'alias_gene_a': 'alias'}
)

df_b = df_b.rename(
    columns=
        {'entrez_b': 'entrez_id',
        'gene_name_b': 'name',
        'alias_gene_b': 'alias'}
)

df = pd.concat([df_a, df_b])

# remove all dirty entries where entrez_id is not a real entrez_id
# entrez_ids are numbers    
df = df[df['entrez_id'].str.isnumeric()]


### Genes to Proteins

In [19]:
def uniprot_to_entrez(uniprot_ids: list):
    # make sure uniprot_ids are strings
    
    uniprot_ids = list(map(str, uniprot_ids))
    
    columns = [
        'UniProtKB-AC', 
        'UniProtKB-ID', 
        'GeneID (EntrezGene)',
        'RefSeq',
        'GI',
        'PDB',
        'GO',
        'UniRef100',
        'UniRef90',
        'UniRef50',
        'UniParc',
        'PIR',
        'NCBI-taxon',
        'MIM',
        'UniGene',
        'PubMed',
        'EMBL',
        'EMBL-CDS',
        'Ensembl',
        'Ensembl_TRS',
        'Ensembl_PRO',
        'Additional PubMed'
    ]

    df = browser.get('HUMAN_9606_idmapping_selected', names=columns, dtype='str')
    
    # get sub_df containing only rows with given uniprot_ids
    df_sub = df[df['UniProtKB-AC'].isin(uniprot_ids)]
    
    # create a lookup series with uniprot_id: entrez_id relations
    lookup_series = df_sub[['UniProtKB-AC', 'GeneID (EntrezGene)']].set_index('UniProtKB-AC')
    
    return lookup_series

def entrez_to_uniprot(entrez_ids: list):
    
    # make sure entrez_ids are strings
    
    entrez_ids = list(map(str, entrez_ids))
    
    columns = [
        'UniProtKB-AC', 
        'UniProtKB-ID', 
        'GeneID (EntrezGene)',
        'RefSeq',
        'GI',
        'PDB',
        'GO',
        'UniRef100',
        'UniRef90',
        'UniRef50',
        'UniParc',
        'PIR',
        'NCBI-taxon',
        'MIM',
        'UniGene',
        'PubMed',
        'EMBL',
        'EMBL-CDS',
        'Ensembl',
        'Ensembl_TRS',
        'Ensembl_PRO',
        'Additional PubMed'
    ]

    df = browser.get('HUMAN_9606_idmapping_selected', names=columns, dtype='str')
    
    # get sub_df containing only rows with given entrez_ids
    df_sub = df[df['GeneID (EntrezGene)'].isin(entrez_ids)]
    
    # create a lookup series with entrez_id: uniprot_id relations
    lookup_series = df_sub[['UniProtKB-AC', 'GeneID (EntrezGene)']].set_index('GeneID (EntrezGene)')
    
    return lookup_series

def uniprot_to_protein_names(uniprot_kb_list:list):
    
    # fetch all reviewed, human related entries from uniprot in tab format
    url = 'https://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606&format=tab'
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as f:
        response = f.read()
    data = response.decode('utf-8')

    data_string = StringIO(data)

    df = pd.read_csv(data_string, sep='\t', dtype='str')
    # columns are: Entry, Entry name, Status, Protein names, Gene names, Organism, Length
    
    # filter entries to only rows with uniprot_kbs from uniprot_kb_list 
    df_sub = df[df['Entry'].isin(uniprot_kb_list)]
    
    # create lookup_series with uniprot_kb: Protein_names relations
    lookup_series = df_sub[['Entry', 'Protein names']].set_index('Entry')
    
    return lookup_series
    
def entrez_to_protein(entrez_ids: list):

    # 1 read HUMAN_9606_idmapping_selected.tab to map entrez id to UniProtKB-ID
    entrez_to_uniprot_series = entrez_to_uniprot(entrez_ids)

    uniprot_ids = entrez_to_uniprot_series['UniProtKB-AC'].to_list()

    # 2 lookup uniprot_id to find protein names in data downloaded from uniprot
    uniprot_to_protein_series = uniprot_to_protein_names(uniprot_ids)

    # 3 check fo reach entrez id if uniprotkb exists and if so, if protein names exist, else return None
    
    # relations stores [entrez, uniprot_acs, protein_name] lists
    relations = []
    for entrez_id in entrez_ids:

        # make sure entrez_id is string
        entrez_id = str(entrez_id)

        # try to fetch uniprot id
        try:
            uniprot_acs = entrez_to_uniprot_series.loc[entrez_id]['UniProtKB-AC'].to_list()

            for uniprot_ac in uniprot_ids:

                # try to fetch protein names
                try:
                    protein_name = uniprot_to_protein_series.loc[uniprot_ac]['Protein names']
                    relations.append([entrez_id, uniprot_ac, protein_name])
                except KeyError:
                    relations.append([entrez_id, uniprot_ac, None])
                    pass
            
        except KeyError:
            relations.append([entrez_id, None, None])
            
    return pd.DataFrame(relations, columns=['entrez_id', 'uniprot_ac', 'protein'])
    




# COMMON

### protein-protein

In [20]:
browser.get('protein-protein-interaction')

Unnamed: 0,from_protein_ac,to_protein_ac
0,Q07687,P12956
1,Q9Y468,P28070
2,Q9Y468,Q96KQ7
3,Q9Y468,Q9NQR1
4,Q9Y468,Q9Y2X3
...,...,...
329210,Q9UBP5,O15162
329211,Q9UBP5,P06733
329212,Q9UBP5,Q9Y297
329213,Q9UBP5,Q14469


### drugs

In [21]:
df = browser.get('drug-intrial-link-file')
df.head()

Unnamed: 0,drug_id,name,drug_status,in_trial,in_literature,links
0,DB00001,Lepirudin,approved,False,False,
1,DB00002,Cetuximab,approved,False,False,
2,DB00004,Denileukin diftitox,approved,False,False,
3,DB00005,Etanercept,approved,False,False,
4,DB00006,Bivalirudin,approved,False,False,


In [22]:
df = browser.get('protein-names-file')

In [23]:
df['entrez_id'] = None
uniprot_to_entrez_series = uniprot_to_entrez(df['protein_ac'].to_list())
for index, protein_ac in df['protein_ac'].iteritems():
    try:
        entrez_id = uniprot_to_entrez_series.loc[protein_ac][0]
        if entrez_id is np.nan:
            continue
        # entrez id is given, set it
        df.at[index, 'entrez_id'] = entrez_id
    except KeyError:
        # lookup does not contain protein_ac
        continue

In [24]:
df

Unnamed: 0,protein_ac,gene_name,protein_name,entrez_id
0,Q07687,DLX2,Homeobox protein DLX-2,1746
1,Q9Y468,L3MBTL1,Lethal(3)malignant brain tumor-like protein 1,26013
2,Q5DJT8,CT45A2,Cancer/testis antigen family 45 member A2,102723680; 102723737; 728911
3,Q9NY99,SNTG2,Gamma-2-syntrophin,54221
4,P51508,ZNF81,Zinc finger protein 81,347344
...,...,...,...,...
17661,Q14498,RBM39,RNA-binding protein 39,9584
17662,Q6P4H8,ATPSCKMT,ATP synthase subunit C lysine N-methyltransferase,134145
17663,O14967,CLGN,Calmegin,1047
17664,P24530,EDNRB,Endothelin receptor type B,1910


In [25]:
df = browser.get('drug-protein-interaction')

df_proteins = browser.get('protein-names-file')


In [28]:
df_proteins

Unnamed: 0,protein_ac,gene_name,protein_name
0,Q07687,DLX2,Homeobox protein DLX-2
1,Q9Y468,L3MBTL1,Lethal(3)malignant brain tumor-like protein 1
2,Q5DJT8,CT45A2,Cancer/testis antigen family 45 member A2
3,Q9NY99,SNTG2,Gamma-2-syntrophin
4,P51508,ZNF81,Zinc finger protein 81
...,...,...,...
17661,Q14498,RBM39,RNA-binding protein 39
17662,Q6P4H8,ATPSCKMT,ATP synthase subunit C lysine N-methyltransferase
17663,O14967,CLGN,Calmegin
17664,P24530,EDNRB,Endothelin receptor type B


In [33]:
        columns = [
            'UniProtKB-AC',
            'UniProtKB-ID',
            'GeneID (EntrezGene)',
            'RefSeq',
            'GI',
            'PDB',
            'GO',
            'UniRef100',
            'UniRef90',
            'UniRef50',
            'UniParc',
            'PIR',
            'NCBI-taxon',
            'MIM',
            'UniGene',
            'PubMed',
            'EMBL',
            'EMBL-CDS',
            'Ensembl',
            'Ensembl_TRS',
            'Ensembl_PRO',
            'Additional PubMed'
        ]

        df = browser.get('HUMAN_9606_idmapping_selected', names=columns, dtype='str')

In [34]:
df

Unnamed: 0,UniProtKB-AC,UniProtKB-ID,GeneID (EntrezGene),RefSeq,GI,PDB,GO,UniRef100,UniRef90,UniRef50,...,NCBI-taxon,MIM,UniGene,PubMed,EMBL,EMBL-CDS,Ensembl,Ensembl_TRS,Ensembl_PRO,Additional PubMed
0,P31946,1433B_HUMAN,7529,NP_003395.1; NP_647539.1; XP_016883528.1,4507949; 377656701; 67464627; 377656702; 78101...,2BQ0:A; 2BQ0:B; 2C23:A; 4DNK:A; 4DNK:B; 5N10:A...,GO:0005737; GO:0005829; GO:0070062; GO:0005925...,UniRef100_P31946,UniRef90_P31946,UniRef50_P31946,...,9606,601289,,8515476; 14702039; 11780052; 15489334; 2357255...,X57346; AK292717; AL008725; CH471077; CH471077...,CAA40621.1; BAF85406.1; -; EAW75893.1; EAW7589...,ENSG00000166913,ENST00000353703; ENST00000372839,ENSP00000300161; ENSP00000361930,11996670; 12364343; 12437930; 12468542; 124825...
1,P62258,1433E_HUMAN,7531,NP_006752.1,30583161; 374074368; 984319; 62131678; 1943797...,2BR9:A; 3UAL:A; 3UBW:A; 6EIH:A,GO:0090724; GO:0005737; GO:0005829; GO:0070062...,UniRef100_P62258,UniRef90_P62258,UniRef50_P62258,...,9606,605066,,7644510; 8858348; 8684458; 20417184; 14702039;...,U20972; U54778; U43399; U43430; U28936; AB0171...,AAC50175.1; AAC50710.1; AAC50625.1; AAD00026.1...,ENSG00000108953; ENSG00000274474,ENST00000264335; ENST00000571732; ENST00000616...,ENSP00000264335; ENSP00000461762; ENSP00000481...,15838597; 11782387; 12657644; 14966136; 153649...
2,Q04917,1433F_HUMAN,7533,NP_003396.1,1345593; 4507951; 460779; 83754684; 83754685; ...,2C63:A; 2C63:B; 2C63:C; 2C63:D; 2C74:A; 2C74:B,GO:0005737; GO:0005829; GO:0070062; GO:0014704...,UniRef100_Q04917,UniRef90_Q04917,UniRef50_Q04917,...,9606,113508,,8218406; 1578511; 8561965; 8812417; 15461802; ...,L20422; X80536; X78138; X57345; D78577; S80794...,AAA35483.1; CAA56676.1; CAA55017.1; CAA40620.1...,ENSG00000128245,ENST00000248975,ENSP00000248975,10206237; 11121172; 11996670; 12176995; 124801...
3,P61981,1433G_HUMAN,7532,NP_036611.2,6016838; 21464101; 82407952; 380764684; 484287...,2B05:A; 2B05:B; 2B05:C; 2B05:D; 2B05:E; 2B05:F...,GO:0005829; GO:0070062; GO:0005925; GO:0016020...,UniRef100_P61981,UniRef90_P61981,UniRef50_P61981,...,9606,605356; 617665,,10433554; 10486217; 12853948; 15489334; 235725...,AF142498; AB024334; CR541904; CR541925; AC0063...,AAD48408.1; BAA85184.1; CAG46702.1; CAG46723.1...,ENSG00000170027,ENST00000307630,ENSP00000306330,11824616; 11996670; 12364343; 12482592; 150572...
4,P31947,1433S_HUMAN,2810,NP_006133.1,436408756; 306991738; 969812714; 350610438; 96...,1YWT:A; 1YWT:B; 1YZ5:A; 1YZ5:B; 3IQJ:A; 3IQU:A...,GO:0005829; GO:0070062; GO:0005615; GO:0005739...,UniRef100_P31947,UniRef90_P31947,UniRef50_P31947,...,9606,601290,,1390337; 8515476; 9659898; 16710414; 15489334;...,M93010; X57348; AF029081; AF029082; CR541905; ...,AAA59546.1; CAA40623.1; AAC52029.1; AAC52030.1...,ENSG00000175793,ENST00000339276,ENSP00000340989,10969776; 12582028; 12730237; 12787309; 145172...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192651,Q71VB8,Q71VB8_HUMAN,,,2576418,,,UniRef100_Q71VB8,,,...,9606,,,9380697,AF019742,AAB82538.1,,,,15948133; 17334644; 11748236; 11845324; 119153...
192652,A1YZL2,A1YZL2_HUMAN,,,120431537,,GO:0003964,UniRef100_A1YZL2,UniRef90_A1YZL2,UniRef50_A1YZL2,...,9606,,,,EF153386,ABM21729.1,,,,
192653,B7Z2R6,B7Z2R6_HUMAN,,,221040550,,,UniRef100_B7Z2R6,UniRef90_B7Z2R6,UniRef50_B7Z2R6,...,9606,,,,AK295023,BAH11952.1,,,,22356826; 23260140; 23362268; 25604084; 263582...
192654,D3IP05,D3IP05_HUMAN,,,284155470,,GO:0016021,UniRef100_D3IP05,UniRef90_C4MDQ5,UniRef50_C4MDQ5,...,9606,,,20017671,GQ162808,ADB78785.1,,,,16633140; 16638864; 16849011; 17178267; 172704...


In [35]:
df_proteins

Unnamed: 0,protein_ac,gene_name,protein_name
0,Q07687,DLX2,Homeobox protein DLX-2
1,Q9Y468,L3MBTL1,Lethal(3)malignant brain tumor-like protein 1
2,Q5DJT8,CT45A2,Cancer/testis antigen family 45 member A2
3,Q9NY99,SNTG2,Gamma-2-syntrophin
4,P51508,ZNF81,Zinc finger protein 81
...,...,...,...
17661,Q14498,RBM39,RNA-binding protein 39
17662,Q6P4H8,ATPSCKMT,ATP synthase subunit C lysine N-methyltransferase
17663,O14967,CLGN,Calmegin
17664,P24530,EDNRB,Endothelin receptor type B


In [36]:
df_proteins[df_proteins['protein_ac'].isin(df['UniProtKB-AC'])]

Unnamed: 0,protein_ac,gene_name,protein_name
0,Q07687,DLX2,Homeobox protein DLX-2
1,Q9Y468,L3MBTL1,Lethal(3)malignant brain tumor-like protein 1
2,Q5DJT8,CT45A2,Cancer/testis antigen family 45 member A2
3,Q9NY99,SNTG2,Gamma-2-syntrophin
4,P51508,ZNF81,Zinc finger protein 81
...,...,...,...
17661,Q14498,RBM39,RNA-binding protein 39
17662,Q6P4H8,ATPSCKMT,ATP synthase subunit C lysine N-methyltransferase
17663,O14967,CLGN,Calmegin
17664,P24530,EDNRB,Endothelin receptor type B


In [55]:
df = pd.read_csv('data/STRINGDB/9606.protein.links.v11.0.txt', sep=' ')
df_mapping = pd.read_csv('data/STRINGDB/human.entrez_2_string.2018.tsv', sep='\t')

string_to_entrez = df_mapping.set_index('STRING ')['entrez'].to_dict()
df['entrez_a'] = df['protein1'].map(string_to_entrez)
df['entrez_b'] = df['protein2'].map(string_to_entrez)

df = df.drop(columns=['protein1', 'protein2', 'combined_score'])

df = df.dropna()

df['entrez_a'] = df['entrez_a'].map(lambda x: x.split('|')[-1] if '|' in x else x)
df['entrez_b'] = df['entrez_b'].map(lambda x: x.split('|')[-1] if '|' in x else x)



In [56]:
df

Unnamed: 0,protein1,protein2,combined_score,entrez_a,entrez_b
1,9606.ENSP00000000233,9606.ENSP00000253401,198,381,23229
2,9606.ENSP00000000233,9606.ENSP00000401445,159,381,2081
3,9606.ENSP00000000233,9606.ENSP00000418915,606,381,1029
4,9606.ENSP00000000233,9606.ENSP00000327801,167,381,5034
5,9606.ENSP00000000233,9606.ENSP00000466298,267,381,8677
...,...,...,...,...,...
11759449,9606.ENSP00000485678,9606.ENSP00000310488,167,219952,125958
11759450,9606.ENSP00000485678,9606.ENSP00000342448,175,219952,390148
11759451,9606.ENSP00000485678,9606.ENSP00000350222,195,219952,390038
11759452,9606.ENSP00000485678,9606.ENSP00000367590,900,219952,51308


In [52]:
df[['entrez_a', 'entrez_b']].iloc[539317]

entrez_a    100533496|201158
entrez_b               57111
Name: 557367, dtype: object

In [47]:
pd.to_numeric(df['entrez_a'])

ValueError: Unable to parse string "100533496|201158" at position 539317

In [57]:
pd.to_numeric(df['entrez_b'])

1            23229
2             2081
3             1029
4             5034
5             8677
             ...  
11759449    125958
11759450    390148
11759451    390038
11759452     51308
11759453    390439
Name: entrez_b, Length: 11054730, dtype: int64

In [5]:
df = pd.read_csv('data/STRINGDB/9606.protein.links.full.v11.0.txt.gz', sep=' ')

In [6]:
df

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,9606.ENSP00000272298,0,0,0,332,0,0,62,0,181,0,0,0,125,490
1,9606.ENSP00000000233,9606.ENSP00000253401,0,0,0,0,0,0,0,0,186,0,0,0,56,198
2,9606.ENSP00000000233,9606.ENSP00000401445,0,0,0,0,0,0,0,0,160,0,0,0,0,159
3,9606.ENSP00000000233,9606.ENSP00000418915,0,0,0,0,0,0,61,0,158,0,0,542,0,606
4,9606.ENSP00000000233,9606.ENSP00000327801,0,0,0,0,0,69,61,0,78,0,0,0,89,167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11759449,9606.ENSP00000485678,9606.ENSP00000310488,0,0,0,0,898,168,0,0,0,0,0,0,0,167
11759450,9606.ENSP00000485678,9606.ENSP00000342448,0,0,0,0,892,175,0,0,0,0,0,0,0,175
11759451,9606.ENSP00000485678,9606.ENSP00000350222,0,0,0,0,773,195,0,0,0,0,0,0,0,195
11759452,9606.ENSP00000485678,9606.ENSP00000367590,0,0,0,0,0,0,0,0,0,900,0,0,0,900
