# Protein Protein Interaction Data
**[Work in progress]**

This notebook downloads and standardizes viral-host protein data from IntAct for ingestion into the Knowledge Graph.

Data source: [IntAct](https://www.ebi.ac.uk/intact/query/pubid:IM-27814)

Authors: Kaushik Ganapathy, Eric Yu, Peter Rose (krganapa@ucsd.edu, ery010@ucsd.edu, pwrose@ucsd.edu)

In [1]:
import os
import re
import hashlib 

import pandas as pd
import numpy as np

from pathlib import Path
from Bio import SeqIO

pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [2]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-19636412-9e74-4bac-8a4c-c6c8b49bb9d3/installation-4.1.0/import


https://www.uniprot.org/help/uniprotkb_column_names
https://www.uniprot.org/uniprot/P0DTD1#PRO_0000449630

### Retrieve interaction data from IntAct

##### Query for interactions with SARS-CoV-2: "taxid:2697049"

In [3]:
data = pd.read_csv("https://www.ebi.ac.uk/intact/export?format=mitab_25&query=taxid%3A2697049&negative=false&spoke=false&ontology=false&sort=intact-miscore&asc=false", sep='\t')

In [4]:
print('Number of interactions:', data.shape[0])

Number of interactions: 2200


In [5]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s)
0,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,"psi-mi:""MI:0410""(3D electron microscopy)",Walls et al. (2020),pubmed:32155444|imex:IM-27846,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25495631|wwpdb:6vyb|wwpdb:6vxx|imex...,intact-miscore:0.74
1,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,"psi-mi:""MI:0114""(x-ray crystallography)",Xia et al. (2020),imex:IM-27873|pubmed:32231345,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25503580|wwpdb:6LXT|imex:IM-27873-1,intact-miscore:0.74
2,uniprotkb:P0DTD1-PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Kim. et al. (2020),pubmed:32304108|imex:IM-27884,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25504928|imex:IM-27884-1|wwpdb:6vww...,intact-miscore:0.44
3,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Viswanathan et al. (2020),pubmed:32511383|imex:IM-27944,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25589802|wwpdb:6wks|imex:IM-27944-1,intact-miscore:0.76
4,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:1247""(microscale thermophoresis)",Viswanathan et al. (2020),pubmed:32511383|imex:IM-27944,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25634975|imex:IM-27944-4,intact-miscore:0.76


### Process Data

In [6]:
data.rename(columns={'#ID(s) interactor A': 'interactorA', 'ID(s) interactor B': 'interactorB'}, inplace=True)

In [7]:
data = data.replace('uniprotkb','uniprot', regex=True)

#### Extract UniProt accession number and Uniprot protein id from interactor columns

In [8]:
# uniprot:P0DTD1-PRO_0000449619 -> P0DTD1-PRO_0000449619
data['id_a'] = data['interactorA'].str.replace('uniprot:', '')
data['id_b'] = data['interactorB'].str.replace('uniprot:', '')

# P0DTD1-PRO_0000449619 -> P0DTD1 (UniProt accession number)
data['accession_a'] = data['id_a'].str.split('-', expand=True)[0]
data['accession_b'] = data['id_b'].str.split('-', expand=True)[0]

# P0DTD1-PRO_0000449619 -> PRO_0000449619 (UniProt protein id)
data['pro_id_a'] = data['id_a'].str.split('-', expand=True)[1]
data['pro_id_b'] = data['id_b'].str.split('-', expand=True)[1]

# Add CURIE "uniprot.chain" as prefix (see https://registry.identifiers.org/registry/uniprot.chain)
data['pro_id_a'] = data['pro_id_a'].str.replace('PRO_', 'uniprot.chain:PRO_')
data['pro_id_b'] = data['pro_id_b'].str.replace('PRO_', 'uniprot.chain:PRO_')

data.fillna('', inplace=True)

#### Extract pubmed id
Example: imex:IM-27912|pubmed:32275855 -> 2275855

In [9]:
position_pattern = re.compile('pubmed:(\d*).')

def extract_pubmed_id(s):
    groups = position_pattern.search(s)
    if groups == None:
        return ''
    else:
        return groups.group(1)

In [10]:
data['pubmedId'] = data['Publication Identifier(s)'].apply(extract_pubmed_id)

#### Extract taxonomy id
Example: taxid:9606(human)|taxid:9606(Homo sapiens) -> 9606

In [11]:
position_pattern = re.compile('taxid:(\d*)\(')

def extract_tax_id(s):
    groups = position_pattern.search(s)
    if groups == None:
        return ''
    else:
        return groups.group(1)

In [12]:
data['taxonomy_id_a'] = data['Taxid interactor A'].apply(extract_tax_id)
data['taxonomy_id_b'] = data['Taxid interactor B'].apply(extract_tax_id)

In [13]:
data = data[['id_a', 'id_b', 'accession_a', 'accession_b', 'pro_id_a', 'pro_id_b', 'taxonomy_id_a', 'taxonomy_id_b', 'pubmedId']]

In [14]:
data.head()

Unnamed: 0,id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,taxonomy_id_a,taxonomy_id_b,pubmedId
0,P0DTC2,P0DTC2,P0DTC2,P0DTC2,,,2697049,2697049,32155444
1,P0DTC2,P0DTC2,P0DTC2,P0DTC2,,,2697049,2697049,3223134
2,P0DTD1-PRO_0000449632,P0DTD1-PRO_0000449632,P0DTD1,P0DTD1,uniprot.chain:PRO_0000449632,uniprot.chain:PRO_0000449632,2697049,2697049,32304108
3,P0DTD1-PRO_0000449628,P0DTD1-PRO_0000449633,P0DTD1,P0DTD1,uniprot.chain:PRO_0000449628,uniprot.chain:PRO_0000449633,2697049,2697049,32511383
4,P0DTD1-PRO_0000449628,P0DTD1-PRO_0000449633,P0DTD1,P0DTD1,uniprot.chain:PRO_0000449628,uniprot.chain:PRO_0000449633,2697049,2697049,32511383


#### Restrict data to SARS-CoV-2 - human protein-protein interactions (TODO: expand to SARS, MERS, etc.)

In [15]:
data = data[((data['taxonomy_id_a'] == '2697049') & (data['taxonomy_id_b'] == '9606')) |
            ((data['taxonomy_id_b'] == '2697049') & (data['taxonomy_id_a'] == '9606'))]

In [16]:
data.drop_duplicates(inplace=True)

#### Retrieve Protein names and sequences from UniProt.org

Collect unique accession numbers

In [17]:
a = data['accession_a'].unique()
b = data['accession_b'].unique()
ids = np.unique(np.concatenate((a, b)))

In [18]:
def split_chains(accession, chains):
    chain_list = []
    for chain in chains:
        items = chain.split(';')
        feature_dict = {}
        
        try:
            if len(items) > 1:
                feature_dict['accession'] = accession
                for item in items:
                    item = item.strip()
                    if '..' in item:
                        feature_dict['start'] = int(items[0].split('..')[0])
                        feature_dict['end'] = int(items[0].split('..')[1])
                    elif item.startswith("/note="):
                        name = item[6:].replace('\"', '')
                        feature_dict['name'] = name
                    elif item.startswith("/id="):
                        pro_id = item[4:].replace('\"', '')
                        feature_dict['proId'] = 'uniprot.chain:' + pro_id
                
                chain_list.append(feature_dict)
        except:
            print('Skipping incomplete features for', accession, ':', chain)
            
    df = pd.DataFrame(chain_list)
    df.dropna(inplace=True)
    return df

In [19]:
seq_dfs = []
for accession in ids:
    url = f'https://www.uniprot.org/uniprot/?query=accession:{accession}&columns=id,sequence,length,protein%20names,feature(CHAIN),organism-id&format=tab'
    try:
        df = pd.read_csv(url, sep='\t')
        df.fillna('', inplace=True)
        
        whole_chain = df.copy()
        
        start = 1
        end = whole_chain.iloc[0]['Length']

        whole_chain['accession'] = accession
        whole_chain['start'] = start
        whole_chain['end'] = end
        whole_chain['name'] = whole_chain['Protein names']
        whole_chain['name'] = whole_chain['Protein names'].str.split('\(', expand=True)
        whole_chain['proId'] =''
        whole_chain = whole_chain[['accession', 'start', 'end', 'name', 'proId']]
        
        # process features (Chains)
        feature_string = df['Chain'].values[0]
        if (feature_string != ''):
            chains = feature_string.split("CHAIN ")
            chain_df = split_chains(accession, chains)
            if chain_df.shape[0] > 0:
                # Add the whole chain if it is not in the list of chains
                if chain_df.iloc[0]['start'] != start or chain_df.iloc[0]['end'] != end:
                    chain_df = pd.concat([whole_chain, chain_df])
                    
                df = df.merge(chain_df, left_on='Entry', right_on='accession', how='left')
                print(accession, end=' ')
                seq_dfs.append(df)
        else:
            df = df.merge(whole_chain, left_on='Entry', right_on='accession', how='left')
            seq_dfs.append(df)
    except:
        print("Uniprot accession not found:", accession)
        
seq = pd.concat(seq_dfs, axis=0, ignore_index=True)

A0MZ66 A0PJW6 A1L3X0 A3KN83 A4D1U4 A6NFN3 A6NGB7 A6NNL5 A7MCY6 B2RUZ4 B7ZAP0 B7ZAQ6 O00116 O00124 O00148 O00151 O00165 O00186 O00203 O00217 O00220 O00231 O00238 O00308 O00391 O00400 O00411 O00461 O00469 O00482 O00566 O00567 O14578 O14653 O14656 O14657 O14662 O14684 O14745 O14763 O14818 O14828 O14874 O14975 O14980 O14981 O15027 O15063 O15121 O15126 O15127 O15155 O15226 O15260 O15270 O15287 O15305 O15321 O15354 O15374 O15381 O15397 O15400 O15403 O43156 O43169 O43194 O43251 O43264 O43291 O43292 O43306 O43395 O43402 O43488 O43493 O43505 O43520 O43556 O43570 O43592 O43633 O43660 O43752 O43761 O43795 O43818 O43823 O43826 O43852 O43865 O43909 O43929 O60238 O60271 O60291 O60293 O60318 O60462 O60499 O60503 O60518 O60524 O60573 O60613 O60674 O60683 O60749 O60762 O60779 O60830 O60831 O60832 O60884 O60885 O60941 O75063 O75121 O75306 O75309 O75325 O75347 O75354 O75379 O75436 O75439 O75448 O75506 O75509 O75534 O75569 O75592 O75674 O75688 O75746 O75841 O75844 O75874 O75886 O75934 O75947 O75964 O75973

Handle missing values and make sure start and end are integers

In [20]:
seq.dropna(inplace=True)
seq['start'] = seq['start'].astype(int)
seq['end'] = seq['end'].astype(int)

#### Cleave sequences into peptides

In [21]:
def get_subsequence(row):
    start = row['start']
    end = row['end']
    sequence = row['Sequence']
    return sequence[start-1: end]

In [22]:
seq['sequence'] = seq.apply(lambda row: get_subsequence(row), axis=1)

Set flag if protein chain is full length

In [23]:
seq['fullLength'] = (seq['start'] == 1) & (seq['end'] == seq['Length'])

In [24]:
seq['id'] = seq['sequence'].apply(lambda seq: 'md5:' + hashlib.md5(seq.encode()).hexdigest())

In [25]:
seq.rename(columns={'Organism ID': 'taxonomyId'}, inplace=True)

In [26]:
seq['name'] = seq['name'].str.strip()

In [27]:
seq.head()

Unnamed: 0,Entry,Sequence,Length,Protein names,Chain,taxonomyId,accession,start,end,name,proId,sequence,fullLength,id
0,A0A663DJA2,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT,38,ORF10 proteiin (ORF10 protein),,2697049,A0A663DJA2,1,38,ORF10 proteiin,,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT,True,md5:06bfef85cf1319e805e05f5bee3ec601
1,A0MZ66,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,631,Shootin-1 (Shootin1),"CHAIN 1..631; /note=""Shootin-1""; /id=""PRO_00...",9606,A0MZ66,1,631,Shootin-1,uniprot.chain:PRO_0000295740,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,True,md5:90d13de0b4e72915010714047d45d79c
2,A0PJW6,MAAPWRRWPTGLLAVLRPLLTCRPLQGTTLQRDVLLFEHDRGRFFT...,202,Transmembrane protein 223,"CHAIN 1..202; /note=""Transmembrane protein 22...",9606,A0PJW6,1,202,Transmembrane protein 223,uniprot.chain:PRO_0000321833,MAAPWRRWPTGLLAVLRPLLTCRPLQGTTLQRDVLLFEHDRGRFFT...,True,md5:3fd811e1f3e924f32b8088d6bd3e1cb7
3,A1L3X0,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,281,Elongation of very long chain fatty acids prot...,"CHAIN 1..281; /note=""Elongation of very long ...",9606,A1L3X0,1,281,Elongation of very long chain fatty acids prot...,uniprot.chain:PRO_0000311988,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,True,md5:8be30446ba90dce3da3a4ed115206e19
4,A3KN83,MVEPGQDLLLAALSESGISPNDLFDIDGGDAGLATPMPTPSVQQSV...,1393,Protein strawberry notch homolog 1 (Monocyte p...,"CHAIN 1..1393; /note=""Protein strawberry notc...",9606,A3KN83,1,1393,Protein strawberry notch homolog 1,uniprot.chain:PRO_0000314555,MVEPGQDLLLAALSESGISPNDLFDIDGGDAGLATPMPTPSVQQSV...,True,md5:a8f6908047b03fcfde873a516979cacd


### Save proteins

In [28]:
proteins = seq[['id', 'name', 'accession', 'proId', 'sequence', 'start', 'end', 'fullLength', 'taxonomyId']].copy()
proteins['accession'] = 'uniprot:' + proteins['accession']
proteins['taxonomyId'] = 'taxonomy:' + proteins['taxonomyId'].astype(str)
proteins.to_csv(NEO4J_IMPORT / '01e-ProteinProteinInteractionProtein.csv', index = False)

In [29]:
print('Number of proteins:', proteins.shape[0])

Number of proteins: 2073


In [30]:
proteins.head()

Unnamed: 0,id,name,accession,proId,sequence,start,end,fullLength,taxonomyId
0,md5:06bfef85cf1319e805e05f5bee3ec601,ORF10 proteiin,uniprot:A0A663DJA2,,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT,1,38,True,taxonomy:2697049
1,md5:90d13de0b4e72915010714047d45d79c,Shootin-1,uniprot:A0MZ66,uniprot.chain:PRO_0000295740,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,1,631,True,taxonomy:9606
2,md5:3fd811e1f3e924f32b8088d6bd3e1cb7,Transmembrane protein 223,uniprot:A0PJW6,uniprot.chain:PRO_0000321833,MAAPWRRWPTGLLAVLRPLLTCRPLQGTTLQRDVLLFEHDRGRFFT...,1,202,True,taxonomy:9606
3,md5:8be30446ba90dce3da3a4ed115206e19,Elongation of very long chain fatty acids prot...,uniprot:A1L3X0,uniprot.chain:PRO_0000311988,MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVY...,1,281,True,taxonomy:9606
4,md5:a8f6908047b03fcfde873a516979cacd,Protein strawberry notch homolog 1,uniprot:A3KN83,uniprot.chain:PRO_0000314555,MVEPGQDLLLAALSESGISPNDLFDIDGGDAGLATPMPTPSVQQSV...,1,1393,True,taxonomy:9606


### Merge interaction data with sequence data

In [31]:
sequences = seq[['accession', 'proId', 'id']].copy()
sequences_full_length = seq.query('fullLength')
sequences_full_length = sequences_full_length[['accession', 'proId', 'id']].copy()

In [32]:
sequences_full_length.head()

Unnamed: 0,accession,proId,id
0,A0A663DJA2,,md5:06bfef85cf1319e805e05f5bee3ec601
1,A0MZ66,uniprot.chain:PRO_0000295740,md5:90d13de0b4e72915010714047d45d79c
2,A0PJW6,uniprot.chain:PRO_0000321833,md5:3fd811e1f3e924f32b8088d6bd3e1cb7
3,A1L3X0,uniprot.chain:PRO_0000311988,md5:8be30446ba90dce3da3a4ed115206e19
4,A3KN83,uniprot.chain:PRO_0000314555,md5:a8f6908047b03fcfde873a516979cacd


In [33]:
sequences_full_length.query("accession == 'Q9BYF1'")

Unnamed: 0,accession,proId,id
1723,Q9BYF1,,md5:906dc56d5c9c5513eef859ee82e80267


In [34]:
interact = data[['accession_a', 'accession_b', 'pro_id_a', 'pro_id_b', 'pubmedId']]

In [35]:
interact.query("accession_a == 'P0DTC2'")

Unnamed: 0,accession_a,accession_b,pro_id_a,pro_id_b,pubmedId
99,P0DTC2,Q9C0B5,,,32353859.0
100,P0DTC2,Q7Z5G4,,,32353859.0
613,P0DTC2,P62269,,,
614,P0DTC2,Q96N67,,,
615,P0DTC2,P63244,,,
616,P0DTC2,P40227,,,
617,P0DTC2,P07195,,,
618,P0DTC2,P08621,,,
619,P0DTC2,P60660,,,
620,P0DTC2,P47756,,,


Split dataframe into two parts, depending on the presence of a protein id for interactor A. Each part is processed differently.

In [36]:
interact_pro_a = interact.query("pro_id_a != ''").copy()
interact_acc_a = interact.query("pro_id_a == ''").copy()

In [37]:
interact_pro_a = interact_pro_a.merge(sequences, left_on='pro_id_a', right_on='proId', how='left')
interact_pro_a.fillna('', inplace=True)

In [38]:
interact_acc_a = interact_acc_a.merge(sequences_full_length, left_on='accession_a', right_on='accession', how='left')
interact_acc_a.fillna('', inplace=True)

Concatenate the dataframes back together

In [39]:
interact = pd.concat([interact_pro_a, interact_acc_a])
interact.rename(columns={'id': 'id_a'}, inplace=True)

In [40]:
print('total interactions:', interact.shape[0])

total interactions: 2031


Split dataframe into two parts, depending on the presence of a protein id for interactor B. Each part is processed differently.

In [41]:
interact_pro_b = interact.query("pro_id_b != ''").copy()
interact_acc_b = interact.query("pro_id_b == ''").copy()

In [42]:
interact_pro_b = interact_pro_b.merge(sequences, left_on='pro_id_b', right_on='proId', how='left')
interact_pro_b.fillna('', inplace=True)

In [43]:
interact_acc_b = interact_acc_b.merge(sequences_full_length, left_on='accession_b', right_on='accession', how='left')
interact_acc_b.fillna('', inplace=True)

Concatenate the dataframes back together

In [44]:
interact = pd.concat([interact_pro_b, interact_acc_b])
interact.rename(columns={'id': 'id_b'}, inplace=True)
interact.head()

Unnamed: 0,accession_a,accession_b,pro_id_a,pro_id_b,pubmedId,accession_x,proId_x,id_a,accession_y,proId_y,id_b
0,O75347,P0DTC1,,uniprot.chain:PRO_0000449645,32353859.0,O75347,,md5:895df31d01af35825589954917c6b682,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66
1,Q92769,P0DTD1,,uniprot.chain:PRO_0000449623,32353859.0,Q92769,uniprot.chain:PRO_0000114693,md5:6d48f5d9d96f10557d680247a7cdcfe9,P0DTD1,uniprot.chain:PRO_0000449623,md5:dc6436f559bc873ac013085f6e56d467
2,Q9UJZ1,intact:EBI,,25475912,32353859.0,Q9UJZ1,,md5:4f0a474a755af072900d84d3cacc6d92,,,
3,P61158,P0DTD1,,uniprot.chain:PRO_0000449633,,P61158,,md5:c07966db47593469f42f4ee1a7b0ccdf,P0DTD1,uniprot.chain:PRO_0000449633,md5:42b973dceb72cdea04fda203c55b67bc
4,Q53GL7,P0DTD1,,uniprot.chain:PRO_0000449621,3251141.0,Q53GL7,uniprot.chain:PRO_0000252435,md5:b953312246cc1c2b9cb917ebeac7dbaa,P0DTD1,uniprot.chain:PRO_0000449621,md5:73935ca55d0ab6130627210ef6743c39


In [45]:
interact.query("accession_a == 'P0DTC1'")

Unnamed: 0,accession_a,accession_b,pro_id_a,pro_id_b,pubmedId,accession_x,proId_x,id_a,accession_y,proId_y,id_b
234,P0DTC1,Q9UM54,uniprot.chain:PRO_0000449645,,,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66,Q9UM54,uniprot.chain:PRO_0000123464,md5:6c4869132f6880065ac9e8d67ff15787
235,P0DTC1,O43795,uniprot.chain:PRO_0000449645,,,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66,O43795,uniprot.chain:PRO_0000123442,md5:250d73405f887504ee3046934f5cd258
236,P0DTC1,P61158,uniprot.chain:PRO_0000449645,,,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66,P61158,,md5:c07966db47593469f42f4ee1a7b0ccdf


### Save interactions

Remove interactions without md5 ids

In [46]:
interact = interact[(interact['id_a'] != '') & (interact['id_b'] != '')]

In [47]:
interact = interact[['id_a', 'id_b']]
interact.drop_duplicates(inplace=True)
interact.to_csv(NEO4J_IMPORT / '01e-ProteinProteinInteraction.csv', index = False)

In [48]:
print('Number of interactions:', interact.shape[0])

Number of interactions: 1986


In [49]:
interact.head()

Unnamed: 0,id_a,id_b
0,md5:895df31d01af35825589954917c6b682,md5:63d2c81f37726f44c600eb5225676a66
1,md5:6d48f5d9d96f10557d680247a7cdcfe9,md5:dc6436f559bc873ac013085f6e56d467
3,md5:c07966db47593469f42f4ee1a7b0ccdf,md5:42b973dceb72cdea04fda203c55b67bc
4,md5:b953312246cc1c2b9cb917ebeac7dbaa,md5:73935ca55d0ab6130627210ef6743c39
5,md5:a8110ce1e1d70c50593356e8ea695702,md5:92ebecdcff1a59d6123f4e3133264695
