In [29]:
import pandas as pd
import mygene

workdir = "/Users/fernando/Documents/Research/HSV1-2_pipeline/seed_gene_selection/"
hpidb_seeds = pd.read_excel(f"{workdir}hpidb_hsv1/hsv1_interactions.xlsx")
phisto_seeds = pd.read_excel(f"{workdir}phisto_hsv1/hsv1_interactions.xlsx")

 This chunk processes the PHISTO data by selecting relevant columns and renaming them for consistency. It also adds a 'source' column to identify the data origin.

In [30]:

# Finding common columns
phisto_seeds = phisto_seeds[['Taxonomy ID', 'Uniprot ID', 'Pathogen Protein', 'Uniprot ID.1', 'Human Protein', 'Experimental Method', 'Pubmed ID']]
phisto_seeds.rename(columns={'Taxonomy ID': 'Pathogen_Taxonomy_ID',
                             'Uniprot ID': 'Pathogen_Uniprot_ID',
                             'Pathogen Protein': 'Pathogen_Protein_Name',
                             'Uniprot ID.1': 'Human_Uniprot_ID',
                             'Human Protein': 'Human_Protein_Name',
                             'Experimental Method': 'Experimental_Method',
                             'Pubmed ID': 'Pubmed_ID'}, inplace=True)
phisto_seeds['source'] = 'PHISTO'
phisto_seeds


Unnamed: 0,Pathogen_Taxonomy_ID,Pathogen_Uniprot_ID,Pathogen_Protein_Name,Human_Uniprot_ID,Human_Protein_Name,Experimental_Method,Pubmed_ID,source
0,10298,Q9J0X9,Q9J0X9_HHV1,Q96SB4,SRPK1_HUMAN,anti bait coimmunoprecipitation,12660167,PHISTO
1,10298,Q9J0X9,Q9J0X9_HHV1,P25963,IKBA_HUMAN,anti bait coimmunoprecipitation,18539148,PHISTO
2,10298,Q9J0X9,Q9J0X9_HHV1,P25963,IKBA_HUMAN,anti tag coimmunoprecipitation,18539148,PHISTO
3,10298,Q9J0X9,Q9J0X9_HHV1,P84103,SRSF3_HUMAN,two hybrid,12660167,PHISTO
4,10298,Q9J0X9,Q9J0X9_HHV1,Q96SB4,SRPK1_HUMAN,experimental interaction detection,12660167,PHISTO
...,...,...,...,...,...,...,...,...
1021,10299,P08393,ICP0_HHV11,Q9NZM5,GSCR2_HUMAN,Other Methods,10196275,PHISTO
1022,10299,P08393,ICP0_HHV11,Q9UKV0,HDAC9_HUMAN,Other Methods,15194749,PHISTO
1023,10299,P08393,ICP0_HHV11,Q9UQL6,HDAC5_HUMAN,Other Methods,15194749,PHISTO
1024,10299,P04485,ICP22_HHV11,Q96EZ8,MCRS1_HUMAN,Other Methods,9765390,PHISTO


This chunk processes the HPIDB data by extracting and transforming relevant information into a consistent format with the PHISTO data.

In [31]:

hpidb_seeds['Pubmed_ID'] = hpidb_seeds['pmid'].str.extract(r'pubmed:(\d+)').astype(int)
hpidb_seeds['Pathogen_Taxonomy_ID'] = hpidb_seeds['protein_taxid_2']
hpidb_seeds['Pathogen_Uniprot_ID'] = hpidb_seeds['protein_xref_2_unique'].str.replace('UNIPROT_AC:','')
hpidb_seeds['Pathogen_Protein_Name'] = hpidb_seeds['protein_xref_2'].str.replace(r'uniprotkb:', '')
hpidb_seeds['Human_Uniprot_ID'] = hpidb_seeds['protein_xref_1_unique'].str.replace('UNIPROT_AC:','')
hpidb_seeds['Human_Protein_Name'] = hpidb_seeds['protein_xref_1']
hpidb_seeds['Experimental_Method'] = hpidb_seeds['detection_method']
hpidb_seeds['source'] = 'HPIDB'
hpidb_seeds = hpidb_seeds[['Pathogen_Taxonomy_ID', 'Pathogen_Uniprot_ID', 'Pathogen_Protein_Name', 'Human_Uniprot_ID', 'Human_Protein_Name', 'Experimental_Method', 'Pubmed_ID', 'source']]
hpidb_seeds

Unnamed: 0,Pathogen_Taxonomy_ID,Pathogen_Uniprot_ID,Pathogen_Protein_Name,Human_Uniprot_ID,Human_Protein_Name,Experimental_Method,Pubmed_ID,source
0,10298,P08393,entrez gene/locuslink:2703390,P19838,entrez gene/locuslink:4790,psi-mi:MI:0004(affinity chromatography technol...,24067962,HPIDB
1,10298,NCBI_ACC:NP_044650,dip:DIP-524N|refseq:NP_044650,Q00403,dip:DIP-1077N|refseq:NP_001505|uniprotkb:Q00403,psi-mi:MI:0045(experimental interaction detect...,7642611,HPIDB
2,10298,NCBI_ACC:NP_044650,dip:DIP-524N|refseq:NP_044650,Q00403,dip:DIP-1077N|refseq:NP_001505|uniprotkb:Q00403,psi-mi:MI:0045(experimental interaction detect...,7824954,HPIDB
3,10298,Q9J0X9,Q9J0X9,P21333,uniprotkb:P21333,psi-mi:MI:0006(anti bait coimmunoprecipitation),15567442,HPIDB
4,10298,Q9J0X9,Q9J0X9,Q04637,uniprotkb:Q04637,psi-mi:MI:0006(anti bait coimmunoprecipitation),15567442,HPIDB
...,...,...,...,...,...,...,...,...
985,10299,P04487,P04487,Q9Y3U8,uniprotkb:Q9Y3U8,psi-mi:MI:0676(tandem affinity purification),22810585,HPIDB
986,10299,P10204,P10204,Q9Y4P9,uniprotkb:Q9Y4P9,psi-mi:MI:0018(two hybrid),23950709,HPIDB
987,10299,P08393,P08393,Q9Y5A6,uniprotkb:Q9Y5A6,psi-mi:MI:0018(two hybrid),23950709,HPIDB
988,10299,P10186,P10186,Q9Y5F6,uniprotkb:Q9Y5F6,psi-mi:MI:0018(two hybrid),23950709,HPIDB


In [32]:
print(hpidb_seeds.Experimental_Method.value_counts())
print(phisto_seeds.Experimental_Method.value_counts())

Experimental_Method
psi-mi:MI:0018(two hybrid)                                           347
psi-mi:MI:0676(tandem affinity purification)                         201
psi-mi:MI:0004(affinity chromatography technology)                   129
psi-mi:MI:0006(anti bait coimmunoprecipitation)                       86
psi-mi:MI:0096(pull down)                                             71
psi-mi:MI:0019(coimmunoprecipitation)                                 60
psi-mi:MI:0416(fluorescence microscopy)                               19
psi-mi:MI:0059(gst pull down)                                         18
psi-mi:MI:0007(anti tag coimmunoprecipitation)                        13
psi-mi:MI:0428(imaging technique)                                      8
psi-mi:MI:0114(x-ray crystallography)                                  5
psi-mi:MI:0077(nuclear magnetic resonance)                             4
psi-mi:MI:0413(electrophoretic mobility shift assay)                   4
psi-mi:MI:0415(enzymatic study)

In [33]:
def standardize_method(method):
    # Remove any asterisks
    method = method.replace('*', '')
    
    # Mapping of non-standard names to PSI-MI format
    mapping = {
        'two hybrid': 'psi-mi:MI:0018(two hybrid)',
        'tandem affinity purification': 'psi-mi:MI:0676(tandem affinity purification)',
        'affinity chromatography technology': 'psi-mi:MI:0004(affinity chromatography technology)',
        'anti bait coimmunoprecipitation': 'psi-mi:MI:0006(anti bait coimmunoprecipitation)',
        'coimmunoprecipitation': 'psi-mi:MI:0019(coimmunoprecipitation)',
        'pull down': 'psi-mi:MI:0096(pull down)',
        'fluorescence microscopy': 'psi-mi:MI:0416(fluorescence microscopy)',
        'imaging technique': 'psi-mi:MI:0428(imaging technique)',
        'anti tag coimmunoprecipitation': 'psi-mi:MI:0007(anti tag coimmunoprecipitation)',
        'x-ray crystallography': 'psi-mi:MI:0114(x-ray crystallography)',
        'nuclear magnetic resonance': 'psi-mi:MI:0077(nuclear magnetic resonance)',
        'electrophoretic mobility shift assay': 'psi-mi:MI:0413(electrophoretic mobility shift assay)',
        'enzymatic study': 'psi-mi:MI:0415(enzymatic study)',
        'enzyme linked immunosorbent assay': 'psi-mi:MI:0411(enzyme linked immunosorbent assay)',
        'experimental interaction detection': 'psi-mi:MI:0045(experimental interaction detection)',
        'fluorescence-activated cell sorting': 'psi-mi:MI:0054(fluorescence-activated cell sorting)',
        'cosedimentation in solution': 'psi-mi:MI:0028(cosedimentation in solution)',
        'comigration in non denaturing gel electrophoresis': 'psi-mi:MI:0404(comigration in non denaturing gel electrophoresis)',
        'protein kinase assay': 'psi-mi:MI:0424(protein kinase assay)',
        'bimolecular fluorescence complementation': 'psi-mi:MI:0809(bimolecular fluorescence complementation)',
        'surface plasmon resonance': 'psi-mi:MI:0107(surface plasmon resonance)',
        'molecular sieving': 'psi-mi:MI:0071(molecular sieving)',
        'isothermal titration calorimetry': 'psi-mi:MI:0065(isothermal titration calorimetry)',
        'unspecified method': 'psi-mi:MI:0686(unspecified method)',
        'x-ray fiber diffraction': 'psi-mi:MI:0114(x-ray crystallography)',  # Assuming this is equivalent
    }
    
    # If the method is already in PSI-MI format, return it as is
    if method.startswith('psi-mi:'):
        return method
    
    # If the method is in our mapping, return the PSI-MI format
    if method in mapping:
        return mapping[method]
    
    # If not found, return the original method
    return method

# Assuming hpidb_seeds and phisto_seeds are your original DataFrames
hpidb_standardized = hpidb_seeds.copy()
phisto_standardized = phisto_seeds.copy()

# Apply the standardization to both datasets
hpidb_standardized['Standardized_Method'] = hpidb_standardized['Experimental_Method'].apply(standardize_method)
phisto_standardized['Standardized_Method'] = phisto_standardized['Experimental_Method'].apply(standardize_method)

# Display the results
print("HPIDB Standardized Methods:")
print(hpidb_standardized['Standardized_Method'].value_counts())
print("\nPHISTO Standardized Methods:")
print(phisto_standardized['Standardized_Method'].value_counts())

# Check for any remaining non-standard methods
non_standard_hpidb = hpidb_standardized[~hpidb_standardized['Standardized_Method'].str.startswith('psi-mi:')]
non_standard_phisto = phisto_standardized[~phisto_standardized['Standardized_Method'].str.startswith('psi-mi:')]

print("\nNon-standard methods in HPIDB:")
print(non_standard_hpidb['Standardized_Method'].value_counts())
print("\nNon-standard methods in PHISTO:")
print(non_standard_phisto['Standardized_Method'].value_counts())

hpidb_seeds['Experimental_Method'] = hpidb_seeds['Experimental_Method'].apply(standardize_method)
phisto_seeds['Experimental_Method'] = phisto_seeds['Experimental_Method'].apply(standardize_method)


phisto_seeds

HPIDB Standardized Methods:
Standardized_Method
psi-mi:MI:0018(two hybrid)                                           347
psi-mi:MI:0676(tandem affinity purification)                         201
psi-mi:MI:0004(affinity chromatography technology)                   129
psi-mi:MI:0006(anti bait coimmunoprecipitation)                       86
psi-mi:MI:0096(pull down)                                             71
psi-mi:MI:0019(coimmunoprecipitation)                                 60
psi-mi:MI:0416(fluorescence microscopy)                               19
psi-mi:MI:0059(gst pull down)                                         18
psi-mi:MI:0007(anti tag coimmunoprecipitation)                        13
psi-mi:MI:0428(imaging technique)                                      8
psi-mi:MI:0114(x-ray crystallography)                                  5
psi-mi:MI:0077(nuclear magnetic resonance)                             4
psi-mi:MI:0413(electrophoretic mobility shift assay)                   4
psi

Unnamed: 0,Pathogen_Taxonomy_ID,Pathogen_Uniprot_ID,Pathogen_Protein_Name,Human_Uniprot_ID,Human_Protein_Name,Experimental_Method,Pubmed_ID,source
0,10298,Q9J0X9,Q9J0X9_HHV1,Q96SB4,SRPK1_HUMAN,psi-mi:MI:0006(anti bait coimmunoprecipitation),12660167,PHISTO
1,10298,Q9J0X9,Q9J0X9_HHV1,P25963,IKBA_HUMAN,psi-mi:MI:0006(anti bait coimmunoprecipitation),18539148,PHISTO
2,10298,Q9J0X9,Q9J0X9_HHV1,P25963,IKBA_HUMAN,psi-mi:MI:0007(anti tag coimmunoprecipitation),18539148,PHISTO
3,10298,Q9J0X9,Q9J0X9_HHV1,P84103,SRSF3_HUMAN,psi-mi:MI:0018(two hybrid),12660167,PHISTO
4,10298,Q9J0X9,Q9J0X9_HHV1,Q96SB4,SRPK1_HUMAN,psi-mi:MI:0045(experimental interaction detect...,12660167,PHISTO
...,...,...,...,...,...,...,...,...
1021,10299,P08393,ICP0_HHV11,Q9NZM5,GSCR2_HUMAN,Other Methods,10196275,PHISTO
1022,10299,P08393,ICP0_HHV11,Q9UKV0,HDAC9_HUMAN,Other Methods,15194749,PHISTO
1023,10299,P08393,ICP0_HHV11,Q9UQL6,HDAC5_HUMAN,Other Methods,15194749,PHISTO
1024,10299,P04485,ICP22_HHV11,Q96EZ8,MCRS1_HUMAN,Other Methods,9765390,PHISTO


This chunk merges the processed HPIDB and PHISTO data, then cleans up some Uniprot IDs by removing prefixes and suffixes.


In [34]:
seeds_selection = hpidb_seeds.merge(phisto_seeds, how='outer')
seeds_selection['Pathogen_Uniprot_ID'] = seeds_selection['Pathogen_Uniprot_ID'].str.replace("NCBI_ACC:", "")
seeds_selection['Human_Uniprot_ID']= seeds_selection['Human_Uniprot_ID'].str.replace("-PRO_0000030311", "")
seeds_selection

Unnamed: 0,Pathogen_Taxonomy_ID,Pathogen_Uniprot_ID,Pathogen_Protein_Name,Human_Uniprot_ID,Human_Protein_Name,Experimental_Method,Pubmed_ID,source
0,10298,B9VQD3,entrez gene/locuslink:2703429,Q16666,entrez gene/locuslink:3428,psi-mi:MI:0004(affinity chromatography technol...,25693804,HPIDB
1,10298,B9VQD4,entrez gene/locuslink:2703431,Q16666,entrez gene/locuslink:3428,psi-mi:MI:0004(affinity chromatography technol...,25693804,HPIDB
2,10298,D3YP88,D3YP88,H0YL14,uniprotkb:H0YL14,psi-mi:MI:0007(anti tag coimmunoprecipitation),21667337,HPIDB
3,10298,D3YP88,D3YP88,H0YL14,uniprotkb:H0YL14,psi-mi:MI:0416(fluorescence microscopy),21667337,HPIDB
4,10298,D3YPE9,entrez gene/locuslink:2703434,P51610,entrez gene/locuslink:3054,psi-mi:MI:0004(affinity chromatography technol...,20133788,HPIDB
...,...,...,...,...,...,...,...,...
2011,10299,Q69091,GD_HHV11,Q92956,TNR14_HUMAN,psi-mi:MI:0114(x-ray crystallography),11511370,PHISTO
2012,10299,Q69091,Q69091,Q15223,uniprotkb:Q15223,psi-mi:MI:0019(coimmunoprecipitation),10683337,HPIDB
2013,10299,Q69091,Q69091,Q15223,uniprotkb:Q15223,psi-mi:MI:0054(fluorescence-activated cell sor...,12011057,HPIDB
2014,10299,Q69091,Q69091,Q15223,uniprotkb:Q15223,psi-mi:MI:0411(enzyme linked immunosorbent assay),11277703,HPIDB


In [35]:
seeds_selection["Experimental_Method"].value_counts()

Experimental_Method
psi-mi:MI:0018(two hybrid)                                           695
psi-mi:MI:0676(tandem affinity purification)                         435
psi-mi:MI:0004(affinity chromatography technology)                   273
psi-mi:MI:0006(anti bait coimmunoprecipitation)                      161
psi-mi:MI:0019(coimmunoprecipitation)                                135
psi-mi:MI:0096(pull down)                                            132
psi-mi:MI:0416(fluorescence microscopy)                               38
Other Methods                                                         21
psi-mi:MI:0007(anti tag coimmunoprecipitation)                        19
psi-mi:MI:0428(imaging technique)                                     18
psi-mi:MI:0059(gst pull down)                                         18
psi-mi:MI:0045(experimental interaction detection)                    11
psi-mi:MI:0415(enzymatic study)                                        9
psi-mi:MI:0114(x-ray crystallog

In [36]:

def aggregate_strings(x):
    unique_values = set(x.dropna().astype(str))
    return '$ '.join(unique_values) if len(unique_values) > 1 else next(iter(unique_values))

def aggregate_numeric(x):
    return x.mode().iloc[0] if len(x.mode()) > 0 else None

# Identify duplicates based on Pathogen and Human Uniprot IDs
duplicates = seeds_selection.duplicated(subset=['Pathogen_Uniprot_ID', 'Human_Uniprot_ID'], keep=False)

# Aggregate the data
aggregated = seeds_selection.groupby(['Pathogen_Uniprot_ID', 'Human_Uniprot_ID']).agg({
    'Pathogen_Taxonomy_ID': 'first',
    'Pathogen_Protein_Name': aggregate_strings,
    'Human_Protein_Name': aggregate_strings,
    'Experimental_Method': aggregate_strings,  # Use Standardized_Method instead of Experimental_Method
    'Pubmed_ID': aggregate_strings,
    'source': aggregate_strings
}).reset_index()

# Print original duplicate rows
print("Original duplicate rows:")
print(seeds_selection[duplicates])

# Print aggregated results
print("\nAggregated results:")
print(aggregated)

# Count of interactions by standardized method after aggregation
print("\nInteractions by Standardized Method after aggregation:")
print(aggregated['Experimental_Method'].value_counts())

# Check for any remaining duplicates
remaining_duplicates = aggregated.duplicated(subset=['Pathogen_Uniprot_ID', 'Human_Uniprot_ID'], keep=False)
if remaining_duplicates.any():
    print("\nWarning: Some duplicate interactions remain after aggregation:")
    print(aggregated[remaining_duplicates])
else:
    print("\nNo duplicate interactions remain after aggregation.")

Original duplicate rows:
      Pathogen_Taxonomy_ID Pathogen_Uniprot_ID          Pathogen_Protein_Name  \
2                    10298              D3YP88                         D3YP88   
3                    10298              D3YP88                         D3YP88   
4                    10298              D3YPE9  entrez gene/locuslink:2703434   
5                    10298              D3YPE9  entrez gene/locuslink:2703434   
11                   10298              G8HBG2                         G8HBG2   
...                    ...                 ...                            ...   
2011                 10299              Q69091                      GD_HHV11    
2012                 10299              Q69091                         Q69091   
2013                 10299              Q69091                         Q69091   
2014                 10299              Q69091                         Q69091   
2015                 10299              Q69091                         Q69091   

  

In [37]:
aggregated

Unnamed: 0,Pathogen_Uniprot_ID,Human_Uniprot_ID,Pathogen_Taxonomy_ID,Pathogen_Protein_Name,Human_Protein_Name,Experimental_Method,Pubmed_ID,source
0,B9VQD3,Q16666,10298,entrez gene/locuslink:2703429,entrez gene/locuslink:3428,psi-mi:MI:0004(affinity chromatography technol...,25693804,HPIDB
1,B9VQD4,Q16666,10298,entrez gene/locuslink:2703431,entrez gene/locuslink:3428,psi-mi:MI:0004(affinity chromatography technol...,25693804,HPIDB
2,B9VQG6,O94776,10299,B9VQG6_HHV1,MTA2_HUMAN,psi-mi:MI:0004(affinity chromatography technol...,20585571,PHISTO
3,B9VQG6,Q13547,10299,B9VQG6_HHV1,HDAC1_HUMAN,psi-mi:MI:0004(affinity chromatography technol...,20585571,PHISTO
4,B9VQJ7,K9JA46,10299,B9VQJ7_HHV1,K9JA46_HUMAN,psi-mi:MI:0676(tandem affinity purification),22810585,PHISTO
...,...,...,...,...,...,...,...,...
841,Q9J0X9,Q96SB4,10298,Q9J0X9_HHV1,SRPK1_HUMAN,psi-mi:MI:0006(anti bait coimmunoprecipitation...,12660167,PHISTO
842,Q9J0X9,Q99613,10298,Q9J0X9$ Q9J0X9_HHV1,uniprotkb:Q99613$ EIF3C_HUMAN,psi-mi:MI:0006(anti bait coimmunoprecipitation),15567442,PHISTO$ HPIDB
843,Q9QNF3,Q16666,10298,entrez gene/locuslink:2703365,entrez gene/locuslink:3428,psi-mi:MI:0004(affinity chromatography technol...,25693804,HPIDB
844,U5TQE9,Q13546,10298,U5TQE9$ U5TQE9_HHV1,uniprotkb:Q13546$ RIPK1_HUMAN,psi-mi:MI:0007(anti tag coimmunoprecipitation),26559832,PHISTO$ HPIDB


In [38]:
aggregated.to_excel(f"{workdir}hpidb_phisto_aggregated_interactome.xlsx", index=False)

# Selecting seeds

In [39]:
viral_seeds = set(aggregated[aggregated.source == "PHISTO$ HPIDB"]['Pathogen_Uniprot_ID'])
human_seeds = set(aggregated[aggregated.source == "PHISTO$ HPIDB"]['Human_Uniprot_ID'])