# Jan 24

Andrew Chung // hc893

In [167]:
import pandas as pd
import numpy as np

In [13]:
# read PPI predicted scores

af3_pred = pd.read_csv("final_af3_results.csv")
dsc_pred = pd.read_csv("dscript_ppis.csv")

## Pre-Processing Steps

In [17]:
af3_pred.head(1)
af3_pred.shape

(6768, 27)

In [18]:
dsc_pred.head(1)
dsc_pred.shape

(9908, 10)

### 1. Drop homo-dimer (same-protein PPI) instances

In [19]:
# drop row if prot1 == prot2
af3_homo_dropped = af3_pred[af3_pred['prot1'] != af3_pred['prot2']]
dsc_homo_dropped = dsc_pred[dsc_pred['prot1'] != dsc_pred['prot2']]

print("Dimensions of AF3: {}".format(af3_homo_dropped.shape))
print("Dimensions of DSC: {}".format(dsc_homo_dropped.shape))

Dimensions of AF3: (6440, 27)
Dimensions of DSC: (9578, 10)


- AF3: **328** homo-dimers
- D-script: **330** homo-dimers

### 2. Drop duplicate instances

In [22]:
# create a sorted instance to identify unique pairs
af3_homo_dropped['pair'] = af3_homo_dropped.apply(lambda row: tuple(sorted([row['prot1'], row['prot2']])), axis = 1)
dsc_homo_dropped['pair'] = dsc_homo_dropped.apply(lambda row: tuple(sorted([row['prot1'], row['prot2']])), axis = 1)

# drop duplicates
af3_uniq = af3_homo_dropped.drop_duplicates(subset = 'pair')
dsc_uniq = dsc_homo_dropped.drop_duplicates(subset = 'pair')

print("Dimensions of AF3: {}".format(af3_uniq.shape))
print("Dimensions of DSC: {}".format(dsc_uniq.shape))

Dimensions of AF3: (6393, 28)
Dimensions of DSC: (9557, 11)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  af3_homo_dropped['pair'] = af3_homo_dropped.apply(lambda row: tuple(sorted([row['prot1'], row['prot2']])), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dsc_homo_dropped['pair'] = dsc_homo_dropped.apply(lambda row: tuple(sorted([row['prot1'], row['prot2']])), axis = 1)


- AF3: **47** duplicates
- D-script: **21** duplicates

### 3. Drop non-mutual columns (not shared between AF3 and D-script)

In [28]:
# identify and drop pairs exclusive to either AF3 or DSC
common_pairs = set(af3_uniq['pair']).intersection(set(dsc_uniq['pair']))

af3_filt = af3_uniq[af3_uniq['pair'].isin(common_pairs)].reset_index(drop = True)
dsc_filt = dsc_uniq[dsc_uniq['pair'].isin(common_pairs)].reset_index(drop = True)

print("Dimensions of AF3: {}".format(af3_filt.shape))
print("Dimensions of DSC: {}".format(dsc_filt.shape))

Dimensions of AF3: (6377, 28)
Dimensions of DSC: (6377, 11)


**6,377** common pairs between AF3 and D-script.

## Combine and Filter with RF2-PPI instances

In [31]:
# import RF2-PPI data
rf2 = pd.read_csv("RF2_unique_protein_pairs.csv")

# serialize pairs
rf2['pair'] = rf2.apply(lambda row: tuple(sorted([row['Protein1'], row['Protein2']])), axis = 1)
rf2.shape

(5930, 3)

There exist **447** protein pairs that were dropped and unable to generate RF2-PPI MSAs.

In [34]:
# extract the non-MSA pairs
non_MSA = sorted(set(af3_filt['pair']) - set(rf2['pair']))
len(non_MSA)

447

## Analyze Protein Pair Instances with Uniprot

To comprehensively analyze each of the 447 pairs, I will utilize the Uniprot RESTful API.

In [48]:
import requests
import tqdm

In [41]:
# extract unique proteins across the non-MSA pairs
unique_proteins = set(protein for pair in non_MSA for protein in pair)
len(unique_proteins)

502

Example API Payload: Protein `Q8NBD8`

In [78]:
response = requests.get('https://rest.uniprot.org/uniprotkb/{}'.format('Q8NBD8'))

if response.status_code == 200:
  print(response.json())
else:
  print('Error {}'.format(response.status_code))

{'entryType': 'UniProtKB reviewed (Swiss-Prot)', 'primaryAccession': 'Q8NBD8', 'uniProtkbId': 'T229B_HUMAN', 'entryAudit': {'firstPublicDate': '2006-12-12', 'lastAnnotationUpdateDate': '2024-10-02', 'lastSequenceUpdateDate': '2002-10-01', 'entryVersion': 133, 'sequenceVersion': 1}, 'annotationScore': 5.0, 'organism': {'scientificName': 'Homo sapiens', 'commonName': 'Human', 'taxonId': 9606, 'lineage': ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']}, 'proteinExistence': '1: Evidence at protein level', 'proteinDescription': {'recommendedName': {'fullName': {'value': 'Transmembrane protein 229B'}}}, 'genes': [{'geneName': {'value': 'TMEM229B'}, 'synonyms': [{'value': 'C14orf83'}]}], 'comments': [{'commentType': 'INTERACTION', 'interactions': [{'interactantOne': {'uniProtKBAccession': 'Q8NBD8', 'intActId': 'EBI-12195227'}, 'interactantTwo': {'uniProtKBAc

Build a list of JSON-formatted Uniprot data, then flatten it into a DF

In [51]:
protein_data = []
url = 'https://rest.uniprot.org/uniprotkb/'

for protein in tqdm.tqdm(unique_proteins):
  response = requests.get(url + protein)

  if response.status_code == 200:
    protein_data.append(response.json())
  else:
    print('Protein {}: Error {}'.format(protein, response.status_code()))
    protein_data.append({})

100%|██████████| 502/502 [03:20<00:00,  2.50it/s]


In [130]:
uniprot_raw = pd.DataFrame(protein_data)
uniprot_raw.head()

Unnamed: 0,entryType,primaryAccession,secondaryAccessions,uniProtkbId,entryAudit,annotationScore,organism,proteinExistence,proteinDescription,genes,comments,features,keywords,references,uniProtKBCrossReferences,sequence,extraAttributes,geneLocations,inactiveReason
0,UniProtKB reviewed (Swiss-Prot),Q9H1A4,"[Q2M3H8, Q9BSE6, Q9H8D0]",APC1_HUMAN,"{'firstPublicDate': '2003-10-03', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'An...,"[{'geneName': {'value': 'ANAPC1'}, 'synonyms':...",[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Chain', 'location': {'start': {'val...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '11...","[{'database': 'EMBL', 'id': 'AJ278357', 'prope...",{'value': 'MSNFYEERTTMIAARDLQEFVPFGRDHCKHHPNAL...,"{'countByCommentType': {'FUNCTION': 1, 'PATHWA...",,
1,UniProtKB reviewed (Swiss-Prot),Q9H4K7,"[A6NDR3, B4DR85, Q96I17, Q9NVG9, Q9UFR4]",MTG2_HUMAN,"{'firstPublicDate': '2003-07-03', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'Mi...,"[{'geneName': {'value': 'MTG2'}, 'synonyms': [...",[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Chain', 'location': {'start': {'val...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '14...","[{'database': 'EMBL', 'id': 'AK001603', 'prope...",{'value': 'MAPARCFSARLRTVFQGVGHWALSTWAGLKPSRLL...,"{'countByCommentType': {'FUNCTION': 1, 'COFACT...",,
2,UniProtKB reviewed (Swiss-Prot),P14678,"[Q15490, Q6IB35, Q9UIS5]",RSMB_HUMAN,"{'firstPublicDate': '1990-04-01', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'Sm...,"[{'geneName': {'value': 'SNRPB'}, 'synonyms': ...",[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Chain', 'location': {'start': {'val...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '25...","[{'database': 'EMBL', 'id': 'X17567', 'propert...",{'value': 'MTVGKSSKMLQHIDYRMRCILQDGRIFIGTFKAFD...,"{'countByCommentType': {'FUNCTION': 1, 'SUBUNI...",,
3,UniProtKB reviewed (Swiss-Prot),O43676,[Q6IB80],NDUB3_HUMAN,"{'firstPublicDate': '1999-07-15', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'NA...,[{'geneName': {'value': 'NDUFB3'}}],[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Initiator methionine', 'location': ...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '94...","[{'database': 'EMBL', 'id': 'AF047183', 'prope...",{'value': 'MAHEHGHEHGHHKMELPDYRQWKIEGTPLETIQKK...,"{'countByCommentType': {'FUNCTION': 1, 'SUBUNI...",,
4,UniProtKB reviewed (Swiss-Prot),P28749,"[A8K2W5, Q4VXA0, Q8N5K6, Q9H1L5, Q9H1M1]",RBL1_HUMAN,"{'firstPublicDate': '1992-12-01', 'lastAnnotat...",5.0,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'Re...,[{'geneName': {'value': 'RBL1'}}],[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Chain', 'location': {'start': {'val...","[{'id': 'KW-0002', 'category': 'Technical term...","[{'referenceNumber': 1, 'citation': {'id': '83...","[{'database': 'EMBL', 'id': 'L14812', 'propert...",{'value': 'MFEDKPHAEGAAVVAAAGEALQALCQELNLDEGSA...,"{'countByCommentType': {'FUNCTION': 1, 'SUBUNI...",,


### Organization of Protein Data

In [175]:
# data organization
uniprot = uniprot_raw.drop(columns = [
  'entryType', 'entryAudit', 'annotationScore', 
  'keywords', 'references', 'uniProtKBCrossReferences',
  'geneLocations', 'inactiveReason'
]).rename(columns = {'primaryAccession' : 'protein'})

uniprot.head(2)


Unnamed: 0,protein,secondaryAccessions,uniProtkbId,organism,proteinExistence,proteinDescription,genes,comments,features,sequence,extraAttributes
0,Q9H1A4,"[Q2M3H8, Q9BSE6, Q9H8D0]",APC1_HUMAN,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'An...,"[{'geneName': {'value': 'ANAPC1'}, 'synonyms':...",[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Chain', 'location': {'start': {'val...",{'value': 'MSNFYEERTTMIAARDLQEFVPFGRDHCKHHPNAL...,"{'countByCommentType': {'FUNCTION': 1, 'PATHWA..."
1,Q9H4K7,"[A6NDR3, B4DR85, Q96I17, Q9NVG9, Q9UFR4]",MTG2_HUMAN,"{'scientificName': 'Homo sapiens', 'commonName...",1: Evidence at protein level,{'recommendedName': {'fullName': {'value': 'Mi...,"[{'geneName': {'value': 'MTG2'}, 'synonyms': [...",[{'texts': [{'evidences': [{'evidenceCode': 'E...,"[{'type': 'Chain', 'location': {'start': {'val...",{'value': 'MAPARCFSARLRTVFQGVGHWALSTWAGLKPSRLL...,"{'countByCommentType': {'FUNCTION': 1, 'COFACT..."


In [176]:
## extract common species name
uniprot['organism'] = uniprot['organism'].apply(
  lambda entry: entry.get('commonName') if isinstance(entry, dict) else None
)

## extract sequence and length
uniprot['sequence'] = uniprot['sequence'].apply(
  lambda entry: entry.get('value') if isinstance(entry, dict) else None
)

def seq_length(seq):
  try:
    return len(seq) if isinstance(seq, str) else 0
  except TypeError:
    return 0

uniprot['length'] = uniprot['sequence'].apply(seq_length)

In [189]:
min(list(uniprot['length']))

0