In [1]:
import sys

# Make the parent directory importable so local scripts (the `utils` folder) resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
# NOTE(review): ".." is relative to the kernel's working directory — confirm the
# notebook is always launched from its own folder.
if ".." not in sys.path:
    sys.path.append("..")

# Getting the PPI

The only other way to recover more rows is to relax the regex so that it also matches IDs that do not start with `NP_`.

In [108]:
import re
import pandas as pd


def magic2dataframe(path):
    """Parse a PPI text file into a two-column DataFrame of RefSeq protein IDs.

    Every line is matched from its start against
    ``<digits> <NP_id>.<version> <NP_id>.<version>``. Lines that do not match
    (for example lines where either accession lacks the ``NP_`` prefix) are
    skipped silently. The ``.<version>`` suffix is dropped from both IDs.

    Parameters
    ----------
    path : str or path-like
        Text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Protein_1_ID`` and ``Protein_2_ID``, one row per matching
        line, in file order. Has both columns and zero rows when nothing
        matches.
    """
    # Compiled once instead of being looked up again for every line.
    pattern = re.compile(r"^\d+\s+(NP_\d+)\.\d+\s+(NP_\d+)\.\d+")

    with open(path, "r") as f:
        # Collect the two captured IDs directly as tuples; no need to join them
        # into a string and split them apart again.
        id_pairs = [match.groups() for match in map(pattern.match, f) if match is not None]

    return pd.DataFrame(id_pairs, columns=["Protein_1_ID", "Protein_2_ID"])

In [109]:
# Source of both files: http://www.csbio.sjtu.edu.cn/bioinf/LR_PPI/Data.htm
POSITIVE_PPI_PATH = "../datasets/raw/Supp-A.txt"
NEGATIVE_PPI_PATH = "../datasets/raw/Supp-B.txt"

# Parse each file and tag every pair with its class label.
# The labels are the strings "1" / "0", not integers.
pos_df = magic2dataframe(POSITIVE_PPI_PATH).assign(**{"Interact?": "1"})
neg_df = magic2dataframe(NEGATIVE_PPI_PATH).assign(**{"Interact?": "0"})

neg_df.head()

Unnamed: 0,Protein_1_ID,Protein_2_ID,Interact?
0,NP_002189,NP_004160,0
1,NP_004070,NP_000846,0
2,NP_003073,NP_000365,0
3,NP_000624,NP_068815,0
4,NP_001941,NP_002717,0


In [110]:
# Stack the first two columns (both protein-ID columns) of the positive set into one Series.
# Only `pos_df` contributes here; `neg_df` is not included.
all_np_prots = pd.concat([pos_df[column] for column in pos_df.columns[:2]])

# (total entries, distinct IDs). The distinct count is lower than the 9476 proteins the
# source file reports; the author attributes the gap to lines the regex does not match.
len(all_np_prots), all_np_prots.nunique()

(69602, 9140)

In [112]:
import numpy as np

# Convert both labelled tables to NumPy arrays, then write them to datasets/processed/.
# NOTE(review): the columns hold strings, so these are presumably object arrays and
# np.load would need allow_pickle=True to read them back — confirm.
pos_arr, neg_arr = pos_df.to_numpy(), neg_df.to_numpy()

np.save("../datasets/processed/positive_ppi.npy", pos_arr)
np.save("../datasets/processed/negative_ppi.npy", neg_arr)

In [113]:
# Display the positive-interaction table (IDs plus the "Interact?" label column).
pos_df

Unnamed: 0,Protein_1_ID,Protein_2_ID,Interact?
0,NP_663777,NP_001233,1
1,NP_055436,NP_001420,1
2,NP_003630,NP_001073594,1
3,NP_001001998,NP_067000,1
4,NP_001888,NP_001782,1
...,...,...,...
34796,NP_005357,NP_060529,1
34797,NP_863651,NP_612815,1
34798,NP_006368,NP_008918,1
34799,NP_002046,NP_055517,1


In [114]:
# Stack positives on top of negatives and renumber the rows 0..n-1 in one step.
interaction_df = pd.concat([pos_df, neg_df], ignore_index=True)

# Show the three rows on each side of the positive/negative boundary.
interaction_df.iloc[len(pos_df) - 3 : len(pos_df) + 3]

Unnamed: 0,Protein_1_ID,Protein_2_ID,Interact?
34798,NP_006368,NP_008918,1
34799,NP_002046,NP_055517,1
34800,NP_055482,NP_068780,1
34801,NP_002189,NP_004160,0
34802,NP_004070,NP_000846,0
34803,NP_003073,NP_000365,0


In [115]:
# Total number of labelled pairs (positive rows followed by negative rows).
len(interaction_df)

46426

To get the PDB protein identifiers, we have to use UniProt as a middleman:

`RefSeq ID --> UniProt ID --> PDB ID`

### RefSeq ID --> UniProt ID

In [116]:
import os
from utils.uniprot_idmapping import (
    submit_id_mapping,
    check_id_mapping_results_ready,
    get_id_mapping_results_link,
    get_id_mapping_results_search,
)


# RefSeq -> UniProt mapping. The UniProt job is only submitted when the CSV cache
# is missing; otherwise the cached accessions are loaded.
#
# NOTE: the two branches leave `results` in different shapes. After a fresh job it is
# the raw response payload (the next cell indexes it as results["results"]); when the
# cache exists it is a flat list of UniProt accessions.
if not os.path.exists("../datasets/processed/refseq_to_uniprot_df.csv"):
    # NOTE(review): this list still contains repeated IDs (every occurrence in the
    # positive set); consider de-duplicating before submission.
    ids = all_np_prots.tolist()

    job_id = submit_id_mapping(
        from_db="RefSeq_Protein",
        to_db="UniProtKB",
        ids=ids,
    )

    # Fail loudly here instead of leaving `results` undefined and hitting a
    # NameError in a later cell.
    # NOTE(review): assumes the helper returns a falsy value when the job produced
    # nothing — its implementation is not visible from this notebook.
    if not check_id_mapping_results_ready(job_id):
        raise RuntimeError(f"UniProt ID-mapping job {job_id} finished without results")

    link = get_id_mapping_results_link(job_id)
    results = get_id_mapping_results_search(link)
else:
    results = pd.read_csv("../datasets/processed/refseq_to_uniprot_df.csv")["Uniprot"].tolist()

In [117]:
# Only runs when the cache is missing, i.e. when `results` holds a fresh job payload.
if not os.path.exists("../datasets/processed/refseq_to_uniprot_df.csv"):
    # One [RefSeq ID, UniProt primary accession] row per mapping hit.
    refseq_to_uniprot_df = pd.DataFrame(
        [[hit["from"], hit["to"]["primaryAccession"]] for hit in results["results"]],
        columns=["RefSeq", "Uniprot"],
    )
    refseq_to_uniprot_df.to_csv("../datasets/processed/refseq_to_uniprot_df.csv", index=False)

    print(len(refseq_to_uniprot_df))

In [118]:
# Load the RefSeq -> UniProt mapping from the CSV cache and display it.
refseq_to_uniprot_df = pd.read_csv("../datasets/processed/refseq_to_uniprot_df.csv")
refseq_to_uniprot_df

Unnamed: 0,RefSeq,Uniprot
0,NP_663777,Q13114
1,NP_055436,P09016
2,NP_003630,Q9Y6K9
3,NP_001001998,Q01780
4,NP_001888,Q6UVK1
...,...,...
12648,NP_055642,O75152
12649,NP_079191,Q6ISB3
12650,NP_060754,Q9H0R5
12651,NP_068780,P28347


In [119]:
# Take the accessions from the mapping table rather than from `results`: `results`
# is a flat list only when the CSV cache already existed; right after a fresh UniProt
# job it is the raw response payload (indexed by "results"), so slicing it would fail.
uniprot_ids = refseq_to_uniprot_df["Uniprot"].tolist()

# (number of mapped rows, distinct accessions, first five accessions)
len(uniprot_ids), len(np.unique(uniprot_ids)), uniprot_ids[:5]

(12653, 12613, ['Q13114', 'P09016', 'Q9Y6K9', 'Q01780', 'Q6UVK1'])

### UniProt ID --> PDB ID

In [120]:
# UniProt -> PDB mapping; the job is only submitted when the CSV cache is missing.
if not os.path.exists("../datasets/processed/uniprot_to_pdb_df.csv"):
    job_id = submit_id_mapping(
        from_db="UniProtKB_AC-ID",
        to_db="PDB",
        # Read the accessions from the mapping table: `results` only holds a flat
        # list of accessions when the RefSeq -> UniProt cache already existed.
        ids=refseq_to_uniprot_df["Uniprot"].tolist(),
    )

    # Fail loudly here; otherwise the next cell would build a DataFrame from a
    # stale `results` left over from the previous mapping step.
    # NOTE(review): assumes the helper returns a falsy value when the job produced
    # nothing — its implementation is not visible from this notebook.
    if not check_id_mapping_results_ready(job_id):
        raise RuntimeError(f"UniProt ID-mapping job {job_id} finished without results")

    link = get_id_mapping_results_link(job_id)

    # From here on `results` is a list of [UniProt accession, PDB ID] pairs.
    results = [
        [hit["from"], hit["to"]]
        for hit in get_id_mapping_results_search(link)["results"]
    ]

In [121]:
# Load the UniProt -> PDB table from its CSV cache, or build it from `results` and cache it.
if os.path.exists("../datasets/processed/uniprot_to_pdb_df.csv"):
    uniprot_to_pdb_df = pd.read_csv("../datasets/processed/uniprot_to_pdb_df.csv")
else:
    uniprot_pdb_pairs = pd.DataFrame(results, columns=["Uniprot", "PDB"])
    # Keep only the first PDB entry listed for each UniProt accession.
    uniprot_to_pdb_df = uniprot_pdb_pairs.drop_duplicates(subset=["Uniprot"], keep="first")
    uniprot_to_pdb_df.to_csv("../datasets/processed/uniprot_to_pdb_df.csv", index=False)

uniprot_to_pdb_df

Unnamed: 0,Uniprot,PDB
0,Q13114,1FLK
1,Q9Y6K9,2JVX
2,Q01780,2CPR
3,Q6UVK1,7ML7
4,P14672,7WSM
...,...,...
5399,Q14117,2VR2
5400,Q14324,2E7C
5401,Q9NWV4,5ZLQ
5402,P49458,1E8O


In [122]:
# Both tables share the "Uniprot" column, which is the key used to join them below.
refseq_to_uniprot_df.columns, uniprot_to_pdb_df.columns

(Index(['RefSeq', 'Uniprot'], dtype='object'),
 Index(['Uniprot', 'PDB'], dtype='object'))

In [123]:
# Chain the two mappings into RefSeq -> UniProt -> PDB by merging on the shared
# "Uniprot" column, then keep only the first row for each RefSeq ID.
final_idmapping_df = (
    refseq_to_uniprot_df
    .merge(uniprot_to_pdb_df, on="Uniprot")
    .drop_duplicates(subset=["RefSeq"])
)
final_idmapping_df

Unnamed: 0,RefSeq,Uniprot,PDB
0,NP_663777,Q13114,1FLK
1,NP_003630,Q9Y6K9,2JVX
2,NP_001001998,Q01780,2CPR
3,NP_001888,Q6UVK1,7ML7
4,NP_001033,P14672,7WSM
...,...,...,...
5425,NP_001376,Q14117,2VR2
5426,NP_004524,Q14324,2E7C
5427,NP_060357,Q9NWV4,5ZLQ
5428,NP_003124,P49458,1E8O


In [144]:
COLS_ORDER = ["Protein_1_ID", "Uniprot_1", "PDB_1", "Protein_2_ID", "Uniprot_2", "PDB_2", "Interact?"]

# Attach the UniProt/PDB identifiers to each side of every pair. The second merge
# suffixes the mapping columns that now appear twice with _1 (first protein) and
# _2 (second protein).
ppi_dataset = (
    interaction_df
    .merge(final_idmapping_df, left_on="Protein_1_ID", right_on="RefSeq", how="left")
    .merge(final_idmapping_df, left_on="Protein_2_ID", right_on="RefSeq", how="left", suffixes=("_1", "_2"))
    # The RefSeq_* columns are the merge keys from the mapping table, not needed afterwards.
    .drop(columns=["RefSeq_1", "RefSeq_2"])
    # Keep only pairs where both proteins have a PDB entry.
    .dropna(subset=["PDB_1", "PDB_2"])
    .drop_duplicates()
    .loc[:, COLS_ORDER]
)

ppi_dataset

Unnamed: 0,Protein_1_ID,Uniprot_1,PDB_1,Protein_2_ID,Uniprot_2,PDB_2,Interact?
0,NP_663777,Q13114,1FLK,NP_001233,P26842,5TL5,1
2,NP_003630,Q9Y6K9,2JVX,NP_001073594,Q14790,1F9E,1
3,NP_001001998,Q01780,2CPR,NP_067000,Q9Y333,3JCR,1
4,NP_001888,Q6UVK1,7ML7,NP_001782,P60953,1A4R,1
5,NP_001033,P14672,7WSM,NP_002037,P04406,1U8F,1
...,...,...,...,...,...,...,...
46416,NP_060589,Q9NVU0,7A6H,NP_002878,P54136,4Q2T,0
46418,NP_003176,O00268,1H3O,NP_003240,P52888,1S4B,0
46421,NP_000779,P27707,1P5Z,NP_005557,P00338,1I10,0
46423,NP_008835,P78527,5LUQ,NP_006576,P50990,6NR8,0


In [147]:
# Shuffle the rows before saving. The shuffle is cosmetic, but it is seeded so that
# the written CSV (and everything derived from its row order) is identical on every
# run of the notebook.
shuffled_ppi_dataset = ppi_dataset.sample(frac=1, random_state=42)
shuffled_ppi_dataset.to_csv("../datasets/processed/ppi_dataset.csv", index=False)

shuffled_ppi_dataset

Unnamed: 0,Protein_1_ID,Uniprot_1,PDB_1,Protein_2_ID,Uniprot_2,PDB_2,Interact?
26923,NP_604391,P16220,2LXT,NP_003063,P51532,2GRC,1
34262,NP_000263,O15259,1S1N,NP_001447,P21333,2AAV,1
664,NP_057424,Q8TEU7,2D93,NP_066361,P10114,1KAO,1
29716,NP_001505,Q00403,1C9B,NP_003241,P10827,1NAV,1
33408,NP_006316,P62826,1I2M,NP_001092755,P06454,2L9I,1
...,...,...,...,...,...,...,...
20780,NP_004738,Q8TDM6,1UIT,NP_001895,P35222,1G3J,1
1519,NP_055040,P30153,1B3U,NP_000827,O15399,7YFM,1
25655,NP_006182,Q9UQ80,2Q8K,NP_001973,P21860,1M6B,1
40769,NP_003205,Q99594,5EMW,NP_002717,P48147,3DDU,0


In [143]:
# (interacting pairs, non-interacting pairs). Labels are compared as strings
# because the "Interact?" column was filled with "1" / "0".
len(ppi_dataset[ppi_dataset['Interact?'] == '1']), len(ppi_dataset[ppi_dataset['Interact?'] == '0'])

(20851, 5654)

In [126]:
# Array view of the final dataset. Column positions follow COLS_ORDER, so
# index 2 is PDB_1 and index 5 is PDB_2.
ppi_dataset_np = shuffled_ppi_dataset.to_numpy()
# (distinct PDB IDs on the first side, distinct PDB IDs on the second side, number of pairs)
len(np.unique(ppi_dataset_np[:, 2])), len(np.unique(ppi_dataset_np[:, 5])), len(ppi_dataset_np)

(3956, 3649, 26505)

# Getting PDB Files for Single Protein Structure

In [132]:
# Pool the PDB IDs from both sides of every pair (array columns 2 and 5), then de-duplicate.
pdb_ids = np.concatenate([ppi_dataset_np[:, 2], ppi_dataset_np[:, 5]])
unique_pdb_ids = np.unique(pdb_ids)

# (all PDB ID occurrences, distinct PDB IDs)
len(pdb_ids), len(unique_pdb_ids)

(53010, 4644)

In [None]:
import os
import requests

# Counters reported by the summary cell below.
n_exists = 0
n_success = 0
n_failure = 0

PDB_ROOT = "../datasets/raw/pdbs/"
# Seconds to wait on a single request; without a timeout one stalled
# connection would hang the whole loop indefinitely.
PDB_REQUEST_TIMEOUT = 30

# Create the target folder up front; open() below fails if it is missing.
os.makedirs(PDB_ROOT, exist_ok=True)

# A single session is reused for all downloads instead of opening a new
# connection per file.
with requests.Session() as session:
    for pdb_id in unique_pdb_ids:
        file_path = os.path.join(PDB_ROOT, f"{pdb_id}.pdb")

        # Files already on disk are skipped, so the cell can be re-run to resume.
        if os.path.exists(file_path):
            print(f"PDB file `{pdb_id}.pdb` already exists")
            n_exists += 1
            continue

        api_url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        try:
            response = session.get(api_url, timeout=PDB_REQUEST_TIMEOUT)
        except requests.RequestException:
            # Network errors and timeouts count as a failed download rather than
            # aborting the remaining IDs.
            response = None

        if response is not None and response.status_code == 200:
            # Write the payload bytes unchanged; going through response.text would
            # re-encode the file using a guessed charset and the platform default.
            with open(file_path, "wb") as pdb_file:
                pdb_file.write(response.content)

            n_success += 1
            print(f"PDB file downloaded and saved as '{file_path}'")
        else:
            n_failure += 1
            print(f"Failed to download the PDB file for ID: {pdb_id}")

In [134]:
# Summary of the download loop: skipped / newly downloaded / failed.
print(
    f"Already have {n_exists} PDB files",
    f"Downloaded {n_success} PDB files",
    f"Failed to get {n_failure} PDB files",
    sep="\n",
)

Already have 3795 PDB files
Downloaded 810 PDB files
Failed to get 39 PDB files


In [136]:
# Counts every entry in the folder, whatever its extension.
n_pdb_dir_entries = len(os.listdir("../datasets/raw/pdbs/"))
print("You have", n_pdb_dir_entries, "PDB files")

You have 4788 PDB files
