# Filtering and parsing string-db for COAD associated interactions

Download the STRING (string-db) protein links file for humans from [here](https://string-db.org/cgi/download?sessionId=bpHd4Ji7A7W6&species_text=Homo+sapiens&settings_expanded=0&min_download_score=0&filter_redundant_pairs=0&delimiter_type=txt) to the same directory as this notebook. The code below expects it to be named `9606.protein.links.v12.0.txt`.

Also download the CPTAC phosphoproteomic data from [here](https://kb.linkedomics.org/download#COAD) to the same directory as this notebook.

Strategy:
- Identify the top 50 genes associated with COAD in the Open Targets Platform
- Subset the interaction database to only include interactions that involve at least one of these COAD-associated genes
- Remove interactions with score < 400 (this is an arbitrary threshold for now, but this can be made more statistically rigorous)
- Cross-reference proteins against those measured in CPTAC phosphoproteomics (the rationale for this is that we want to use actual patient omics data for prediction, so the graph should be restricted to genes/proteins for which we will have measurements), and keep only those interactions where both proteins are present in the CPTAC data. **This is the final relevant PPI for the graph neural network to take as input**

In [1]:
import matplotlib.pyplot as plt
from typing import List, Set
import numpy as np
import pandas as pd
import requests


In [2]:
def subset_interaction_db(
    input_file: str,
    output_file: str,
    proteins_of_interest: List[str],
    how: str = "either",
    delimiter: str = " "
):
    """
    Subset a protein-protein interaction file to a set of proteins:
    - Copies the header line to the output
    - Keeps only rows whose proteins match the proteins of interest
    - Tracks and returns the scores of all kept rows for distribution analysis

    Protein IDs are compared exactly as they appear in the file. No prefix is
    stripped, so if the file uses IDs such as "9606.ENSP00000269305" the
    entries of ``proteins_of_interest`` must carry the same "9606." prefix.

    Kept rows are rewritten as ``p1<delimiter>p2<delimiter>score`` with the
    score formatted as a float (e.g. "900" in the input becomes "900.0").
    Rows with fewer than three fields are skipped; extra fields are dropped.

    Args:
        input_file: Path to the PPI file (with header).
        output_file: Path to save filtered results.
        proteins_of_interest: Protein IDs, spelled exactly as in the file.
        how: "either" keeps a row when at least one of its two proteins is
            of interest; "both" keeps it only when both are.
        delimiter: Column separator (default: space).

    Returns:
        List of scores (floats) of the rows written to ``output_file``.

    Raises:
        ValueError: If ``how`` is neither "either" nor "both", or if the
            third field of a row cannot be parsed as a float.
    """
    # Validate before opening anything so a typo in `how` cannot truncate
    # the output file or silently fall back to the "both" behaviour.
    if how not in ("either", "both"):
        raise ValueError(f"how must be 'either' or 'both', got {how!r}")
    require_both = how == "both"

    poi_set: Set[str] = set(proteins_of_interest)
    scores = []

    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        header = infile.readline().strip().split(delimiter)
        outfile.write(delimiter.join(header) + "\n")  # write header

        for line in infile:
            parts = line.strip().split(delimiter)
            if len(parts) < 3:
                continue  # skip malformed lines

            p1, p2 = parts[0], parts[1]
            score = float(parts[2])

            first_hit = p1 in poi_set
            second_hit = p2 in poi_set
            if require_both:
                keep = first_hit and second_hit
            else:
                keep = first_hit or second_hit

            if keep:
                outfile.write(f"{p1}{delimiter}{p2}{delimiter}{score}\n")
                scores.append(score)

    return scores

In [3]:
def plot_score_distribution(scores, bins=50):
    """
    Draw and show a histogram of PPI scores.

    Args:
        scores: Score values to histogram.
        bins: Forwarded to the histogram's ``bins`` argument (default: 50).
    """
    _, axis = plt.subplots(figsize=(7, 5))
    axis.hist(scores, bins=bins, edgecolor="black", alpha=0.7)
    axis.set_xlabel("PPI Score")
    axis.set_ylabel("Frequency")
    axis.set_title("Distribution of Protein-Protein Interaction Scores\nfor Involving COAD-associated Proteins from Open Targets")
    plt.show()

In [4]:
def map_genes_to_string_ids(gene_list, species=9606):
    """
    Map gene names to STRING protein IDs using the STRING ``get_string_ids`` API.

    Args:
        gene_list: Gene names to look up.
        species: Value sent as the ``species`` query parameter (default: 9606).

    Returns:
        Dict mapping each ``queryItem`` in the response to its ``stringId``
        (e.g. "9606.ENSP00000269305"). Genes that the service does not return
        are simply absent. An empty ``gene_list`` yields an empty dict without
        contacting the service.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within the timeout.
    """
    # Materialise first so generators and arrays can be emptiness-checked safely.
    gene_list = list(gene_list)
    if not gene_list:
        return {}

    url = "https://string-db.org/api/json/get_string_ids"
    params = {
        "identifiers": "%0d".join(gene_list),
        "species": species
    }
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # NOTE(review): if the response holds several entries for one queryItem,
    # the last one wins here. Confirm whether the request should restrict the
    # service to a single match per query.
    mapping = {}
    for entry in data:
        mapping[entry["queryItem"]] = entry["stringId"]  # e.g., "9606.ENSP00000269305"
    return mapping


These are the top 50 genes associated with COAD in Open Targets Platform

In [5]:
# Gene symbols used as the seed set for the interaction network (50 entries).
top_associated_genes = [
    "APC", "KRAS", "EGFR", "BRAF", "PIK3CA", "FBXW7", "TP53", "TCF7L2", "ERBB2", "ATM",
    "AMER1", "MET", "SMAD4", "NRAS", "MSH6", "PTEN", "ARID1A", "RNF43", "NTRK1", "SMAD2",
    "ERBB3", "AKT1", "BCL9L", "FBXO11", "POLE", "GNAS", "KDR", "MLH1", "MSH2", "ACVR2A",
    "MAP2K1", "PIK3R1", "SMAD3", "ALK", "MUTYH", "FLT4", "TGFBR2", "RBM10", "MTOR", "AR",
    "TERT", "PMS2", "USP6", "BCORL1", "PCBP1", "FAT4", "EP300", "RAF1", "BCOR", "PDGFRB",
]

In [6]:
# Look up a STRING protein ID for every seed gene. The result is a dict of
# gene name -> STRING ID (e.g. "9606.ENSP00000269305"); genes missing from the
# service's response are absent from the dict.
ensembleIDs = map_genes_to_string_ids(top_associated_genes)

In [None]:
# Keep every interaction in the full STRING links file that involves at least
# one seed protein (default how="either"), and collect the kept scores.
scores = subset_interaction_db(input_file="./9606.protein.links.v12.0.txt", 
                               output_file="./open.targets.COAD.associated.protein.links.txt", 
                               proteins_of_interest=list(ensembleIDs.values()))

In [None]:
# Convert the list of kept scores to a NumPy array before plotting.
scores = np.array(scores)

In [None]:
# Inspect the score distribution of the seed-protein interactions.
plot_score_distribution(scores)

In [None]:
def filter_by_score(
    input_file: str,
    output_file: str,
    threshold: float,
    delimiter: str = " "
) -> Set[str]:
    """
    Filter a subsetted PPI file by score threshold and collect unique proteins.

    The header line is copied to the output. Each following row is read as
    ``protein1<delimiter>protein2<delimiter>score``; only the first three
    fields are interpreted, and rows with fewer than three fields are skipped.
    Rows that pass are written to the output verbatim.

    Args:
        input_file: Path to the subsetted PPI file (with header).
        output_file: Path to save the filtered results.
        threshold: Minimum score to keep (rows with score >= threshold pass).
        delimiter: Column separator (default: space).

    Returns:
        A set of unique protein IDs present in the filtered file.

    Raises:
        ValueError: If the third field of a row cannot be parsed as a float.
    """
    unique_proteins: Set[str] = set()

    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        header = infile.readline().strip()
        outfile.write(header + "\n")

        for line in infile:
            parts = line.strip().split(delimiter)
            if len(parts) < 3:
                continue
            # Slice rather than unpack the whole row: rows carrying extra
            # columns must not raise, matching the `< 3` guard above.
            p1, p2, score_str = parts[:3]

            if float(score_str) >= threshold:
                outfile.write(line)
                unique_proteins.update((p1, p2))

    return unique_proteins

In [None]:
# Drop interactions scoring below 400 and collect every protein that still
# appears in at least one remaining interaction.
unique_protein_IDs = filter_by_score(input_file="./open.targets.COAD.associated.protein.links.txt", 
                output_file="./open.targets.COAD.associated.protein.links.filtered.txt", threshold=400)

In [None]:
# List copy of the proteins that survived the score filter.
stringDB_protein_IDs = list(unique_protein_IDs)

## Cross referencing these against proteins in phosphoproteomic data

In [None]:
# NOTE: adjust this path if the CPTAC phosphoproteomic file lives elsewhere.
# The tab-separated table is read with its first column as the index and then
# transposed, so the file's rows become the columns of `phosphoproteome`.
phosphoproteome = pd.read_csv("./COAD_phospho_site_abundance_log2_reference_intensity_normalized_Tumor.txt",
                             sep="\t", index_col=0).T

In [None]:
# Display the transposed table as the cell output.
phosphoproteome

In [None]:
# For each column name: take the second "|"-separated field, keep the part
# before its first ".", and prepend "9606." so the IDs are spelled like the
# STRING IDs used above. One entry per column, so duplicates are expected.
# NOTE(review): presumably the second field is a versioned Ensembl protein ID;
# confirm against the downloaded file.
cptac_protein_IDS = [
    "9606." + column_name.split("|")[1].split(".")[0]
    for column_name in phosphoproteome.columns
]

In [None]:
# Number of distinct proteins found both in the filtered STRING interactions
# and in the CPTAC column names (shown as the cell output).
len(set(stringDB_protein_IDs) & set(cptac_protein_IDS))

There are ~2,300 proteins in the filtered protein-protein interaction network that are also measured in the CPTAC phosphoproteomic data. Since only these proteins can be modelled with CPTAC measurements, I'll create one further subset of the interaction network in which both interacting proteins are present in the phosphoproteomic data.

In [None]:
# Final subset: keep an interaction only when both of its proteins are in the
# STRING/CPTAC overlap (how='both'), and collect the scores of the kept edges.
scores_final = subset_interaction_db(input_file="./open.targets.COAD.associated.protein.links.filtered.txt",
                      output_file="./open.targets.COAD.associated.protein.links.cptac.txt",
                      proteins_of_interest=list(set(stringDB_protein_IDs) & set(cptac_protein_IDS)),
                      how='both')

In [None]:
# Count nodes from the edge list that was actually written. The STRING/CPTAC
# overlap is only the candidate set: a candidate whose every partner was
# dropped has no edge left and is not part of the final network.
final_nodes = set()
with open("./open.targets.COAD.associated.protein.links.cptac.txt", "r") as final_file:
    next(final_file, None)  # skip the header line
    for edge_line in final_file:
        fields = edge_line.strip().split(" ")
        if len(fields) >= 3:
            final_nodes.update(fields[:2])

print(
    f"Final PPI network generated with {len(scores_final)} edges "
    f"and {len(final_nodes)} nodes."
)