In [7]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import networkx as nx
import os
import dotenv
dotenv.load_dotenv()

class ProteinGraph:
    """Incrementally explored protein-protein interaction graph.

    Two gzip-compressed tables are loaded at construction time:

    * a tab-separated protein info table that must contain the columns
      ``#string_protein_id`` and ``preferred_name``;
    * a space-separated links table that must contain the columns
      ``protein1``, ``protein2`` (protein IDs) and ``combined_score``.

    Attributes:
        protein_info: The info table as a DataFrame.
        protein_links: The links table as a DataFrame (never modified by
            the methods of this class).
        name_to_id: dict mapping preferred name -> protein ID.
        id_to_name: dict mapping protein ID -> preferred name.
        graph: ``networkx.Graph`` whose nodes are preferred names and whose
            edges carry a ``weight`` attribute (the combined score).
        start_protein: Name used as the origin when ranking unvisited nodes
            by distance in ``explore_protein``.
        visited: Set of protein names already expanded.
    """

    def __init__(self, info_file_path, links_file_path, start_protein):
        """Load both tables, build the name/ID lookups, and start with an empty graph.

        Args:
            info_file_path: Path to the gzip-compressed, tab-separated info table.
            links_file_path: Path to the gzip-compressed, space-separated links table.
            start_protein: Preferred name of the protein exploration is measured from.
        """
        self.protein_info = pd.read_csv(
            info_file_path, sep='\t', compression='gzip')
        self.protein_links = pd.read_csv(
            links_file_path, sep=' ', compression='gzip')

        # Lookups in both directions; if a key occurs more than once the
        # last row wins (plain dict semantics).
        self.name_to_id = dict(
            zip(self.protein_info['preferred_name'], self.protein_info['#string_protein_id']))
        self.id_to_name = dict(
            zip(self.protein_info['#string_protein_id'], self.protein_info['preferred_name']))

        self.graph = nx.Graph()
        self.start_protein = start_protein
        self.visited = set()

    def get_interacting_proteins(self, protein_name):
        """Return every interaction partner of ``protein_name``, best score first.

        Args:
            protein_name: Preferred name of the protein of interest.

        Returns:
            A DataFrame with columns ``protein1`` (always ``protein_name``),
            ``protein2`` (the partner's preferred name) and ``combined_score``
            (float, the maximum score over duplicate rows for the same pair),
            sorted by score in descending order. The frame is empty when the
            protein has no links. If the name is unknown, an error message
            string is returned instead of a DataFrame.
        """
        if protein_name not in self.name_to_id:
            return f"Protein '{protein_name}' not found in the dataset."

        protein_id = self.name_to_id[protein_name]
        links = self.protein_links

        # Rows where the protein appears on either side of the link.
        touches_target = (links['protein1'] == protein_id) | (
            links['protein2'] == protein_id)
        hits = links.loc[touches_target, [
            'protein1', 'protein2', 'combined_score']]

        result_columns = ['protein1', 'protein2', 'combined_score']
        if hits.empty:
            # Known protein without any link: report "no partners" rather
            # than failing further down.
            return pd.DataFrame(columns=result_columns)

        # Orient every row so the queried protein is protein1: the partner is
        # protein2 unless protein2 is the queried protein itself.
        partner_ids = hits['protein2'].where(
            hits['protein2'] != protein_id, hits['protein1'])

        # Build a fresh frame instead of writing into a slice of
        # self.protein_links (avoids SettingWithCopyWarning and guarantees
        # the loaded table is left untouched).
        oriented = pd.DataFrame(
            {
                'protein1': protein_id,
                'protein2': partner_ids,
                'combined_score': hits['combined_score'].astype(float),
            },
            index=hits.index,
        )

        # A pair may be listed in both directions; keep its highest score.
        best = oriented.groupby(
            ['protein1', 'protein2'], as_index=False
        ).agg({'combined_score': 'max'})

        # Report preferred names rather than IDs.
        best['protein1'] = best['protein1'].map(self.id_to_name)
        best['protein2'] = best['protein2'].map(self.id_to_name)

        return best.sort_values(
            by='combined_score', ascending=False).reset_index(drop=True)

    def get_protein_info(self, protein_name):
        """Return the info-table rows for ``protein_name``.

        Returns an error message string instead when the name is unknown.
        """
        if protein_name not in self.name_to_id:
            return f"Protein '{protein_name}' not found in the dataset."

        return self.get_protein_info_by_id(self.name_to_id[protein_name])

    def get_protein_info_by_id(self, protein_id):
        """Return the info-table rows whose ``#string_protein_id`` equals ``protein_id``."""
        return self.protein_info[self.protein_info['#string_protein_id'] == protein_id]

    def explore_protein(self, protein_name=None, top_n=5):
        """Expand one protein by adding its ``top_n`` best-scoring links to the graph.

        Args:
            protein_name: Protein to expand. When ``None``, the unvisited
                graph node closest to ``self.start_protein`` is chosen; nodes
                that cannot be reached from it (or all nodes, when the start
                protein is not in the graph) rank last.
            top_n: Maximum number of links to add.

        Returns:
            ``(protein_name, edges)`` where ``edges`` is the value returned by
            ``add_protein_and_edges``. ``None`` when no protein was given and
            the graph is empty or has no unvisited node.
        """
        if protein_name is None:
            if not self.graph.nodes:
                print("No starting protein specified.")
                return

            # Candidates are all nodes that have not been expanded yet.
            unvisited = [
                node for node in self.graph.nodes() if node not in self.visited]
            if not unvisited:
                print("No leaf nodes to explore from current graph.")
                return

            # One breadth-first pass from the start protein gives all depths;
            # per-node shortest_path_length calls would raise when the start
            # protein is absent from the graph or a node is unreachable.
            if self.start_protein in self.graph:
                depths = nx.single_source_shortest_path_length(
                    self.graph, self.start_protein)
            else:
                depths = {}

            # min() keeps the first candidate among equal depths, i.e. graph
            # insertion order breaks ties.
            protein_name = min(
                unvisited, key=lambda node: depths.get(node, float('inf')))

        self.visited.add(protein_name)
        edges = self.add_protein_and_edges(protein_name, top_n)
        return protein_name, edges

    def add_protein_and_edges(self, protein_name, top_n):
        """Add the ``top_n`` highest-scoring links of ``protein_name`` to the graph.

        Returns:
            The DataFrame of links that were added, or ``None`` (after
            printing a message) when the name is unknown.
        """
        if protein_name not in self.name_to_id:
            print(f"Protein '{protein_name}' not found in the dataset.")
            return

        # Already sorted by score, so head() selects the strongest links.
        potential_edges = self.get_interacting_proteins(
            protein_name).head(top_n)

        for source, target, score in zip(
                potential_edges['protein1'],
                potential_edges['protein2'],
                potential_edges['combined_score']):
            self.graph.add_edge(source, target, weight=score)

        return potential_edges

    def visualize_graph(self):
        """Draw the current graph with matplotlib, labelling edges with their weight."""
        plt.figure(figsize=(10, 10))
        # Fixed seed so repeated calls produce the same layout.
        pos = nx.spring_layout(self.graph, seed=42)
        nx.draw(self.graph, pos, with_labels=True, node_color='skyblue',
                node_size=2000, font_size=10, font_weight='bold')
        labels = nx.get_edge_attributes(self.graph, 'weight')
        nx.draw_networkx_edge_labels(self.graph, pos, edge_labels=labels)
        plt.title("Protein Interaction Graph")
        plt.show()

    def get_graph_data_for_visualization(self):
        """Return the graph as ``(nodes, edges)`` lists of plain dicts.

        Nodes have keys ``id`` and ``label``; edges have keys ``source``,
        ``target`` and ``weight``.
        """
        nodes = [{"id": node, "label": node} for node in self.graph.nodes()]
        edges = [{"source": u, "target": v, "weight": d["weight"]}
                 for u, v, d in self.graph.edges(data=True)]
        return nodes, edges

def printj(json_obj):
    """Pretty-print a JSON-serialisable object to stdout with 4-space indentation."""
    rendered = json.dumps(json_obj, indent=4)
    print(rendered)


def clean_indent(s):
    """Return ``s`` with the leading whitespace removed from every line.

    Lines are split on ``'\\n'`` only; trailing whitespace is kept.
    """
    stripped_lines = map(str.lstrip, s.split('\n'))
    return '\n'.join(stripped_lines)


if __name__ == '__main__':
    # Build the graph from the two gzip-compressed tables that sit next to
    # the notebook, using ARF5 as the protein exploration starts from.
    info_file_path = './9606.protein.info.v12.0.txt.gz'
    links_file_path = './9606.protein.links.v12.0.txt.gz'
    protein_graph = ProteinGraph(
        info_file_path=info_file_path,
        links_file_path=links_file_path,
        start_protein='ARF5',
    )

In [2]:
import pandas as pd
# Paths of the two gzip-compressed input tables (relative to the notebook).
info_file_path = './9606.protein.info.v12.0.txt.gz'
links_file_path = './9606.protein.links.v12.0.txt.gz'

# Protein info table: tab-separated.
protein_info = pd.read_csv(info_file_path, compression='gzip', sep='\t')

# Protein links table: space-separated.
protein_links = pd.read_csv(links_file_path, compression='gzip', sep=' ')

In [3]:
# Build an order-independent key for every link: the (smaller, larger) pair of
# protein IDs, so that an A-B row and a B-A row share the same key.
# Computed column-wise; a row-wise apply(axis=1) calls a Python function once
# per row, which is very slow on a table with millions of rows.
first_is_smaller = protein_links['protein1'] <= protein_links['protein2']
protein_links['sorted_proteins'] = list(zip(
    protein_links['protein1'].where(first_is_smaller, protein_links['protein2']),
    protein_links['protein2'].where(first_is_smaller, protein_links['protein1']),
))

protein_links

Unnamed: 0,protein1,protein2,combined_score,sorted_proteins
0,9606.ENSP00000000233,9606.ENSP00000356607,173,"(9606.ENSP00000000233, 9606.ENSP00000356607)"
1,9606.ENSP00000000233,9606.ENSP00000427567,154,"(9606.ENSP00000000233, 9606.ENSP00000427567)"
2,9606.ENSP00000000233,9606.ENSP00000253413,151,"(9606.ENSP00000000233, 9606.ENSP00000253413)"
3,9606.ENSP00000000233,9606.ENSP00000493357,471,"(9606.ENSP00000000233, 9606.ENSP00000493357)"
4,9606.ENSP00000000233,9606.ENSP00000324127,201,"(9606.ENSP00000000233, 9606.ENSP00000324127)"
...,...,...,...,...
13715399,9606.ENSP00000501317,9606.ENSP00000475489,195,"(9606.ENSP00000475489, 9606.ENSP00000501317)"
13715400,9606.ENSP00000501317,9606.ENSP00000370447,158,"(9606.ENSP00000370447, 9606.ENSP00000501317)"
13715401,9606.ENSP00000501317,9606.ENSP00000312272,226,"(9606.ENSP00000312272, 9606.ENSP00000501317)"
13715402,9606.ENSP00000501317,9606.ENSP00000402092,169,"(9606.ENSP00000402092, 9606.ENSP00000501317)"


In [4]:
# Keep the first row seen for each unordered protein pair, then discard the
# helper key column since it is only needed for the de-duplication itself.
df_unique_edges = (
    protein_links
    .drop_duplicates(subset='sorted_proteins', keep='first')
    .drop(columns=['sorted_proteins'])
)

In [5]:
# Display the de-duplicated edge list (one row per unordered protein pair).
df_unique_edges

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,173
1,9606.ENSP00000000233,9606.ENSP00000427567,154
2,9606.ENSP00000000233,9606.ENSP00000253413,151
3,9606.ENSP00000000233,9606.ENSP00000493357,471
4,9606.ENSP00000000233,9606.ENSP00000324127,201
...,...,...,...
13697500,9606.ENSP00000500595,9606.ENSP00000500990,222
13697880,9606.ENSP00000500595,9606.ENSP00000501092,410
13700159,9606.ENSP00000500934,9606.ENSP00000501025,226
13705748,9606.ENSP00000501092,9606.ENSP00000501146,240


In [7]:
def get_top_n_interactions(df, n=5):
    """Keep, for every protein, its ``n`` highest-scoring interactions.

    The selection is done twice, once grouping rows by ``protein1`` and once
    by ``protein2``, and the two selections are stacked. A row chosen by both
    passes is kept only once.

    Args:
        df: DataFrame with columns ``protein1``, ``protein2`` and
            ``combined_score``.
        n: Number of rows to keep per protein and per side. Values below
            zero are treated as zero.

    Returns:
        DataFrame with the same columns as ``df``. Rows selected by the
        ``protein1`` pass come first, followed by the not-yet-seen rows of
        the ``protein2`` pass; within a pass rows are ordered by the grouping
        protein and then by descending score. Each pass is re-indexed from 0
        before stacking, so index labels are not unique.
    """
    n = max(n, 0)

    def _top_n_per(column):
        # Sort once and take the head of each group. This avoids
        # groupby().apply(), which calls a Python function per group and
        # whose handling of the grouping column is deprecated in pandas 2.2.
        ranked = df.sort_values(
            [column, 'combined_score'], ascending=[True, False])
        return ranked.groupby(column, sort=False).head(n).reset_index(drop=True)

    top_interactions = pd.concat(
        [_top_n_per('protein1'), _top_n_per('protein2')], axis=0)

    # An interaction can be in the top n of both of its proteins.
    return top_interactions.drop_duplicates(subset=['protein1', 'protein2'])

# Select the top interactions per protein from the de-duplicated edge list
# (default n=5), then display the result.
top_protein_links_df = get_top_n_interactions(df_unique_edges)
top_protein_links_df

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000262305,952
1,9606.ENSP00000000233,9606.ENSP00000429900,892
2,9606.ENSP00000000233,9606.ENSP00000306010,888
3,9606.ENSP00000000233,9606.ENSP00000440005,862
4,9606.ENSP00000000233,9606.ENSP00000296557,861
...,...,...,...
97534,9606.ENSP00000275162,9606.ENSP00000501265,520
97535,9606.ENSP00000296490,9606.ENSP00000501265,509
97536,9606.ENSP00000432119,9606.ENSP00000501265,508
97546,9606.ENSP00000290524,9606.ENSP00000501317,780


In [None]:
# Persist the reduced edge list next to the notebook; the index is omitted
# because its labels are not unique.
top_protein_links_df.to_csv('top_protein_links.csv', index=False)

In [None]:
# Lookup: preferred name -> protein ID (a repeated name keeps its last ID).
name_to_id = dict(
    protein_info[['preferred_name', '#string_protein_id']]
    .itertuples(index=False, name=None))

# Lookup: protein ID -> preferred name.
id_to_name = dict(
    protein_info[['#string_protein_id', 'preferred_name']]
    .itertuples(index=False, name=None))