In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
from networkx import gml

In [None]:
# Input files consumed by the helper functions defined in this notebook.
# Tab-separated table; read as (Node, Protein) columns by get_node_by_protein.
cluster_fp = "cluster.tsv"
# Graph in GML format; loaded with networkx by get_node_neighborhood.
graph_fp = "graph.gml"

In [None]:
#Get the node that includes the desired protein
def get_node_by_protein(protein, cluster_tsv):
    """Return the node whose row in the cluster table lists ``protein``.

    Parameters
    ----------
    protein
        Protein identifier to look up in the second column of the table.
    cluster_tsv
        Path to a headerless, tab-separated file. The first column is read
        as "Node" and the second as "Protein".

    Returns
    -------
    The "Node" value of the first row whose "Protein" equals ``protein``.

    Raises
    ------
    ValueError
        If no row of the table lists ``protein``.
    """
    # pandas names columns through ``names=``; the earlier ``colnames=`` keyword
    # is not accepted by read_csv and made every call fail with TypeError.
    cluster_df = pd.read_csv(cluster_tsv, sep="\t", header=None, names=["Node", "Protein"])
    matches = cluster_df.loc[cluster_df["Protein"] == protein, "Node"]
    if matches.empty:
        # Fail with a readable message instead of an IndexError from indexing
        # into an empty result.
        raise ValueError(f"protein {protein!r} not found in {cluster_tsv!r}")
    return matches.iloc[0]

def get_node_neighborhood(graph_gml, node):
    """Load the GML graph at ``graph_gml`` and list the neighbors of ``node``.

    The graph file is parsed on every call. The result is a plain list of the
    node keys yielded by ``neighbors(node)``; ``node`` itself is not added.
    """
    graph = nx.read_gml(graph_gml)
    return [neighbor for neighbor in graph.neighbors(node)]

def output_cosmograph_tsv(cluster_tsv, graph_gml, protein, output_fp="cosmograph.tsv"):
    """Write the graph neighborhood of ``protein``'s node to a two-column TSV.

    Parameters
    ----------
    cluster_tsv
        Cluster table passed to ``get_node_by_protein`` to find the node
        that contains ``protein``.
    graph_gml
        GML graph passed to ``get_node_neighborhood``.
    protein
        Protein identifier whose node neighborhood is exported.
    output_fp
        Destination file. Defaults to "cosmograph.tsv", the path that was
        previously hard-coded, so existing three-argument calls are unchanged.

    The file starts with a "Node<TAB>Protein" header, followed by one row per
    neighbor and a final row for the query node itself.
    """
    node = get_node_by_protein(protein, cluster_tsv)
    # Neighbors first, then the query node, preserving the original row order.
    nodes = get_node_neighborhood(graph_gml, node) + [node]
    with open(output_fp, "w") as f:
        f.write("Node\tProtein\n")
        # NOTE(review): every row carries the *query* protein, not the proteins
        # that belong to each neighboring node - confirm this is intended.
        f.writelines(f"{n}\t{protein}\n" for n in nodes)

In [None]:
#Read in graph!
# Use the documented top-level nx.read_gml entry point, the same call that
# get_node_neighborhood uses, rather than going through the gml submodule.
G = nx.read_gml('data/GRAPH_ALL_Guaymas2020_hottest_clu30_May62024.gml')

In [None]:
# Extract the subgraph of everything within 4 hops of the scaffold of interest.
ego_center = 'D4994_C39_H1-scaffold_122948_1'
ego_radius = 4
Guapo_G = nx.ego_graph(G, n=ego_center, radius=ego_radius)

In [None]:
#Write out smol graph to tsvs for cosmograph!
output_nodes_tsv = 'data/GuaPO_clu30_nodes.tsv'
output_edges_tsv = 'data/GuaPO_clu30_edges.tsv'

# Node table: one row per node of the ego graph, with its id and a label.
with open(output_nodes_tsv, 'w') as node_file:
    node_file.write("NodeId\tlabel\n")  # Modify based on your node attributes
    for node, data in Guapo_G.nodes(data=True):
        # read_gml (with its default label='label') uses the GML label as the
        # node key, so the attribute dict may carry no separate 'label' entry.
        # Fall back to the node key so the label column is not left empty.
        node_file.write(f"{node}\t{data.get('label', node)}\n")
print(f'saved nodes to {output_nodes_tsv}')

# Edge table: one row per edge; Weight is left blank when the edge has no
# 'weight' attribute.
with open(output_edges_tsv, 'w') as edge_file:
    edge_file.write("Source\tTarget\tWeight\n")  # Modify if you have different or additional attributes
    for source, target, data in Guapo_G.edges(data=True):
        edge_file.write(f"{source}\t{target}\t{data.get('weight', '')}\n")
print(f'saved edges to {output_edges_tsv}')

Code snippet that writes a shuffled copy of the FASTA file, used to build the test set:

In [1]:
from Bio import SeqIO
import random

# Input FASTA and the destination for the shuffled copy
file_path = '../data/All_Asgards_042423.faa'
shuffled_file_path = 'shuffled.fasta'

# Fixed seed so that re-running this cell reproduces the same record order
# (and therefore the same test set). Change it to draw a different shuffle.
shuffle_seed = 42

# Read all sequences into a list
records = list(SeqIO.parse(file_path, 'fasta'))

# Shuffle in place with a dedicated generator; this keeps the result
# reproducible and leaves the global `random` state untouched.
random.Random(shuffle_seed).shuffle(records)

# Write the shuffled records to a new file
with open(shuffled_file_path, 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')
