In [None]:
import os

import pandas as pd
from data_generator import Protein
from data_generator import EntrezProtein
from data_generator import RandomPropertyProtein
from data_generator import RandomPropertyProteinIsoform
from data_generator import InteractionGenerator

- [x] 1e1
- [x] 2e1
- [x] 4e1
- [x] 8e1
      
- [x] 1e2
- [x] 2e2
- [ ] 3e2
- [x] 4e2
- [ ] 5e2
- [ ] 6e2
- [ ] 7e2
- [x] 8e2
- [ ] 9e2
- [ ] 10e2

- [x] 1e3
- [x] 2e3
- [x] 4e3
- [x] 8e3

In [None]:
def node_generator(proteins):
    """Lazily convert protein objects into node tuples.

    Parameters
    ----------
    proteins : iterable
        Objects exposing ``get_id()``, ``get_label()`` and
        ``get_properties()``.

    Yields
    ------
    tuple
        ``(id, label, properties)`` for every element, in input order.
    """
    for entry in proteins:
        # Getters are called in the order id -> label -> properties.
        node_id = entry.get_id()
        node_label = entry.get_label()
        node_properties = entry.get_properties()
        yield node_id, node_label, node_properties

In [None]:
def edge_generator(ppi):
    """Lazily convert interaction objects into edge tuples.

    Parameters
    ----------
    ppi : iterable
        Objects exposing ``get_id()``, ``get_source_id()``,
        ``get_target_id()``, ``get_label()`` and ``get_properties()``.

    Yields
    ------
    tuple
        ``(id, source_id, target_id, label, properties)`` for every
        element, in input order.
    """
    for link in ppi:
        # Getters are called in the same order as the tuple fields.
        edge_id = link.get_id()
        source_id = link.get_source_id()
        target_id = link.get_target_id()
        edge_label = link.get_label()
        edge_properties = link.get_properties()
        yield edge_id, source_id, target_id, edge_label, edge_properties

In [None]:
def nodes_to_csv(generator_nodes, n_nodes=None, output_dir="./datasets"):
    """Write node tuples to ``dataset_<n_nodes>_nodes_proteins.csv``.

    Parameters
    ----------
    generator_nodes : iterable of tuple
        ``(id, label, properties)`` tuples; the iterable is consumed fully.
    n_nodes : int, optional
        Number embedded in the file name. Defaults to the number of rows
        actually written, so the function no longer depends on a
        notebook-level ``n_proteins`` variable being defined beforehand.
    output_dir : str, optional
        Directory receiving the CSV file; created if it does not exist.

    Returns
    -------
    None
        The function prints ``DataFrame.info`` and writes the CSV file.
    """
    list_nodes = list(generator_nodes)

    columns_nodes = ["UniProt ID", "label", "properties"]
    df = pd.DataFrame(list_nodes, columns=columns_nodes)
    df.info(memory_usage="deep")

    if n_nodes is None:
        n_nodes = len(list_nodes)
    filename = "dataset_" + str(n_nodes) + "_nodes_proteins.csv"

    # to_csv does not create missing parent directories, so do it here.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, filename), index=False)

In [None]:
def edges_to_csv(generator_edges, n_nodes=None, output_dir="./datasets"):
    """Write edge tuples to ``dataset_<n_nodes>_edges_interactions.csv``.

    Parameters
    ----------
    generator_edges : iterable of tuple
        ``(id, source_id, target_id, label, properties)`` tuples; the
        iterable is consumed fully.
    n_nodes : int, optional
        Number embedded in the file name. When omitted it falls back to
        ``3 * n_proteins`` read from the notebook namespace, which must
        then be defined before the call.
    output_dir : str, optional
        Directory receiving the CSV file; created if it does not exist.

    Returns
    -------
    None
        The function prints ``DataFrame.info`` and writes the CSV file.

    Notes
    -----
    The first column is labelled "UniProt ID" although ``edge_generator``
    fills it with the interaction id; the label is kept unchanged so that
    existing consumers of the CSV keep working.
    """
    list_edges = list(generator_edges)

    columns_edges = ["UniProt ID", "Source ID", "Target ID", "label", "properties"]
    df = pd.DataFrame(list_edges, columns=columns_edges)
    df.info(memory_usage="deep")

    if n_nodes is None:
        # The file name carries the dataset's node count, not the edge count.
        n_nodes = 3 * n_proteins
    filename = "dataset_" + str(n_nodes) + "_edges_interactions.csv"

    # to_csv does not create missing parent directories, so do it here.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, filename), index=False)

# Generate Nodes

In [None]:
# How many proteins to create for each of the three protein classes
n_proteins = 20000

In [None]:
# Build the node list from the data generator. All instances of one protein
# class are created before the next class is started; the three equally long
# lists are then interleaved element by element into one flat list.
_protein_groups = (
    [RandomPropertyProtein() for _ in range(n_proteins)],
    [RandomPropertyProteinIsoform() for _ in range(n_proteins)],
    [EntrezProtein() for _ in range(n_proteins)],
)
proteins = []
for _triple in zip(*_protein_groups):
    proteins.extend(_triple)

In [None]:
# Stream the protein tuples into the nodes CSV file
node_rows = node_generator(proteins)
nodes_to_csv(node_rows)

# Generate Edges

In [None]:
# Generate interactions between the ids of the proteins created above,
# using interaction_probability=0.05
interactor_ids = [protein.get_id() for protein in proteins]
interaction_generator = InteractionGenerator(
    interactors=interactor_ids,
    interaction_probability=0.05,
)
ppi = interaction_generator.generate_interactions()

In [None]:
# Stream the interaction tuples into the edges CSV file
edge_rows = edge_generator(ppi)
edges_to_csv(edge_rows)