# Experiment 1: Testing profilers

[//]: # (------------------------------------------    DO NOT MODIFY THIS    ------------------------------------------)
<style type="text/css">
.tg  {border-collapse:collapse;
      border-spacing:0;
     }
.tg td{border-color:black;
       border-style:solid;
       border-width:1px;
       font-family:Arial, sans-serif;
       font-size:14px;
       overflow:hidden;
       padding:10px 5px;
       word-break:normal;
      }
.tg th{border-color:black;
       border-style:solid;
       border-width:1px;
       font-family:Arial, sans-serif;
       font-size:14px;
       font-weight:normal;
       overflow:hidden;
       padding:10px 5px;
       word-break:normal;
      }
.tg .tg-fymr{border-color:inherit;
             font-weight:bold;
             text-align:left;
             vertical-align:top
            }
.tg .tg-0pky{border-color:inherit;
             text-align:left;
             vertical-align:top
            }
[//]: # (--------------------------------------------------------------------------------------------------------------)

[//]: # (-------------------------------------    FILL THIS OUT WITH YOUR DATA    -------------------------------------)
</style>
<table class="tg">
    <tbody>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Title:</td>
        <td class="tg-0pky"> Experiment 1: Testing profilers</td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Authors:</td>
        <td class="tg-0pky">
            <a href="https://github.com/ecarrenolozano" target="_blank" rel="noopener noreferrer">Edwin Carreño</a>
        </td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Affiliations:</td>
        <td class="tg-0pky">
            <a href="https://www.ssc.uni-heidelberg.de/en" target="_blank" rel="noopener noreferrer">Scientific Software Center</a>
        </td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Date Created:</td>
        <td class="tg-0pky">30.10.2024</td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Description:</td>
        <td class="tg-0pky">Testing profilers for memory and time in: Lists, Pandas DF, Dictionaries, Numpy Arrays.</td>
      </tr>
    </tbody>
</table>

[//]: # (--------------------------------------------------------------------------------------------------------------)

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/quickstart/beginner.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/quickstart/beginner.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>  
</table>

## Overview

In this notebook we are going to:

1. Import CSV (comma-separated values) data from nodes and edges.
2. Describe the data from CSVs.
3. Store the CSV data in data structures:
   - List of tuples
   - Pandas DF
   - Dictionaries of dictionaries
   - Numpy Arrays
4. Profile the memory and time to create the data structure.
5. Create a NetworkX graph from the aforementioned datasets.

## Setup (if required)

**Note:** If your code requires dependencies to be installed before the main code runs, add the installation commands here. You can silence the messages printed by `pip` with the `-q` flag, e.g. `pip install pandas -q`.

### NetworkX installation

In [1]:
!pip install networkx[default] -q

## Importing Libraries

In [20]:
"""
Recommendations:
    - Respect the order of the imports, they are indicated by the numbers 1, 2, 3.
    - One import per line is recommended, with this we can track easily any modified line when we use git.
    - Absolute imports are recommended (see 3. Local application/library specific imports below), they improve readability and give better error messages.
    - You should put a blank line between each group of imports.
"""

# future-imports (for instance: from __future__ import barry_as_FLUFL)
# from __future__ import barry_as_FLUFL  

# 1. Standard library imports
import ast
import cProfile
import csv
import os
import pstats
import sys

# 2. Related third party imports
import networkx as nx
import pandas as pd
from pympler import asizeof

# 3. Local application/library specific imports
# import <mypackage>.<MyClass>         # this is an example
# from <mypackage> import <MyClass>    # this is another example 

## Helper Functions

In [3]:
def create_edgelist_ds(iterable):
    """Materialize an iterable of edges into a list.

    Args:
        iterable: Any iterable (for example a generator) that yields edges.

    Returns:
        list: The items of ``iterable`` in iteration order.
    """
    return list(iterable)

def edges_generator_streaming(edgelist):
    """Lazily convert raw edge rows into ``(source, target, properties)`` tuples.

    Each incoming row is read by position as
    ``(id, source node, target node, label, properties)``, where
    ``properties`` is the string form of a Python dict literal.

    Args:
        edgelist: Iterable of indexable rows with at least five fields.

    Yields:
        tuple: ``(source node, target node, properties)``, where
        ``properties`` is the parsed dict extended with the ``"id"`` and
        ``"label"`` entries taken from the row.
    """
    for row in edgelist:
        # literal_eval only accepts Python literals, so no code is executed.
        attributes = ast.literal_eval(row[4])
        attributes["id"] = row[0]
        attributes["label"] = row[3]

        yield (row[1], row[2], attributes)

def load_csv_module(filepath, header=True):
    """Read a CSV file into a list of tuples using the ``csv`` module.

    Args:
        filepath: Path of the CSV file to read.
        header: When truthy, the first row is treated as a header and skipped.

    Returns:
        list: One tuple of strings per data row; every field has its
        surrounding whitespace stripped.
    """
    # newline="" leaves newline handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are altered by universal-newline translation.
    with open(filepath, "r", newline="") as file:
        reader = csv.reader(file)
        if header:
            # Drop the header row; the default keeps an empty file from raising.
            next(reader, None)
        return [tuple(field.strip() for field in row) for row in reader]

## Introduction

## Section 1: Load the CSV data that contains nodes and edges

For this exercise, each graph is represented by two CSV files: one containing information about the **nodes** and the other about the **edges**. To indicate that both files correspond to the same graph, their names include the same number of nodes. For example:

- `dataset_30_nodes_proteins.csv`: contains 30 rows (nodes).
- `dataset_30_edges_interactions.csv`: contains 47 rows (edges).

We reference each CSV file or dataset as follows:

In [4]:
# File names of the two CSVs (nodes and edges) that describe the same graph.
filename_nodes = "dataset_dummy2_nodes.csv"
filename_edges = "dataset_dummy2_edges.csv"

### 1.1: Load Nodes

The CSV file for nodes contains three columns:
- `UniProt ID`
- `label`
- `properties`

We are going to load the information of nodes as a **list of tuples**. Each tuple represents a node with the structure:
- `(id, label, properties)`
- Each field in the tuple is a `string`
- The `properties` field is a string containing a dictionary of properties.

In [5]:
# Path of the nodes CSV inside the local ./datasets folder.
filepath_nodes = os.path.join("./datasets", filename_nodes)

# Load every data row as a tuple of strings; the header row is skipped.
list_nodes = load_csv_module(filepath_nodes, header=True)

In [6]:
# Summary of the loaded nodes plus the first tuple as a sample.
print(f"The list of NODES contains: {len(list_nodes)} nodes")
print(f"Example:\n\t{list_nodes[0]}")

The list of NODES contains: 6 nodes
Example:
	('A', 'letters', "{'alphabet': 'latin'", "'equivalent':'a'}")


### 1.2: Load Edges

The CSV file for edges contains five columns:
- `Relation ID`
- `Source ID`
- `Target ID`
- `label`
- `properties`

We are going to load the information of edges as a **list of tuples**. Each tuple represents an edge with the structure:
- `(id, source, target, label, properties)`
- Each field in the tuple is a `string`
- The `properties` field is a string containing a dictionary of properties.

In [7]:
# Path of the edges CSV inside the local ./datasets folder.
filepath_edges = os.path.join("./datasets", filename_edges)

# Load every data row as a tuple of strings; the header row is skipped.
list_edges = load_csv_module(filepath_edges, header=True)

In [8]:
# Summary of the loaded edges plus up to six sample tuples.
print(f"The list of EDGES contains: {len(list_edges)} edges")
print("Example:")
for edge in list_edges[:6]:
    print(f"\t{edge}")

The list of EDGES contains: 6 edges
Example:
	('E1', 'A', '1', 'is_equivalent', "{'info': 'A is equal to 1', 'mass=': 1}")
	('E2', 'B', '2', 'is_equivalent', "{'info': 'B is equal to 2', 'mass=': 2}")
	('E3', 'C', '3', 'is_equivalent', "{'info': 'C is equal to 3', 'mass=': 3}")
	('R1', 'B', 'A', 'follows', "{'info': 'B follows A', 'mass=': 1.0}")
	('R2', 'C', 'B', 'follows', "{'info': 'C follows B', 'mass=': 2.0}")
	('R3', 'C', 'A', 'follows', "{'info': 'C follows A', 'mass=': 3.0}")


## Section 2: Profile each data structure
- List of tuples
- Pandas DF
- Dictionary of dictionaries
- Numpy arrays

| Stage | Function                      | Description |
|-------|-------------------------------| ----------- |
| 1     | `load_csv_module()`           | read the csv and load the information in memory    |
| 2     | `edges_generator_streaming()` | transform the tuples to desired format             |
| 3     | `create_edgelist_ds()`        | create an edgelist containing all the tuples       |

### 2.1 Create List of tuples (data structure)

In [9]:
# Three-stage pipeline, evaluated from the innermost call outwards.
edgelist = create_edgelist_ds(                            # stage 3: collect into a list
    edges_generator_streaming(                            # stage 2: reshape each tuple
        load_csv_module(filepath_edges, header=True)      # stage 1: read the CSV
    )
)

In [15]:
# Preview the first five transformed edges.
for edge in edgelist[:5]:
    print(edge)

('A', '1', {'info': 'A is equal to 1', 'mass=': 1, 'id': 'E1', 'label': 'is_equivalent'})
('B', '2', {'info': 'B is equal to 2', 'mass=': 2, 'id': 'E2', 'label': 'is_equivalent'})
('C', '3', {'info': 'C is equal to 3', 'mass=': 3, 'id': 'E3', 'label': 'is_equivalent'})
('B', 'A', {'info': 'B follows A', 'mass=': 1.0, 'id': 'R1', 'label': 'follows'})
('C', 'B', {'info': 'C follows B', 'mass=': 2.0, 'id': 'R2', 'label': 'follows'})


In [17]:
# Size report for the whole edge list; detail=1 also lists each contained tuple.
print(asizeof.asized(edgelist, detail=1).format())

[('A', '1', {'info': 'A is equal to 1'....3.0, 'id': 'R3', 'label': 'follows'})] size=4264 flat=120
    ('A', '1', {'info': 'A is equal to 1',....'id': 'E1', 'label': 'is_equivalent'}) size=848 flat=64
    ('B', '2', {'info': 'B is equal to 2',....'id': 'E2', 'label': 'is_equivalent'}) size=736 flat=64
    ('C', '3', {'info': 'C is equal to 3',....'id': 'E3', 'label': 'is_equivalent'}) size=736 flat=64
    ('B', 'A', {'info': 'B follows A', 'ma.... 1.0, 'id': 'R1', 'label': 'follows'}) size=608 flat=64
    ('C', 'B', {'info': 'C follows B', 'ma.... 2.0, 'id': 'R2', 'label': 'follows'}) size=608 flat=64
    ('C', 'A', {'info': 'C follows A', 'ma.... 3.0, 'id': 'R3', 'label': 'follows'}) size=608 flat=64


In [19]:
# Hand-written copy of the first edge, so a single tuple can be inspected in isolation.
edge_tuple = ('A', '1', {'info': 'A is equal to 1', 'mass=': 1, 'id': 'E1', 'label': 'is_equivalent'})
# detail=3 expands the report through the nested levels (tuple -> dict -> keys/values).
print(asizeof.asized(edge_tuple, detail=3).format())

('A', '1', {'info': 'A is equal to 1',....'id': 'E1', 'label': 'is_equivalent'}) size=848 flat=64
    {'info': 'A is equal to 1', 'mass=': 1, 'id': 'E1', 'label': 'is_equivalent'} size=672 flat=232
        [V] info: 'A is equal to 1' size=64 flat=64
        [V] label: 'is_equivalent' size=64 flat=64
        [K] info size=56 flat=56
        [K] mass= size=56 flat=56
        [K] id size=56 flat=56
        [V] id: 'E1' size=56 flat=56
        [K] label size=56 flat=56
        [V] mass=: 1 size=32 flat=32
    'A' size=56 flat=56
    '1' size=56 flat=56


In [27]:
# Load the same edges CSV into a pandas DataFrame, to compare with the list of tuples.
df_edges = pd.read_csv(filepath_edges)
df_edges.head()

Unnamed: 0,ID,source,target,label,properties
0,E1,A,1,is_equivalent,"{'info': 'A is equal to 1', 'mass=': 1}"
1,E2,B,2,is_equivalent,"{'info': 'B is equal to 2', 'mass=': 2}"
2,E3,C,3,is_equivalent,"{'info': 'C is equal to 3', 'mass=': 3}"
3,R1,B,A,follows,"{'info': 'B follows A', 'mass=': 1.0}"
4,R2,C,B,follows,"{'info': 'C follows B', 'mass=': 2.0}"


In [24]:
# Memory footprint per column in bytes; deep=True also measures the contents of object columns.
df_edges.memory_usage(deep=True)

Index         128
ID            354
source        348
target        348
label         402
properties    570
dtype: int64

In [25]:
# Column dtypes as inferred by pd.read_csv.
df_edges.dtypes

ID            object
source        object
target        object
label         object
properties    object
dtype: object

In [31]:
# Memory footprint per column after converting every column to the categorical dtype.
df_edges.astype('category').memory_usage(deep=True)

Index         128
ID            532
source        288
target        468
label         248
properties    748
dtype: int64

In [None]:
# Total footprint in bytes of the categorical representation, computed rather
# than added up by hand from the per-column figures.
# NOTE(review): the original cell was an unfinished sum ("128+"); confirm this
# is the total that was intended.
df_edges.astype('category').memory_usage(deep=True).sum()

## Section 3: Creating a NetworkX graph

In [None]:
# Empty the graph left over from a previous run, if there is one.
try:
    G.clear()
    print("Graph has been cleared!")
except NameError:
    # G is not defined yet (for example on a fresh kernel). Any other error is
    # a real problem and is allowed to surface instead of being swallowed.
    print("Graph G doesn't exist")

### 3.1 Create a Directed Graph

In [None]:
# Build a directed graph from the (source, target, properties) tuples in edgelist.
G = nx.from_edgelist(edgelist, create_using=nx.DiGraph)

### 3.2 Draw Graph

In [None]:
# Plot the graph with node labels shown.
nx.draw(G, with_labels=True)

### 3.3 Some statistics

In [None]:
# Basic size statistics of the graph.
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

In [None]:
# Print at most `limit` edges together with their attribute dictionaries.
limit = 5
for index, edge in enumerate(G.edges(data=True)):
    # Stop once `limit` edges have been printed.
    if index == limit:
        break
    print(edge)

In [32]:
# Load the memray IPython extension; the %%memray_flamegraph cell below relies on it.
%load_ext memray

In [33]:
%%memray_flamegraph --trace-python-allocators --leaks
# Toy workload for the flame graph: two helpers that each build a string by
# repetition, followed by the concatenation of their results.
def a():
    return "a" * 10_000

def bc():
    return "bc" * 10_000

# The concatenation produces a third string from the two results.
x = a() + bc()

