# Experiment 3: Pandas Dataframes as data structures for Graphs

[//]: # (------------------------------------------    DO NOT MODIFY THIS    ------------------------------------------)
<style type="text/css">
.tg  {border-collapse:collapse;
      border-spacing:0;
     }
.tg td{border-color:black;
       border-style:solid;
       border-width:1px;
       font-family:Arial, sans-serif;
       font-size:14px;
       overflow:hidden;
       padding:10px 5px;
       word-break:normal;
      }
.tg th{border-color:black;
       border-style:solid;
       border-width:1px;
       font-family:Arial, sans-serif;
       font-size:14px;
       font-weight:normal;
       overflow:hidden;
       padding:10px 5px;
       word-break:normal;
      }
.tg .tg-fymr{border-color:inherit;
             font-weight:bold;
             text-align:left;
             vertical-align:top
            }
.tg .tg-0pky{border-color:inherit;
             text-align:left;
             vertical-align:top
            }
[//]: # (--------------------------------------------------------------------------------------------------------------)

[//]: # (-------------------------------------    FILL THIS OUT WITH YOUR DATA    -------------------------------------)
</style>
<table class="tg">
    <tbody>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Title:</td>
        <td class="tg-0pky">Experiment 3: Pandas Dataframes as data structures for Graphs</td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Authors:</td>
        <td class="tg-0pky">
            <a href="https://github.com/ecarrenolozano" target="_blank" rel="noopener noreferrer">Edwin Carreño</a>
        </td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Affiliations:</td>
        <td class="tg-0pky">
            <a href="https://www.ssc.uni-heidelberg.de/en" target="_blank" rel="noopener noreferrer">Scientific Software Center</a>
        </td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Date Created:</td>
        <td class="tg-0pky">30.10.2024</td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Description:</td>
        <td class="tg-0pky">Creation of a graph using Pandas dataframes and data from CSV files. Conversion to NetworkX is tested too.</td>
      </tr>
    </tbody>
</table>

[//]: # (--------------------------------------------------------------------------------------------------------------)

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/quickstart/beginner.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/quickstart/beginner.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## Overview

In this notebook we are going to:

1. Import CSV data from nodes and edges.
2. Describe the data from CSVs.
3. Create a data pipeline that:
   - loads CSV data as Pandas dataframes.
4. Create a NetworkX graph from a Pandas dataframe that contains information about edges.

## Setup (if required)

If your code requires dependencies to be installed before the main code runs, please add the commands that install them. You can silence the messages returned by `pip` with the flag `-q`, e.g. `pip install pandas -q`

### NetworkX installation

In [1]:
!pip install networkx[default] -q

### Pandas installation

In [2]:
!pip install pandas -q

## Importing Libraries

In [3]:
"""
Recommendations:
    - Respect the order of the imports, they are indicated by the numbers 1, 2, 3.
    - One import per line is recommended, with this we can track easily any modified line when we use git.
    - Absolute imports are recommended (see 3. Local application/library specific imports below), they improve readability and give better error messages.
    - You should put a blank line between each group of imports.
"""

# future-imports (for instance: from __future__ import barry_as_FLUFL)
# from __future__ import barry_as_FLUFL  

# 1. Standard library imports
import ast
import csv
import os

# 2. Related third party imports
import networkx as nx
import pandas as pd

# 3. Local application/library specific imports
# import <mypackage>.<MyClass>         # this is an example
# from <mypackage> import <MyClass>    # this is another example 

## Helper Functions

In [13]:
def print_networkx_edges(networkx_graph, limit=5):
    """Print the first ``limit`` edges of a graph, one edge per line.

    Args:
        networkx_graph: Object exposing ``edges(data=True)``; every item it
            yields is printed as-is.
        limit: Maximum number of edges to print. ``None`` prints every edge.
    """
    for index, edge in enumerate(networkx_graph.edges(data=True)):
        # Honour the caller-supplied limit instead of a hard-coded 5.
        if limit is not None and index >= limit:
            break
        print(edge)

def create_pandasdf_from_csv(filepath, header=True):
    """Load a comma-separated file into a pandas DataFrame.

    Args:
        filepath: Path or file-like object accepted by ``pd.read_csv``.
        header: When true (default) the first row supplies the column names.
            When false the file is read as header-less, so the first row is
            kept as data and the columns are numbered.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # 'infer' is the read_csv default; None tells pandas there is no header row.
    header_row = 'infer' if header else None
    return pd.read_csv(filepath, delimiter=',', header=header_row)

# Column names that may carry a row identifier. The nodes CSV uses
# 'UniProt ID'; the edges CSV uses 'Relationship ID'.
_ID_COLUMNS = ('UniProt ID', 'Relationship ID', 'Relation ID')


def update_dict(row, id_columns=_ID_COLUMNS):
    """Fold a row's identifier and label into its ``properties`` dictionary.

    Args:
        row: Mapping-like row (e.g. a pandas Series) whose ``'properties'``
            entry is a dictionary.
        id_columns: Candidate identifier columns, checked in order. The first
            one present in ``row`` with a non-missing value is stored under
            the ``'id'`` key.

    Returns:
        The same ``properties`` dictionary, updated in place with ``'id'``
        and ``'label'`` when those values are present and not missing.
    """
    properties = row['properties']
    for column in id_columns:
        if column in row and not pd.isna(row[column]):
            properties['id'] = row[column]
            break
    if 'label' in row and not pd.isna(row['label']):
        properties['label'] = row['label']
    return properties


def _parse_properties(value):
    """Return ``value`` as a fresh dict, parsing it first when it is a string."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, dict):
        # Copy so the caller's dictionaries are never modified.
        return dict(value)
    if value is None or (isinstance(value, float) and pd.isna(value)):
        # A missing cell means "no properties".
        return {}
    raise TypeError(
        "properties must be a dict or its string representation, got {!r}".format(value)
    )


def edges_format_converter(dataframe):
    """Convert an edges dataframe to ``(source, target, properties)`` rows.

    Original format:
        (id, source node, target node, label, properties)

    Desired format:
        (source node, target node, properties)

    The identifier and label of every row are moved into its ``properties``
    dictionary, ``'Source ID'``/``'Target ID'`` are renamed to
    ``'source'``/``'target'``, and the identifier and label columns are dropped.

    Args:
        dataframe: DataFrame with a ``'properties'`` column holding either
            dictionaries or their string representation.

    Returns:
        A new DataFrame; ``dataframe`` itself is left untouched, so the
        function can be called again on the same input.
    """
    converted = dataframe.copy()
    converted['properties'] = [_parse_properties(value) for value in converted['properties']]
    # Build the column from a plain list so an empty frame is handled as well.
    converted['properties'] = [update_dict(row) for _, row in converted.iterrows()]

    converted = converted.rename(columns={'Source ID': 'source',
                                          'Target ID': 'target'})

    # Only the identifier columns that actually exist are removed.
    converted = converted.drop(columns=[*_ID_COLUMNS, 'label'], errors='ignore')

    return converted

def create_dataframes(file_path_nodes, file_path_edges):
    """Read the nodes CSV and the edges CSV into two DataFrames.

    Args:
        file_path_nodes: Location of the comma-separated nodes file.
        file_path_edges: Location of the comma-separated edges file.

    Returns:
        A ``(nodes, edges)`` tuple of DataFrames, in that order.
    """
    nodes_frame, edges_frame = (
        from_csv_to_pandasdf(csv_path, delimiter=',')
        for csv_path in (file_path_nodes, file_path_edges)
    )
    return nodes_frame, edges_frame

def from_csv_to_pandasdf(file_path, delimiter=','):
    """Read a delimited text file into a pandas DataFrame.

    Args:
        file_path: Path or file-like object handed to ``pd.read_csv``.
        delimiter: Field separator forwarded to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    read_options = {'delimiter': delimiter}
    return pd.read_csv(file_path, **read_options)

def load_csv_generator(file_path, header=True):
    """Lazily yield each row of a CSV file as a tuple of strings.

    Args:
        file_path: Path of the CSV file to read.
        header: When true (default) the first row is skipped.

    Yields:
        One tuple of strings per remaining row.
    """
    # newline="" lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are preserved.
    with open(file_path, "r", newline="") as file:
        reader = csv.reader(file)
        if header:
            # The default keeps an empty file from raising StopIteration
            # inside the generator (which surfaces as RuntimeError).
            next(reader, None)
        for row in reader:
            yield tuple(row)

## Introduction

## Section 1: Load the CSV data that contains nodes and edges

For this exercise, each graph is represented by two CSV files. One containing information about the **nodes** and the other about the **edges**. To indicate that both files correspond to the same graph, their names include the same number of nodes. For example:

- `dataset_30_nodes_proteins.csv`: contains 30 rows (nodes).
- `dataset_30_edges_interactions.csv`: contains 47 rows (edges).

We reference each CSV file or dataset as follows:

In [14]:
# Relative path of the directory that holds the datasets.
FILE_PATH_DATASETS = "../../../DATASETS"

# The two CSV files (nodes and edges) that together describe one graph.
filename_nodes, filename_edges = (
    "dataset_30_nodes_proteins.csv",
    "dataset_30_edges_interactions.csv",
)

### 1.1: Load Nodes

The CSV file for nodes contains three columns:
- `UniProt ID`
- `label`
- `properties`

We are going to load the information of nodes as a **list of tuples**. Each tuple represents a node with the structure:
- `(id, label, properties)`
- Each field in the tuple is a `string`
- The `properties` field is a string containing a dictionary of properties.

In [15]:
# Build the path of the nodes CSV, then read every row after the header into a list.
file_path_nodes = os.path.join(FILE_PATH_DATASETS, filename_nodes)

list_nodes = list(load_csv_generator(file_path_nodes, header=True))

In [16]:
# Report how many node tuples were loaded and show the first one as a sample.
print("The NODES dataframe contains: {} nodes".format(len(list_nodes)))
print("Example:\n\t{}".format(list_nodes[0]))

The NODES dataframe contains: 30 nodes
Example:
	('G0P1I0', 'uniprot_protein', "{'sequence': 'QKRTLFKVEFGSMSWFYKHRTNMHLRTMMYD', 'description': 'Lorem ipsum ilymp', 'taxon': '7813'}")


### 1.2: Load Edges

The CSV file for edges contains five columns:
- `Relationship ID`
- `Source ID`
- `Target ID`
- `label`
- `properties`

We are going to load the information of edges as a **list of tuples**. Each tuple represents an edge with the structure:
- `(id, source, target, label, properties)`
- Each field in the tuple is a `string`
- The `properties` field is a string containing a dictionary of properties.

In [17]:
# Build the path of the edges CSV, then read every row after the header into a list.
file_path_edges = os.path.join(FILE_PATH_DATASETS, filename_edges)

list_edges = list(load_csv_generator(file_path_edges, header=True))

In [18]:
# Report how many edge tuples were loaded and show the first six as a sample.
print("The list of EDGES contains: {} edges".format(len(list_edges)))
print("Example:")
for edge in list_edges[:6]:
    print("\t{}".format(edge))

The list of EDGES contains: 47 edges
Example:
	('', 'B6V6V7', 'H9R6K5', 'interacts_with', "{'source': 'signor'}")
	('', 'B6V6V7', 'I5X3I2', 'interacts_with', "{'method': 'Lorem ipsum pblfc'}")
	('', '385603', 'H9R6K5', 'interacts_with', "{'source': 'intact', 'method': 'Lorem ipsum nbqvj'}")
	('', '385603', 'I9A3P8', 'interacts_with', '{}')
	('', 'L3V2Y6', 'X0C9T5', 'interacts_with', "{'source': 'intact', 'method': 'Lorem ipsum zwhrq'}")
	('intact956331', 'L3V2Y6', 'Y8Y3K0', 'interacts_with', "{'source': 'signor'}")


## Section 2: Create Data Pipeline
- **input:** CSV data of edges
- **output:** Pandas dataframe containing information of edges

The pipeline consists of two consecutive stages:

| Stage | Function                              | Description |
|-------|---------------------------------------| ----------- |
| 1     | `create_pandasdf_from_csv()`          | create a pandas dataframe in memory        |
| 2     | `edges_format_converter(dataframe)`   | transform dataframe rows to desired format |

### 2.1 Create Pandas Dataframe from CSV file (data structure)

In [19]:
# Load both CSV files into pandas DataFrames in a single call.
df_nodes, df_edges = create_dataframes(
    file_path_nodes=file_path_nodes,
    file_path_edges=file_path_edges,
)

In [22]:
# Preview the first five rows of the nodes dataframe.
df_nodes.head(n=5)

Unnamed: 0,UniProt ID,label,properties
0,G0P1I0,uniprot_protein,{'sequence': 'QKRTLFKVEFGSMSWFYKHRTNMHLRTMMYD'...
1,B6V6V7,uniprot_isoform,{'sequence': 'CGLGLSMRFLMVGVQNWFWYGTRVPDVAYIDW...
2,385603,entrez_protein,{'sequence': 'EHYTERRSMRSTGQDKTSNPYLFFHITRFKRC...
3,L3V2Y6,uniprot_protein,"{'sequence': 'EGYPSHLLDVESFMSHNWTPWIFKCDARLC',..."
4,Q1R3N5,uniprot_isoform,"{'sequence': 'IVHIKHTTQLGAIIQRNFQVGWCHVG', 'de..."


In [23]:
# Preview the first five rows of the edges dataframe.
df_edges.head(n=5)

Unnamed: 0,Relationship ID,Source ID,Target ID,label,properties
0,,B6V6V7,H9R6K5,interacts_with,{'source': 'signor'}
1,,B6V6V7,I5X3I2,interacts_with,{'method': 'Lorem ipsum pblfc'}
2,,385603,H9R6K5,interacts_with,"{'source': 'intact', 'method': 'Lorem ipsum nb..."
3,,385603,I9A3P8,interacts_with,{}
4,,L3V2Y6,X0C9T5,interacts_with,"{'source': 'intact', 'method': 'Lorem ipsum zw..."


Unnamed: 0,UniProt ID,properties
0,G0P1I0,{'sequence': 'QKRTLFKVEFGSMSWFYKHRTNMHLRTMMYD'...
1,B6V6V7,{'sequence': 'CGLGLSMRFLMVGVQNWFWYGTRVPDVAYIDW...
2,385603,{'sequence': 'EHYTERRSMRSTGQDKTSNPYLFFHITRFKRC...
3,L3V2Y6,"{'sequence': 'EGYPSHLLDVESFMSHNWTPWIFKCDARLC',..."
4,Q1R3N5,"{'sequence': 'IVHIKHTTQLGAIIQRNFQVGWCHVG', 'de..."
5,249853,{'sequence': 'RYWCKKEDCLAIKLEWVNHTHCLVQKDGMCMN...
6,X0C9T5,{'sequence': 'VPWFKQEYWACIYPEKWDETCTQTCHWSYQWK...
7,Y8Y3K0,"{'sequence': 'LKPNAIFRKINTLIMMWTAFRGQGKEKR', '..."
8,535278,{'sequence': 'LEGPWGTSCMNSCNISKNYMEYSFKCQYMHRH...
9,D8O4Y7,"{'sequence': 'IQMLFDTFTNNMRNWWCEHW', 'descript..."


In [None]:
# Run the two pipeline stages: load the edges CSV into a dataframe, then
# convert its rows to the (source, target, properties) format.
edges_pandas_dataframe = edges_format_converter(
    create_pandasdf_from_csv(file_path_edges)
)

In [None]:
# Preview the first five rows of the converted edges dataframe.
edges_pandas_dataframe.head(n=5)

## Section 3: Converting to Networkx graph

In [None]:
# Empty the graph when this section is re-run in the same kernel.
try:
    G.clear()
    print("Graph has been cleared!")
except NameError:
    # G has not been created yet; any other error is allowed to surface.
    print("Graph G doesn't exist")

### 3.1 Create a Directed Graph

In [None]:
# Build a directed graph from the edges dataframe: the "source" and "target"
# columns name the endpoints and the "properties" column is attached to each
# edge as an attribute called "properties".
G = nx.from_pandas_edgelist(edges_pandas_dataframe, 
                            source="source",
                            target="target",
                            edge_attr="properties",
                            create_using=nx.DiGraph())

Note that the graph contains a dictionary with a single key, `"properties"`. Inside this dictionary is another dictionary that holds each property. However, this is not the desired format. Instead, we want a single dictionary containing all the properties directly.

In [None]:
# Show the first five edges; the properties are still nested under "properties".
print_networkx_edges(G, limit=5)

### 3.2 Format the dictionary of properties for each edge

In [None]:
# Flatten each edge's nested "properties" dictionary into top-level edge
# attributes. Iterating over the graph's own edges (instead of the dataframe
# rows) visits every edge exactly once, so repeated (source, target) rows and
# re-running this cell do not raise KeyError.
for _source, _target, attributes in G.edges(data=True):
    nested_properties = attributes.pop("properties", None)
    if nested_properties:
        attributes.update(nested_properties)

In [None]:
# Show the first five edges again to check the flattened attributes.
print_networkx_edges(G, limit=5)

### 3.3 Draw Graph

In [None]:
# Draw the graph with each node labelled.
nx.draw(G, with_labels=True)

### 3.4 Some statistics

In [None]:
# Report the size of the graph.
print("Number of nodes: {}".format(G.number_of_nodes()))
print("Number of edges: {}".format(G.number_of_edges()))

In [None]:
# Show the first five edges of the final graph.
print_networkx_edges(G, limit=5)