# Graph algorithms: 2014

In [19]:
import numpy as np
import pandas as pd
import neo4j
import os
import psycopg2

## Neo4j setup

In [20]:
# Create the Neo4j driver for the server at neo4j:7687.
# NOTE(review): the credentials are hard-coded here; consider loading them from the environment.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [21]:
# Open a single session on the "neo4j" database; the helper functions in this
# notebook run all of their queries through this module-level session.
session = driver.session(database="neo4j")

In [22]:
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Runs through the module-level ``session``. ``DETACH DELETE`` removes each
    node together with every relationship attached to it, so one statement
    replaces the former two passes (relationships first, then leftover nodes).
    """

    query = "MATCH (node) DETACH DELETE node"

    # Consume the result so the deletion has fully run before we return.
    session.run(query).consume()

In [23]:
def my_neo4j_create_node(country):
    """Add one node labelled Country whose ``name`` property is ``country``.

    The value is passed as a query parameter and the statement is run through
    the module-level ``session``.
    """
    cypher = """
    
    CREATE (:Country {name: $country})
    
    """
    parameters = {"country": country}
    session.run(cypher, parameters)
    

In [24]:
def my_neo4j_create_relationship_one_way(from_country, to_country, weight):
    """Link two existing Country nodes with a directed, weighted MIGRATE_TO relationship.

    The nodes are looked up by their ``name`` property; the relationship points
    from ``from_country`` to ``to_country`` and stores ``weight`` as a property.
    All three values are sent as query parameters via the module-level ``session``.
    """
    cypher = """
    
    MATCH (from:Country), 
          (to:Country)
    WHERE from.name = $from_country and to.name = $to_country
    CREATE (from)-[:MIGRATE_TO {weight: $weight}]->(to)

    
    """
    parameters = {
        "from_country": from_country,
        "to_country": to_country,
        "weight": weight,
    }
    session.run(cypher, parameters)
    

In [25]:
# Start from an empty graph; nodes are added with CREATE, so re-running the
# notebook without wiping first would add them again.
my_neo4j_wipe_out_database()

## Postgres setup

In [26]:
# Connect to the Postgres server used to stage the CSV files.
# NOTE(review): host, user and password are hard-coded here.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [27]:
# Shared cursor used by the CSV-loading functions in this notebook.
cursor = connection.cursor()

## Load CSV file containing country nodes data into postgres

In [28]:
def load_country_nodes_csv_into_postgres(output_file_name: str) -> list[tuple[str]]:
    """
    Stage a country-nodes CSV file in Postgres and return its country codes.

    Inside one transaction this drops and recreates the "countries" table,
    bulk-loads the CSV into it with COPY, and selects the `country` column
    ordered by `country`. The transaction is always rolled back afterwards, so
    the table only serves as a staging area and is not persisted.

    Uses the module-level `connection` and `cursor`.

    Args:
        output_file_name (str): The name of the CSV file (with a header row) containing
            the country nodes data. It is resolved under
            /user/projects/project-3-dliang5299/ and read by COPY.
    Returns:
        list[tuple[str]]: A list of 1-item tuples where each tuple represents a row containing
            the country code from the countries table.
    """
    query = """

    drop table if exists countries;

    create table countries (
      country_full varchar(100),
      country varchar(100)
    );

    copy countries (country_full, country)
    from %(file_path)s delimiter ',' NULL '' csv header;

    select country
    from countries
    order by country;

    """

    file_path = f"/user/projects/project-3-dliang5299/{output_file_name}"

    # Clear any open or aborted transaction left over from earlier work.
    connection.rollback()
    try:
        cursor.execute(query, {'file_path': file_path})
        # Read the rows of the final SELECT before the transaction is ended.
        return cursor.fetchall()
    finally:
        # Always end the transaction, even when the load fails, so the shared
        # connection is not left in an aborted state.
        connection.rollback()

## Creating + visualizing country nodes from postgres into Neo4j for 2014

In [29]:
# Reset the graph so this cell can be re-run without duplicating nodes.
my_neo4j_wipe_out_database()
# Load country data from 2014 into postgres and create neo4j nodes
rows = load_country_nodes_csv_into_postgres('nodes_2014.csv')
for row in rows:
    # Each row is a 1-item tuple holding the value of the `country` column.
    country = row[0]
    my_neo4j_create_node(country)

## Load CSV file containing migration data into postgres

In [30]:
def load_migration_csv_into_postgres(output_file_name: str) -> list[tuple]:
    """
    Stage a migration CSV file in Postgres and return its rows.

    Inside one transaction this drops and recreates the "migration" table,
    bulk-loads the CSV into it with COPY, and selects every row ordered by
    `to_country`, then `from_country`. The transaction is always rolled back
    afterwards, so the table only serves as a staging area and is not persisted.

    Uses the module-level `connection` and `cursor`.

    Args:
        output_file_name (str): The name of the CSV file (with a header row) containing
            the migration data. It is resolved under
            /user/projects/project-3-dliang5299/ and read by COPY.
    Returns:
        list[tuple]: A list of 3-item tuples `(to_country, from_country, refugees)`,
            one per row in the migration table.
    """
    query = """

    drop table if exists migration;

    create table migration (
      to_country varchar(100),
      from_country varchar(100),
      refugees numeric(8)
    );

    copy migration (to_country, from_country, refugees)
    from %(file_path)s delimiter ',' NULL '' csv header;

    select to_country, from_country, refugees
    from migration
    order by to_country, from_country;

    """

    file_path = f"/user/projects/project-3-dliang5299/{output_file_name}"

    # Clear any open or aborted transaction left over from earlier work.
    connection.rollback()
    try:
        cursor.execute(query, {'file_path': file_path})
        # Read the rows of the final SELECT before the transaction is ended.
        return cursor.fetchall()
    finally:
        # Always end the transaction, even when the load fails, so the shared
        # connection is not left in an aborted state.
        connection.rollback()

## Creating migration movement graphs from postgres into Neo4j

In [31]:
# Load migration data from 2014 into postgres and create neo4j relationships
rows = load_migration_csv_into_postgres('migration_2014.csv')

for row in rows:
    
    # NOTE(review): the query returns (to_country, from_country, refugees), so
    # row[0] is the table's `to_country` column even though it is used here as
    # the relationship source. Confirm which CSV column is the origin country
    # and align the table column names with it.
    from_country = row[0]
    to_country = row[1]
    weight = row[2]
    
    # NOTE(review): int(weight) raises TypeError if `refugees` is NULL (an
    # empty CSV field) — confirm the data contains no such rows.
    my_neo4j_create_relationship_one_way(from_country, to_country, int(weight))


## Explore relationships + algorithms for 2014

In [32]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Execute a Cypher query and hand back its records as a pandas DataFrame.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    The DataFrame has one row per record and one column per result key.
    """
    result = session.run(query, **kwargs)

    # Drain the result stream first, then read the column names from it.
    rows = []
    for record in result:
        rows.append(record.values())

    return pd.DataFrame(rows, columns=result.keys())

In [33]:
def my_neo4j_nodes_relationships():
    """Print all the nodes and relationships, followed by the graph density.

    Both listings are fetched with ``my_neo4j_run_query_pandas`` and shown with
    ``display``. The density is printed rounded to one decimal place and is
    reported as 0.0 when the graph has fewer than two nodes, where the
    formula's denominator would otherwise be zero.
    """

    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")

    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """

    df = my_neo4j_run_query_pandas(query)

    number_nodes = df.shape[0]

    display(df)

    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")

    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """

    df = my_neo4j_run_query_pandas(query)

    number_relationships = df.shape[0]

    display(df)

    # n * (n - 1) is zero for an empty or single-node graph; guard the division.
    possible_pairs = number_nodes * (number_nodes - 1)

    # NOTE(review): 2 * m / (n * (n - 1)) is the density of an undirected graph;
    # the relationships counted above are directed, for which m / (n * (n - 1))
    # is the usual definition — confirm which one is intended.
    if possible_pairs:
        density = (2 * number_relationships) / possible_pairs
    else:
        density = 0.0

    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

In [34]:
# Print the nodes, relationships and density for the loaded 2014 data
my_neo4j_nodes_relationships()

# Observed density is 0.2, i.e. a sparse (low-density) graph

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,ABW,[Country]
1,AFG,[Country]
2,ALB,[Country]
3,ALG,[Country]
4,AND,[Country]
...,...,...
197,WES,[Country]
198,WSH,[Country]
199,YEM,[Country]
200,ZAM,[Country]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels
0,AFG,[Country],MIGRATE_TO,ARG,[Country]
1,AFG,[Country],MIGRATE_TO,AUL,[Country]
2,AFG,[Country],MIGRATE_TO,AUS,[Country]
3,AFG,[Country],MIGRATE_TO,AZE,[Country]
4,AFG,[Country],MIGRATE_TO,BEL,[Country]
...,...,...,...,...,...
3825,ZIM,[Country],MIGRATE_TO,SWA,[Country]
3826,ZIM,[Country],MIGRATE_TO,SWE,[Country]
3827,ZIM,[Country],MIGRATE_TO,SWI,[Country]
3828,ZIM,[Country],MIGRATE_TO,USA,[Country]


-------------------------
  Density: 0.2
-------------------------


## Alg #1: Degree Centrality

High degree centrality identifies the countries connected to the most other countries, i.e. those that produce the most refugee flows or grant asylum most widely.  
In this graph, we see the top countries sending refugees:

In [35]:
# Build the 'ds_graph' projection that the algorithm queries below run against.
# Drop any previous projection first; the `false` argument presumably means
# "do not fail if the graph is missing" — confirm against the GDS documentation.
query = "CALL gds.graph.drop('ds_graph', false) yield graphName"
session.run(query)

# Project the Country nodes and MIGRATE_TO relationships as 'ds_graph'.
query = "CALL gds.graph.project('ds_graph', 'Country', 'MIGRATE_TO')"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f9bd09c0a90>

In [36]:
# Stream the degree score of every node in 'ds_graph' and keep the 8 highest,
# breaking ties alphabetically by name.
query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name
limit 8

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,degree
0,SYR,98.0
1,SOM,96.0
2,IRQ,92.0
3,COD,90.0
4,SUD,81.0
5,IRN,79.0
6,AFG,77.0
7,ERT,71.0


This shows the top 8 countries sending refugees: 
- Syria
- Somalia
- Iraq
- Democratic Republic of the Congo
- Sudan
- Iran
- Afghanistan
- Eritrea

In [None]:
# Query used to generate graph in Neo4j
# (kept as a string for reference; it is not executed in this notebook).
# NOTE(review): `name` is a single string per row, so `n.name IN name` tests
# membership against a scalar; `n.name = name` would state the intent more
# directly — confirm the query behaves as expected on the target Neo4j version.
degree_centrality_graph_query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
WITH gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name
limit 8
MATCH (n)
WHERE n.name IN name
RETURN n


"""

![title](img/picture.png)

## Alg #2: Pagerank

Most influential countries granting asylum 

In [37]:
# Stream PageRank scores for 'ds_graph' and keep the 8 highest, breaking ties
# alphabetically by name. The iteration cap and damping factor are supplied as
# query parameters.
query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC
limit 8

"""

max_iterations = 20
# NOTE(review): 0.05 is far below the commonly used damping factor of 0.85 —
# confirm this value is intentional.
damping_factor = 0.05

my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,USA,2.363713
1,CAN,2.163004
2,GFR,1.535513
3,GBR,1.491807
4,AUL,1.401717
5,SWE,1.32711
6,FRA,1.314922
7,ITA,1.199714


In [None]:
# Query used to generate graph in Neo4j
# (kept as a string for reference; it is not executed in this notebook).
# The PageRank settings are written inline and match the parameters used above.
# NOTE(review): `name` is a single string per row, so `n.name IN name` tests
# membership against a scalar; `n.name = name` would state the intent more
# directly — confirm the query behaves as expected on the target Neo4j version.
pagerank_graph_query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: 20,
                           dampingFactor: 0.05}
                         )
YIELD nodeId, score
WITH gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC
limit 8
MATCH (n)
WHERE n.name IN name
RETURN n


"""

![title](img/picture.png)

## Alg #3: Randomized-Approximate Brandes 
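Exact betweenness centrality can be very time-consuming and expensive because it relies on all-pairs shortest paths, so we use the sampled (randomized-approximate) Brandes variant instead.

High betweenness highlights the intermediate countries that act as places of temporary refuge.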

In [38]:
# Stream sampled betweenness scores for 'ds_graph' and keep the 8 highest.
# The sampling size and seed are supplied as query parameters.
query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: $sampling_size, samplingSeed: $sampling_seed})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC
limit 8

"""

sampling_size = 5
sampling_seed = 0

my_neo4j_run_query_pandas(query, sampling_size=sampling_size, sampling_seed=sampling_seed)


Unnamed: 0,name,betweenness
0,RUS,113.796718
1,SYR,99.127527
2,SUD,59.65812
3,PAK,59.188738
4,ARE,52.714757
5,COD,36.530343
6,TUR,32.327678
7,ETH,29.246055


In [None]:
# Query used to generate graph in Neo4j
# (kept as a string for reference; it is not executed in this notebook).
# Note that the graph displayed may differ from the table above, because the
# betweenness scores come from a sampled approximation.
# NOTE(review): samplingSeed is fixed at 0, which should make the sampling
# repeatable — confirm why different runs were observed to differ.
# NOTE(review): `name` is a single string per row, so `n.name IN name` tests
# membership against a scalar; `n.name = name` would state the intent more
# directly — confirm the query behaves as expected on the target Neo4j version.
ra_brandes_graph_query = """

CALL gds.betweenness.stream('ds_graph', {samplingSize: 5, samplingSeed: 0})
YIELD nodeId, score
WITH gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC
limit 8
MATCH (n)
WHERE n.name IN name
RETURN n


"""

![title](img/picture.png)