# Project notebook

In [1]:
import pandas as pd
import neo4j
import os
import psycopg2

## Neo4j setup

In [2]:
# Open the Neo4j driver. Every setting defaults to the value this notebook has
# always used, but can be overridden through an environment variable so that
# credentials for another deployment do not have to be edited into the notebook.
driver = neo4j.GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI", "neo4j://neo4j:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205"),
    ),
)

In [3]:
# Open a single session on the "neo4j" database. The my_neo4j_* helper
# functions defined below all run their queries through this module-level session.
session = driver.session(database="neo4j")

In [4]:
def my_neo4j_wipe_out_database():
    """wipe out database by deleting all nodes and relationships

    Runs through the module-level `session`. Takes no arguments and returns
    nothing; afterwards the graph contains no nodes and no relationships.
    """
    
    # DETACH DELETE removes each node together with every relationship attached
    # to it, so one statement covers both connected and isolated nodes.
    query = "match (node) detach delete node"
    
    # consume() waits for the statement to finish, so a failure is raised here
    # rather than on a later, unrelated session.run call.
    session.run(query).consume()

In [5]:
def my_neo4j_create_node(country):
    """create a node with label Country

    `country` is sent to Neo4j as the query parameter $country and stored in
    the new node's `name` property. The query runs through the module-level
    `session`; nothing is returned.
    """

    cypher = """
    
    CREATE (:Country {name: $country})
    
    """

    # Parameters are passed separately from the query text, not interpolated into it.
    parameters = {"country": country}
    session.run(cypher, **parameters)
    

In [6]:
def my_neo4j_create_relationship_one_way(from_country, to_country, weight):
    """create relationships one way between two countries with a weight

    Matches the Country nodes whose `name` equals `from_country` and
    `to_country` and creates a LINK relationship pointing from the first to the
    second, storing `weight` in the relationship's `weight` property. The query
    runs through the module-level `session`; nothing is returned.
    """

    cypher = """
    
    MATCH (from:Country), 
          (to:Country)
    WHERE from.name = $from_country and to.name = $to_country
    CREATE (from)-[:LINK {weight: $weight}]->(to)

    
    """

    # All three values travel as query parameters rather than being formatted into the text.
    parameters = {
        "from_country": from_country,
        "to_country": to_country,
        "weight": weight,
    }
    session.run(cypher, **parameters)
    

In [7]:
# Empty the graph before any Country nodes or LINK relationships are created below.
my_neo4j_wipe_out_database()

## Postgres setup

In [8]:
# Connect to Postgres. Each setting defaults to the value this notebook has
# always used; the standard libpq environment variables (PGUSER, PGPASSWORD,
# PGHOST, PGPORT, PGDATABASE) override them, so credentials for another
# deployment do not have to be edited into the notebook.
connection = psycopg2.connect(
    user = os.environ.get("PGUSER", "postgres"),
    password = os.environ.get("PGPASSWORD", "ucb"),
    host = os.environ.get("PGHOST", "postgres"),
    port = os.environ.get("PGPORT", "5432"),
    database = os.environ.get("PGDATABASE", "postgres")
)

In [9]:
# One cursor on the Postgres connection, reused by the SQL cells further down.
cursor = connection.cursor()

## Importing refugee movement data

In [10]:
# Load population.csv from Github
# (tidytuesday data set dated 2023-08-22, read directly from the raw file URL).
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-22/population.csv"
df = pd.read_csv(url)

# Preview the first rows. The cells below use coo / coo_name for the origin
# country, coa / coa_name for the arrival country, and the refugees count.
df.head()

Unnamed: 0,year,coo_name,coo,coo_iso,coa_name,coa,coa_iso,refugees,asylum_seekers,returned_refugees,idps,returned_idps,stateless,ooc,oip,hst
0,2010,Afghanistan,AFG,AFG,Afghanistan,AFG,AFG,0,0,0,351907,3366,0,838250,,
1,2010,Iran (Islamic Rep. of),IRN,IRN,Afghanistan,AFG,AFG,30,21,0,0,0,0,0,,
2,2010,Iraq,IRQ,IRQ,Afghanistan,AFG,AFG,6,0,0,0,0,0,0,,
3,2010,Pakistan,PAK,PAK,Afghanistan,AFG,AFG,6398,9,0,0,0,0,0,,
4,2010,Egypt,ARE,EGY,Albania,ALB,ALB,5,0,0,0,0,0,0,,


## Creating country nodes in Neo4j

In [11]:
# Create origin and arrival countries DF for appending
origin_df = df[['coo_name', 'coo']].rename(columns = {'coo_name': 'country_full', 'coo': 'country'})
arrival_df = df[['coa_name', 'coa']].rename(columns = {'coa_name': 'country_full', 'coa': 'country'})

# Create countries table (nodes): one row per distinct (name, code) pair
nodes_df = pd.concat([origin_df, arrival_df]).drop_duplicates()

# Ensure 1:1 mapping between country names and codes.
# Both columns have to be unique: a repeated name means one name carries two
# codes, and a repeated code means one code carries two names. The code column
# is the one later used as the node name, so a repeated code would produce
# duplicate nodes.
names_unique = nodes_df['country_full'].is_unique
codes_unique = nodes_df['country'].is_unique
print("Nodes DF unique? ", names_unique and codes_unique)

# Specify file paths for saving CSVs (parent of the current working directory)
main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
nodes_path = os.path.join(main_dir, 'nodes.csv')
nodes_df.to_csv(nodes_path, index = False)

Nodes DF unique?  True


In [12]:
# Query into countries table for nodes
# Roll back whatever transaction is currently open on the connection before
# the script below is executed.
connection.rollback()

# One multi-statement script: recreate the countries table, bulk-load it from
# nodes.csv, then select the country codes in sorted order.
# NOTE(review): the COPY path is hard-coded; presumably it is the same file the
# previous cell wrote to <parent of cwd>/nodes.csv, as seen from the Postgres
# server - confirm.
query = """

drop table if exists countries;

create table countries (
  country_full varchar(100),
  country varchar(100)
);

copy countries (country_full, country)
from '/user/projects/project-3-dliang5299/nodes.csv' delimiter ',' NULL '' csv header;

select country
from countries
order by country;

"""

cursor.execute(query)

# NOTE(review): the transaction is rolled back, not committed, before the rows
# are read. Assuming autocommit is off (the psycopg2 default), the countries
# table is therefore not persisted - confirm this is intentional.
connection.rollback()

rows = cursor.fetchall()

# Create one Country node per selected row.
for row in rows:
    
    # The select returns a single column, so row[0] is the country code.
    country = row[0]
    
    my_neo4j_create_node(country)


## Creating migration relationships in Neo4j

In [13]:
# Aggregate migration across all years by country pairs
migration_df = df.groupby(['coo', 'coa'])['refugees'].sum().reset_index()

# coo is the origin country and coa the arrival country, so refugees travel
# FROM coo TO coa. Column order (origin, arrival, refugees) is unchanged; only
# the labels are set to match the direction of travel.
migration_df = migration_df.rename(columns = {'coo': 'from_country', 'coa': 'to_country'})

# Keep only pairs with at least one refugee so that no zero-weight rows are written
migration_df = migration_df[migration_df['refugees'] > 0]

# Specify file paths for saving CSV
migration_path = os.path.join(main_dir, 'migration.csv')
migration_df.to_csv(migration_path, index = False)

# Display summary stats for refugees
migration_df.describe()

Unnamed: 0,refugees
count,5654.0
mean,39431.17
std,580856.7
min,5.0
25%,39.0
50%,144.0
75%,1077.0
max,29257140.0


In [14]:
# Query into migration table for relationships
# Roll back whatever transaction is currently open on the connection before
# the script below is executed.
connection.rollback()

# migration.csv holds the origin code (coo) in its first column and the arrival
# code (coa) in its second. COPY skips the header row and loads by position, so
# the column order below is what gives each value its meaning: the origin is
# named from_country and the arrival to_country, matching the direction in
# which refugees travel.
query = """

drop table if exists migration;

create table migration (
  from_country varchar(100),
  to_country varchar(100),
  refugees numeric(8)
);

copy migration (from_country, to_country, refugees)
from '/user/projects/project-3-dliang5299/migration.csv' delimiter ',' NULL '' csv header;

select from_country, to_country, refugees
from migration
order by from_country, to_country;

"""

cursor.execute(query)

# NOTE(review): the transaction is rolled back, not committed, before the rows
# are read. Assuming autocommit is off (the psycopg2 default), the migration
# table is therefore not persisted - confirm this is intentional.
connection.rollback()

rows = cursor.fetchall()

# Create one LINK relationship per (origin, arrival) pair, weighted by the
# summed refugee count.
for row in rows:
    
    from_country = row[0]
    to_country = row[1]
    weight = row[2]
    
    # The numeric column is converted to a plain int before it is sent to Neo4j.
    my_neo4j_create_relationship_one_way(from_country, to_country, int(weight))
