## Refugee Movement Around the World Analysis

https://github.com/rfordatascience/tidytuesday/blob/master/data/2023/2023-08-22/readme.md

In [1]:
import neo4j
import pandas as pd
from IPython.display import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import data

In [2]:
population = pd.read_csv('population.csv')

In [3]:
population[population['coo'] == 'USA'].head(2)

Unnamed: 0,year,coo_name,coo,coo_iso,coa_name,coa,coa_iso,refugees,asylum_seekers,returned_refugees,idps,returned_idps,stateless,ooc,oip,hst
132,2010,United States of America,USA,USA,Argentina,ARG,ARG,5,0,0,0,0,0,0,,
237,2010,United States of America,USA,USA,Australia,AUL,AUS,8,0,0,0,0,0,0,,


In [4]:
# Map country three letter codes to names (e.g., USA == United States of America) 
origin_country_dict = dict(zip(population['coo'], population['coo_name']))
asylum_country_dict = dict(zip(population['coa'], population['coa_name']))
country_name_dict = origin_country_dict | asylum_country_dict
print(f'{len(country_name_dict)} unique countries in dataset, with \
{len(origin_country_dict)} origin and {len(asylum_country_dict)} asylum.')

212 unique countries in dataset, with 210 origin and 189 asylum.


## Connect, login, create driver, create session; with community edition, we can only use 1 database, the "neo4j" database

In [5]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [6]:
session = driver.session(database="neo4j")

## Functions to Manage neo4j data

In [7]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [8]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [9]:
def my_neo4j_nodes_relationships():
    "print all the nodes and relationships"
   
    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    display(df)
    
    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    display(df)
    
    density = (2 * number_relationships) / (number_nodes * (number_nodes - 1))
    
    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

## Nodes and Relationships

In [10]:
my_neo4j_wipe_out_database()

In [11]:
# Loop to create node per country
# Template --> (usa:Country {name: 'United States of America'}),
create_country_nodes_query = 'CREATE'
for c_code, c_name in origin_country_dict.items():
    temp_string = "(" + c_code + ":Country {name: \"" + c_name + "\"}),"
    create_country_nodes_query = create_country_nodes_query + temp_string

create_country_nodes_query = create_country_nodes_query[:-1] # Drop trailing comma

In [12]:
# create_country_nodes_query

In [13]:
# Run query to create node per country
session.run(create_country_nodes_query)

<neo4j._sync.work.result.Result at 0x7f76d86f8ee0>

In [14]:
# Confirm that country nodes created successfully
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,Afghanistan,[Country]
1,Albania,[Country]
2,Algeria,[Country]
3,Andorra,[Country]
4,Angola,[Country]
...,...,...
205,Viet Nam,[Country]
206,Western Sahara,[Country]
207,Yemen,[Country]
208,Zambia,[Country]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels


-------------------------
  Density: 0.0
-------------------------


In [15]:
# Loop to create node per year
# Template --> (Y2010:Year),
# NOTE: neo4j requires names begin with a letter
#temp_string = "(\"" + str(year) + "\":Year {name: \"" + str(year) + "\"}),"
create_year_nodes_query = 'CREATE'
for year in population.year.unique():
    temp_string = "(Y" + str(year) + "),"
    temp_string = "(Y" + str(year) + ":Year {name: " + str(year) + "}),"
    create_year_nodes_query = create_year_nodes_query + temp_string

create_year_nodes_query = create_year_nodes_query[:-1] # Drop trailing comma

In [16]:
create_year_nodes_query

'CREATE(Y2010:Year {name: 2010}),(Y2011:Year {name: 2011}),(Y2012:Year {name: 2012}),(Y2013:Year {name: 2013}),(Y2014:Year {name: 2014}),(Y2015:Year {name: 2015}),(Y2016:Year {name: 2016}),(Y2017:Year {name: 2017}),(Y2018:Year {name: 2018}),(Y2019:Year {name: 2019}),(Y2020:Year {name: 2020}),(Y2021:Year {name: 2021}),(Y2022:Year {name: 2022})'

In [17]:
# Run query to create node per country
session.run(create_year_nodes_query)

<neo4j._sync.work.result.Result at 0x7f76d8716220>

In [18]:
# Confirm that year nodes created successfully
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------


Unnamed: 0,node_name,labels
0,Afghanistan,[Country]
1,Albania,[Country]
2,Algeria,[Country]
3,Andorra,[Country]
4,Angola,[Country]
...,...,...
218,2018,[Year]
219,2019,[Year]
220,2020,[Year]
221,2021,[Year]


-------------------------
  Relationships:
-------------------------


Unnamed: 0,node_name_1,node_1_labels,relationship_type,node_name_2,node_2_labels


-------------------------
  Density: 0.0
-------------------------
