# Graph preprocessing

In [1]:
import numpy as np
import pandas as pd
import neo4j
import os
import psycopg2

## Importing refugee movement data

In [2]:
# Read the TidyTuesday (2023-08-22) population.csv straight from GitHub
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-08-22/population.csv"
df = pd.read_csv(filepath_or_buffer=url)
print(df.shape)

# Preview the first five rows
df.head(n=5)

(64809, 16)


Unnamed: 0,year,coo_name,coo,coo_iso,coa_name,coa,coa_iso,refugees,asylum_seekers,returned_refugees,idps,returned_idps,stateless,ooc,oip,hst
0,2010,Afghanistan,AFG,AFG,Afghanistan,AFG,AFG,0,0,0,351907,3366,0,838250,,
1,2010,Iran (Islamic Rep. of),IRN,IRN,Afghanistan,AFG,AFG,30,21,0,0,0,0,0,,
2,2010,Iraq,IRQ,IRQ,Afghanistan,AFG,AFG,6,0,0,0,0,0,0,,
3,2010,Pakistan,PAK,PAK,Afghanistan,AFG,AFG,6398,9,0,0,0,0,0,,
4,2010,Egypt,ARE,EGY,Albania,ALB,ALB,5,0,0,0,0,0,0,,


## Split Data into Individual Years
- **Analyze 3 single years (2014, 2018, 2022): build one graph per year, each 4 years apart, to see trends**

In [3]:
# One DataFrame per year of interest, filtered from the full table
df_2014 = df[df['year'].eq(2014)]
df_2018 = df[df['year'].eq(2018)]
df_2022 = df[df['year'].eq(2022)]

# Sanity check: each slice must be non-empty and hold exactly its own year
assert all(
    set(year_slice['year'].unique()) == {expected_year}
    for expected_year, year_slice in ((2014, df_2014), (2018, df_2018), (2022, df_2022))
)

In [4]:
# (rows, columns) of each yearly slice, one per line
print(df_2014.shape, df_2018.shape, df_2022.shape, sep="\n")

(4746, 16)
(5264, 16)
(5930, 16)


## Preprocess refugee movement data to map countries to their country code (ex. Afghanistan -> AFG, Iran (Islamic Rep. of) -> IRN): for 2014, 2018, 2022

In [5]:
def preprocess_country_nodes_to_csv(df: pd.DataFrame, output_file_path: str) -> None:
    """Build the country node table (full name + country code) and save it as a CSV.

    Origin (``coo_name``/``coo``) and asylum (``coa_name``/``coa``) pairs are stacked,
    renamed to ``country_full``/``country`` and de-duplicated. The outcome of a 1:1
    name <-> code check is printed; it is reported, not enforced, so the CSV is
    written either way.

    Args:
        df (pd.DataFrame): Refugee data with ``coo_name``, ``coo``, ``coa_name`` and ``coa``
            columns. Will contain the split data for individual years (2014, 2018, 2022).
        output_file_path (str): File path to save the country nodes table to. A relative
            path is resolved against the parent of the current working directory.
    Returns:
        None.
    """
    # Stack origin and asylum countries under a shared pair of column names
    origin_df = df[['coo_name', 'coo']].rename(columns = {'coo_name': 'country_full', 'coo': 'country'})
    asylum_df = df[['coa_name', 'coa']].rename(columns = {'coa_name': 'country_full', 'coa': 'country'})

    # Create countries table (nodes): one row per distinct (name, code) pair
    nodes_df = pd.concat([origin_df, asylum_df]).drop_duplicates()

    # Once (name, code) pairs are de-duplicated, the mapping is 1:1 only if NEITHER
    # column repeats: a repeated name means one name with several codes, a repeated
    # code means one code with several names. Checking names alone misses the latter.
    is_one_to_one = bool(nodes_df['country_full'].is_unique and nodes_df['country'].is_unique)
    print("Nodes DF unique? ", is_one_to_one)

    # Save next to the notebook's parent directory
    main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
    nodes_path = os.path.join(main_dir, output_file_path)
    nodes_df.to_csv(nodes_path, index = False)

In [6]:
# Write one country-node CSV per analysed year
preprocess_country_nodes_to_csv(df=df_2014, output_file_path='nodes_2014.csv')
preprocess_country_nodes_to_csv(df=df_2018, output_file_path='nodes_2018.csv')
preprocess_country_nodes_to_csv(df=df_2022, output_file_path='nodes_2022.csv')

Nodes DF unique?  True
Nodes DF unique?  True
Nodes DF unique?  True


## Create migration movement relationships for 2014, 2018, 2022

In [7]:
def create_migration_movement_relationship_to_csv(df: pd.DataFrame, output_file_path: str) -> None:
    """Create migration movement relationships for countries paired via origin and asylum.
    Saves the resulting migration relationships table to CSV file specified by output_file_path
    and prints summary statistics of the saved table.

    Each output row is one (``from_country``, ``to_country``) pair with the sum of
    ``refugees`` over all rows of ``df`` for that pair; pairs whose total is not
    positive are dropped.

    Args:
        df (pd.DataFrame): Refugee data with ``coo``, ``coa`` and ``refugees`` columns.
            Will contain the split data for individual years (2014, 2018, 2022).
        output_file_path (str): File path to save the migration relationships table to.
            A relative path is resolved against the parent of the current working directory.
    Returns:
        None.
    """
    # Total refugees per (origin, asylum) country pair over every row passed in
    migration_df = (
        df.groupby(['coo', 'coa'])['refugees']
        .sum()
        .reset_index()
        .rename(columns = {'coo': 'from_country', 'coa': 'to_country'})
    )

    # Keep only pairs with actual refugee movement
    migration_df = migration_df[migration_df['refugees'] > 0]

    # Specify file paths for saving CSV
    main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
    migration_path = os.path.join(main_dir, output_file_path)
    migration_df.to_csv(migration_path, index = False)

    # Display summary stats for refugees. This must be printed explicitly: a bare
    # expression inside a function is evaluated and discarded, so the notebook
    # would otherwise show nothing.
    print(migration_df.describe())

In [8]:
# Write one migration-edge CSV per analysed year
create_migration_movement_relationship_to_csv(df=df_2014, output_file_path='migration_2014.csv')
create_migration_movement_relationship_to_csv(df=df_2018, output_file_path='migration_2018.csv')
create_migration_movement_relationship_to_csv(df=df_2022, output_file_path='migration_2022.csv')