In [1]:
import json
import os
from typing import List, Optional

import numpy as np
import pandas as pd
from neo4j import GraphDatabase

  from pandas.core import (


In [12]:
class Table:
    """A table of per-area counts loaded from one or more CSV files.

    Every CSV is expected to contain a ``SED_CODE_2021`` column, which is the
    join key used when several files are combined and when two tables are
    compared.

    Attributes mirror the constructor arguments (``code``, ``name``, ``paths``,
    ``level1_name``, ``level1``, ``aggregates``); ``df`` holds the loaded data.
    """

    # Join-key column shared by every CSV and used by merge/melt below.
    _KEY = 'SED_CODE_2021'

    def __init__(self, code: str, name: str, paths: List[str], level1_name: str, level1: List[str], aggregates: List[str]):
        """Store the metadata and load ``paths`` into ``self.df``.

        Args:
            code: Short identifier of the table.
            name: Human-readable table name.
            paths: CSV files to read; the first is loaded and each further
                file is merged onto it on ``SED_CODE_2021`` (pandas' default
                inner join). Must contain at least one path.
            level1_name: Label describing the level-1 categories.
            level1: Names of the category columns in the data.
            aggregates: Names of aggregate columns; the first one is the
                denominator used by ``summary_stats``.

        Raises:
            IndexError: If ``paths`` is empty.
        """
        self.code = code
        self.name = name
        self.paths = paths
        self.level1_name = level1_name
        self.level1 = level1
        self.aggregates = aggregates
        self.df = pd.read_csv(paths[0])
        # Slicing an only-one-element list yields nothing, so no length check is needed.
        for extra_path in paths[1:]:
            self.df = pd.merge(self.df, pd.read_csv(extra_path), on=self._KEY)

    def summary_stats(self) -> pd.DataFrame:
        """Add one percentage column per level-1 category and return ``self.df``.

        For each ``col`` in ``level1`` a column named
        ``"<col>_by_<aggregates[0]>"`` is written to ``self.df`` holding
        ``col / aggregates[0] * 100``. The frame is modified in place and the
        same object is returned; calling the method again recomputes the same
        columns.
        """
        denominator = self.aggregates[0]
        for col in self.level1:
            # NOTE: rows whose denominator is 0 produce inf/NaN rather than an error.
            self.df[f"{col}_by_{denominator}"] = self.df[col] / self.df[denominator] * 100
        return self.df

    def correlation(self, table: 'Table') -> float:
        """Return the mean absolute pairwise correlation with another table.

        The level-1 columns of both tables are joined on ``SED_CODE_2021``, the
        correlation matrix of the combined columns is computed, and the mean
        of the absolute off-diagonal entries is returned. Pairs within the
        same table are part of that mean, as are cross-table pairs.

        Each frame is reduced to its key and level-1 columns *before* the
        merge. Merging the full frames and selecting afterwards fails with
        ``KeyError`` whenever both tables use the same column name, because
        the merge renames such columns with ``_x``/``_y`` suffixes.

        Args:
            table: The table to compare against.

        Returns:
            The score as a float; ``nan`` if there are no off-diagonal entries.
        """
        key = self._KEY
        # summary_stats() is still called so both frames gain their percentage
        # columns as before; the correlation itself uses the raw level-1 columns.
        left = self.summary_stats()[[key] + self.level1]
        right = table.summary_stats()[[key] + table.level1]
        merged_df = pd.merge(left, right, on=key, suffixes=('_x', '_y')).drop(columns=key)
        corr = merged_df.corr()
        # Mask out the diagonal (each column's correlation with itself).
        off_diagonal = corr.values[~np.eye(corr.shape[0], dtype=bool)]
        return float(np.mean(np.abs(off_diagonal)))

    def melt(self) -> pd.DataFrame:
        """Return the level-1 columns in long format.

        The result has one row per (``SED_CODE_2021``, category) pair with the
        columns ``SED_CODE_2021``, ``l1_category`` and ``population``.
        """
        return pd.melt(
            self.df,
            id_vars=[self._KEY],
            value_vars=self.level1,
            var_name='l1_category',
            value_name='population',
        )

    def to_json(self) -> str:
        """Serialise the table metadata and melted data to a JSON string.

        ``data_level1`` is itself a JSON *string* (the melted records), so it
        is encoded twice in the returned document; consumers must decode it
        separately if they need the records.
        """
        records_json = self.melt().to_json(orient='records', indent=4)

        final_json = {
            "code": self.code,
            "name": self.name,
            "level1_name": self.level1_name,
            "level1": self.level1,
            "data_level1": records_json
        }

        return json.dumps(final_json, indent=4)

    def __str__(self) -> str:
        """Return ``"Table <code>: <name>"``."""
        return f"Table {self.code}: {self.name}"


In [13]:
import pickle

# Load the previously pickled Table objects.
# SECURITY: pickle.load can execute arbitrary code, so only load files you
# produced yourself. The Table class must already be defined for unpickling.
# The context manager closes the file handle, which a bare open() call leaks.
with open('../data/table_objects.p', 'rb') as fh:
    all_tables = pickle.load(fh)

In [25]:
# List the code and level-1 label of every loaded table.
# TODO (from original note "add 60, 21, 37"): presumably tables 60, 21 and 37
# still need to be added — confirm with the author.
for tbl in all_tables:
    print(f"{tbl.code}: {tbl.level1_name}")

g04: Age (45+ yr old)
g17: Weekly personal income
g18: Need for assistance
g19: Long-term health condition
g33: Weekly household income
g54: Industry of employment


In [15]:
# Connection settings for the Neo4j instance.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited into (or committed with) the notebook. The literals
# are kept only as backward-compatible defaults for the local test database.
# NOTE(review): consider dropping the password default once NEO4J_PASSWORD is
# set everywhere this notebook runs.
uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "AdaptTest")

driver = GraphDatabase.driver(uri, auth=(user, password))

In [21]:
def insert_data(session, table_json):
    """Create or update the ``:Table`` node described by ``table_json``.

    The node is matched on ``code`` alone and the remaining properties are
    written with ``SET``. Putting every property in the ``MERGE`` pattern
    would create a second node for the same code as soon as any property
    (for example the data payload) differs from what is stored.

    Args:
        session: An object with a ``run(query, **parameters)`` method, such
            as a Neo4j driver session.
        table_json: JSON document with the keys ``code``, ``name``,
            ``level1_name``, ``level1`` and ``data_level1``.

    Raises:
        KeyError: If one of the expected keys is missing from the document.
    """
    table = json.loads(table_json)
    query = """
    MERGE (n:Table {code: $code})
    SET n.name = $name,
        n.level1_name = $level1_name,
        n.level1 = $level1,
        n.data_level1 = $data_level1
    """
    session.run(query,
                code=table['code'],
                name=table['name'],
                level1_name=table['level1_name'],
                level1=table['level1'],
                data_level1=table['data_level1'])

In [22]:
# Write one Table node per loaded table.
# The driver is deliberately NOT closed here: the edge-insertion cell further
# down opens another session on the same driver and closes it when finished.
# Closing it at this point would leave that later cell using a closed driver.
with driver.session() as session:
    for t in all_tables:
        insert_data(session, t.to_json())

  with driver.session() as session:


In [23]:
def insert_edge(session, code1, code2, correlation):
    """Create or update the ``CORRELATED_TO`` edge between two tables.

    The relationship is merged on its endpoints and type only, and the
    score is written with ``SET``. Keeping ``correlation`` inside the
    ``MERGE`` pattern would add a second edge between the same two nodes
    whenever the computed value changes between runs.

    Nothing is written if either ``:Table`` node does not exist, because the
    ``MATCH`` then yields no rows.

    Args:
        session: An object with a ``run(query, **parameters)`` method, such
            as a Neo4j driver session.
        code1: ``code`` of the source table node.
        code2: ``code`` of the target table node.
        correlation: Score stored on the relationship.
    """
    query = """
    MATCH (t1:Table {code: $code1}), (t2:Table {code: $code2})
    MERGE (t1)-[r:CORRELATED_TO]->(t2)
    SET r.correlation = $correlation
    """
    session.run(query, code1=code1, code2=code2, correlation=correlation)

In [24]:
# Connect every ordered pair of distinct tables with its correlation score,
# so each pair ends up with one edge in each direction.
with driver.session() as session:
    for t1 in all_tables:
        for t2 in all_tables:
            if t1.code == t2.code:
                continue  # never link a table to itself
            score = t1.correlation(t2)
            insert_edge(session, t1.code, t2.code, score)

# Last use of the driver in this notebook.
driver.close()

  with driver.session() as session:
