In [1]:
import os                                                                        
import time                                                                      
import json
import numpy as np
import pandas as pd
import dask as dd
from timeit import default_timer as timer
import gcsfs, json
from katana import remote
from katana.remote import import_data
from datetime import datetime
from dask.dataframe import from_pandas
from dask.dataframe import to_numeric
# os.environ["KATANA_SERVER_ADDRESS"] = "host.docker.internal:8080"

In [2]:
from dask.distributed import Client, progress

# Connect to the Dask scheduler. The address can be overridden through the
# DASK_SCHEDULER_ADDRESS environment variable; when it is not set, the original
# cluster address is used, so existing runs behave exactly as before.
client = Client(os.environ.get("DASK_SCHEDULER_ADDRESS", "10.0.0.101:8786"))

In [3]:
# Show the "distributed.client" section of the dask configuration.
# Note: `dd` is the alias the imports cell gives to the top-level `dask`
# package (`import dask as dd`), not to `dask.dataframe`.
print(dd.config.get("distributed.client"))

{'heartbeat': '5s', 'scheduler-info-interval': '2s', 'security-loader': None, 'preload': [], 'preload-argv': []}


In [4]:
import dask.bag as db
import json
# Bucket prefix under which the CSV export is stored.
base_path = 'gs://airport-ops/csv/'
# Load the CSV as a bag of raw text lines (not yet parsed into columns).
# NOTE(review): read_text yields one element per line, so a quoted CSV field
# that contains a line break would presumably arrive as several separate
# records - confirm the export has no multi-line fields.
b = db.read_text(base_path+'Complaints_Suggestions_Compliments_2022.csv', blocksize="1024MiB")
# Peek at the first element to inspect the raw line format.
b.take(1)

('\ufeffCase Number,Case Origin,Issue Terminal,Case Record Type,Case Type,Category (F),Sub-Category (F),Issue Category,Subject,Description,Date/Time Opened,Case Owner\n',)

In [17]:
def standardize(record):
    """Parse one raw CSV line into a dictionary of case fields.

    Parameters
    ----------
    record : str
        A single line of the CSV file. Non-string values are converted with
        ``str`` before parsing.

    Returns
    -------
    dict
        Always contains the keys ``case_number``, ``case_origin``,
        ``terminal``, ``case_record_type``, ``case_type``, ``category``,
        ``sub_category``, ``issue_category``, ``subject``, ``description``,
        ``opened_time``, ``case_owner`` and ``parse_issue`` (in that order).

        * ``opened_time`` is a ``datetime`` when column 10 matches
          ``dd/mm/YYYY HH:MM`` or ``dd/mm/yy HH:MM``; otherwise it is ``''``.
        * ``parse_issue`` is ``'Parsed successfully'`` on success, otherwise a
          message describing what could not be parsed.

    Notes
    -----
    A line with fewer than 12 columns no longer raises ``IndexError``: the
    missing columns are filled with ``''`` and the problem is reported in
    ``parse_issue``. Columns beyond the twelfth are ignored.
    """
    import csv, sys
    from datetime import datetime

    # CSV columns 0-9, in file order; column 10 is the opened timestamp and
    # column 11 the case owner.
    text_fields = (
        'case_number', 'case_origin', 'terminal', 'case_record_type',
        'case_type', 'category', 'sub_category', 'issue_category',
        'subject', 'description',
    )
    opened_time_column = 10
    case_owner_column = 11
    expected_columns = 12
    # e.g. "22/10/2022 23:12", then the two-digit-year variant "22/10/22 23:12"
    opened_time_formats = ('%d/%m/%Y %H:%M', '%d/%m/%y %H:%M')

    # Lift the csv module's per-field size cap so long free-text fields parse.
    csv.field_size_limit(sys.maxsize)
    # Only one line is supplied, so only the first parsed row is relevant.
    parsed_record = next(csv.reader([str(record)]), [])

    column_count = len(parsed_record)
    # Pad short rows so every output key can be filled without an IndexError.
    columns = list(parsed_record) + [''] * (expected_columns - column_count)

    opened_time = ''
    if column_count < expected_columns:
        parse_issue = (
            f'issue parsing record: expected {expected_columns} fields, '
            f'found {column_count}'
        )
    else:
        raw_opened_time = columns[opened_time_column]
        parse_issue = f'issue parsing opened time {raw_opened_time}'
        for time_format in opened_time_formats:
            try:
                opened_time = datetime.strptime(raw_opened_time, time_format)
            except ValueError:
                # Timestamp does not match this format; try the next one.
                continue
            parse_issue = 'Parsed successfully'
            break

    parsed = {name: columns[index] for index, name in enumerate(text_fields)}
    parsed['opened_time'] = opened_time
    parsed['case_owner'] = columns[case_owner_column]
    parsed['parse_issue'] = parse_issue
    return parsed

def isHeader(record):
    """Tell whether a parsed record is the CSV header row.

    A record counts as the header when its ``case_number`` value contains
    ``'Case Number'`` and its ``case_origin`` value contains ``'Case Origin'``.
    Substring matching is used, so leading characters before the heading text
    do not prevent a match.
    """
    # Guard clause: the second column is only inspected if the first matches.
    if 'Case Number' not in record['case_number']:
        return False
    return 'Case Origin' in record['case_origin']

def hasParseError(record):
    """Tell whether a parsed record was flagged with a parsing problem.

    Returns ``False`` when the record's ``parse_issue`` value contains
    ``'Parsed successfully'`` and ``True`` for any other value.
    """
    return 'Parsed successfully' not in record['parse_issue']

# Parse every raw line, drop the header row and any record flagged with a
# parse issue, then convert to a dataframe with explicit column dtypes and
# expose the case number column under the name "ID".
final_df = (
    b.map(standardize)
    .remove(isHeader)
    .remove(hasParseError)
    .to_dataframe()
    .astype({
        'case_number': 'string',
        'case_origin': 'string',
        'terminal': 'string',
        'case_record_type': 'string',
        'case_type': 'string',
        'category': 'string',
        'sub_category': 'string',
        'issue_category': 'string',
        'subject': 'string',
        'description': 'string',
        'opened_time': 'M8[us]',
        'case_owner': 'string',
        'parse_issue': 'string',
    })
    .rename(columns={"case_number": "ID"})
)


In [18]:
# Number of rows in final_df.
len(final_df)

5283

In [12]:
# Number of distinct ID values; compare with the row count of final_df to
# check that every case ID occurs only once.
len(final_df.ID.unique())

5283

### Import graph

In [14]:
# Create the Katana remote client and display the server version it reports.
# NOTE(review): no address is passed here; presumably it is taken from the
# environment (cf. the commented-out KATANA_SERVER_ADDRESS line in the imports
# cell) - confirm.
rc=remote.Client()
rc.server_version

'0.5.0+20221106T212601Z.9c14e060f.dev'

In [15]:
# Create a graph with two partitions and display its id.
# NOTE(review): re-running this cell presumably creates another graph and
# rebinds `graph` to it - confirm before re-executing.
graph = rc.create_graph(num_partitions=2)
graph.graph_id

'BFjWRQC46aWzT8vzoM26Jw3xREctoSjPS1o4RABJbpBc'

In [19]:

# Import final_df into the graph as nodes labelled CASE, using the "ID" column
# as the node identifier within the CASE id space.
with import_data.DataFrameImporter(graph) as df_importer:
    df_importer.nodes_dataframe(final_df, id_column="ID", id_space='CASE', label='CASE')
    # Nothing else is registered for import; run the insert.
    df_importer.insert()

          0/? [?op/s]

          0/? [?op/s]

In [21]:
# Sanity check after the import: fetch up to 10 CASE nodes and render them.
query="""
MATCH(c:CASE)
return c limit 10
"""
graph.query(query).view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

### Promote Category to a Node

In [23]:
# Create one CATEGORY node for each distinct value of CASE.category and copy
# that value into the new node's `category` property.
# NOTE(review): the query uses CREATE, so running this cell more than once
# would presumably add duplicate CATEGORY nodes - confirm before re-executing.
query="""
MATCH(c:CASE)
with distinct c.category as category
CREATE(cat:CATEGORY)
set cat.category=category
"""
graph.query(query).view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [None]:
# Link categories: check.
# Read-only preview of the pairing used to link cases to categories: lists up
# to 10 CASE/CATEGORY pairs whose `category` values are equal, ordered by case ID.
query="""
MATCH (cas:CASE),(cat:CATEGORY)
WHERE EXISTS(cas.category)
  and cas.category = cat.category
return cas.ID, cas.category,id(cat), cat.category order by cas.ID limit 10
"""
graph.query(query).view()


In [56]:
# Link categories: establish.
# Create a HAS_CATEGORY edge from every CASE to each CATEGORY node whose
# `category` value equals the case's own.
# NOTE(review): the query uses CREATE, so running this cell more than once
# would presumably add duplicate edges - confirm before re-executing.
query="""
MATCH (cas:CASE),(cat:CATEGORY)
WHERE EXISTS(cas.category)
  and cas.category = cat.category
CREATE (cas)-[:HAS_CATEGORY]->(cat)
"""
graph.query(query).view()


          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [58]:
# Link categories: check.
# Fetch up to 10 CASE -> CATEGORY pairs connected by a HAS_CATEGORY edge and
# render them.
query="""
MATCH (cas:CASE)-[:HAS_CATEGORY]->(cat:CATEGORY)
return cas, cat limit 10"""
graph.query(query, contextualize=True).view()


          0/? [?op/s]

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…

In [59]:
# Render the graph schema.
graph.schema().view()

          0/? [?op/s]

VBox(children=(HTML(value='\n                <style>\n                #jp-main-content-panel .widget-container…