
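Before running any cell, make sure an SSH tunnel to the Katana server is open on local port 8080 (the port used by `KATANA_SERVER_ADDRESS` in the imports cell). Use the command for your platform:  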
GCP: 
  `gcloud compute ssh --zone us-east1-b --project katana-internal8 katana-controller -- -NL 8080:127.0.0.1:8080 -vvv`
<br> <br>
AWS: 
  `ssh -i ~/.ssh/id_rsa -N -L 8080:54.162.203.170:8080 ubuntu@54.162.203.170 -vvv` 
<br><br>  

In [1]:
# Dataset root locations, one per storage backend. Leave these values as they are;
# to switch backend, point platform_prefix (setup cell below) at gs_prefix or s3_prefix.
gs_prefix = "gs://katana-demo-datasets/csv-datasets/ldbc/"
s3_prefix = "s3://property-graphs/csv/ldbc-4.0/bi/"
s3_sandbox = "s3://graph-test-katana/ldbc/"

In [2]:
# ---------------------------- setup variables ----------------------------

# Number of partitions requested when the graph is created (num_partitions).
partitions = 2
# Storage root for the header files and the CSV input directory:
# use gs_prefix or s3_prefix depending on where the notebook is run.
platform_prefix = gs_prefix


In [3]:
import os                                                                        
import time                                                                      
import json
import pandas as pd
from timeit import default_timer as timer

from katana import remote
from katana.remote import import_data

# Address of the Katana server for this session; port 8080 is the local end of the
# SSH tunnel described at the top of the notebook.
# NOTE(review): presumably read by remote.Client() when it is constructed — confirm.
# NOTE(review): "host.docker.internal" suggests the kernel runs inside a Docker
# container and reaches the tunnel on the host — confirm for other setups.
os.environ["KATANA_SERVER_ADDRESS"] = "host.docker.internal:8080"

In [4]:
# Create the graph that the CSV import below populates, using `partitions` partitions.
graph = remote.Client().create_graph(num_partitions=partitions)

In [5]:
# Node and edge header files live under "headers/" of the selected storage root;
# the storage root itself is passed to the import as the input directory.
test_node_file = f"{platform_prefix}headers/sf-0.003-bi-node_headers.txt"
test_edge_file = f"{platform_prefix}headers/sf-0.003-bi-edge_headers.txt"
input_path = platform_prefix


In [6]:
# Import the CSV dataset into `graph` and report the wall-clock duration.
# timer is timeit.default_timer (see the imports cell).
start = timer()
print("Importing graph from CSV files...")
import_data.csv(
    graph,
    input_node_path=test_node_file,
    input_edge_path=test_edge_file,
    input_dir=input_path,
    files_have_headers=True,
    data_delimiter="|",
    schema_delimiter="|",
    # partition_policy="random-oec",  # optional argument, intentionally left disabled
)
end = timer()
elapsed_seconds = end - start
print(f"  import: {elapsed_seconds:.1f} seconds")

Importing graph from CSV files...


          0/? [?op/s]

  import: 8.5 seconds


In [7]:
%%time
# Node count of the imported graph (%%time has to stay on the first line of the cell).
graph.num_nodes()

          0/? [?op/s]

CPU times: user 63.4 ms, sys: 31.2 ms, total: 94.6 ms
Wall time: 3.14 s


29946

In [8]:
%%time
# Edge count of the imported graph (%%time has to stay on the first line of the cell).
graph.num_edges()

          0/? [?op/s]

CPU times: user 88.3 ms, sys: 24 ms, total: 112 ms
Wall time: 3.13 s


43072

In [9]:
# Query 1: count every node in the graph.
query1 = "match (n) return count(n)"

In [10]:
%%time
# Run query1 and display at most the first 10 entries of the result as a DataFrame.
result = graph.query(query1)
pd.DataFrame(result[0:10])

          0/? [?op/s]

CPU times: user 93.9 ms, sys: 16.4 ms, total: 110 ms
Wall time: 3.17 s


Unnamed: 0,count(n)
0,29946


In [11]:
# Query 2: count the KNOWS edges going from one Person node to another.
# The leading and trailing newlines are kept so the query text is unchanged.
query2 = (
    "\n"
    "MATCH (p:Person)-[e:KNOWS]->(p2:Person) return count(e)"
    "\n"
)

In [12]:
%%time
# Run query2 and display at most the first 10 entries of the result as a DataFrame.
result = graph.query(query2)
pd.DataFrame(result[0:10])

          0/? [?op/s]

CPU times: user 88.6 ms, sys: 19.8 ms, total: 108 ms
Wall time: 2.99 s


Unnamed: 0,count(e)
0,58


In [13]:
# Query 3: selected properties of Person nodes, limited to 10 rows.
# Split over several literals for readability; the concatenated text, including
# the leading and trailing newlines, is unchanged.
query3 = (
    "\n"
    "MATCH (p:Person) "
    "return p.firstName,p.lastName,p.gender,p.birthday,p.locationIP,p.browserUsed,p.email "
    "limit 10"
    "\n"
)


In [14]:
%%time
# Run query3 and display at most the first 10 entries of the result as a DataFrame.
result = graph.query(query3)
pd.DataFrame(result[0:10])

          0/? [?op/s]

CPU times: user 103 ms, sys: 24.9 ms, total: 127 ms
Wall time: 2.96 s


Unnamed: 0,p.birthday,p.browserUsed,p.email,p.firstName,p.gender,p.lastName,p.locationIP
0,1983-05-28,Internet Explorer,Almira26388279066641@gmail.com,Almira,female,Patras,193.104.227.215
1,1981-05-30,Safari,Ali28587302322209@gmail.com;Ali28587302322209@...,Ali,female,Ferrer,41.223.177.12
2,1981-09-17,Chrome,Miguel32@gmail.com;Miguel32@gmail.com;Miguel32...,Miguel,male,Gonzalez,148.204.226.31
3,1988-08-23,Chrome,Celso13194139533352@gmx.com;Celso1319413953335...,Celso,male,Oliveira,192.190.31.253
4,1986-10-12,Internet Explorer,Ken10995116277782@zoho.com;Ken10995116277782@g...,Ken,male,Yamada,27.122.45.252
5,1984-04-10,Internet Explorer,Alfonso32985348833318@gmx.com;Alfonso329853488...,Alfonso,female,Rodriguez,192.100.158.95
6,1987-01-10,Safari,Arbaaz2199023255573@gmx.com;Arbaaz219902325557...,Arbaaz,female,Ali,115.186.113.189
7,1985-02-11,Internet Explorer,Wei24189255811109@yahoo.com;Wei24189255811109@...,Wei,female,Wei,27.112.88.106
8,1987-12-04,Firefox,Mehmet15393162788877@gmail.com;Mehmet153931627...,Mehmet,female,Koksal,46.154.44.101
9,1985-12-07,Internet Explorer,Wojciech17592186044443@gmail.com;Wojciech17592...,Wojciech,male,Ciesla,31.182.127.125
