# Requirements

##### Requirements are found [here](https://....)

---

## Connect to the cluster

Run this command in a separate terminal to forward port 8080 from the Katana controller node to your local machine:

```bash
gcloud compute ssh --zone us-east1-b --project katana-internal14 katana-controller -- -NL 8080:127.0.0.1:8080 -vvv
```

Connect the local Docker container to the Katana controller:

In [None]:
import os                                                                        
import time                                                                      
import pandas as pd
from timeit import default_timer as timer

from katana import remote
from katana.remote import import_data
from katana.remote.import_data import Operation
# Address of the Katana server as seen from inside the local Docker container:
# the port forwarded from the controller (8080) reached through the Docker host.
# Presumably read by katana.remote.Client when it connects -- confirm.
os.environ.update({"KATANA_SERVER_ADDRESS": "host.docker.internal:8080"})

# Notebook-level settings. Neither name is referenced by the cells visible in
# this notebook; they are kept for any code that relies on them.
cluster_name = "katana-internal14"
num_partitions = 1

In [None]:
# Build the dashboard widget from a fresh client. As the last expression of the
# cell, its value is what the notebook displays.
remote.Client().widgets().dashboard()

## Initial Load 

Specify input and output locations on cloud storage

In [None]:
# Create a new graph through the remote client and keep its id; a later cell
# uses `graph_id` to build the storage path for loading the graph locally.
graph = remote.Client().create_graph()
graph_id = graph.graph_id

In [None]:
# Show every graph the client reports, followed by the id of the graph in use.
known_graphs = remote.Client().graphs()
print(known_graphs)

# To reuse an existing graph instead of the one created earlier, uncomment:
#   graph = remote.Client().graphs()[0]
#   graph_id = graph.graph_id
print(graph_id)

In [None]:
# Import the synthetic entity-resolution dataset from cloud storage into
# `graph` and report how long the import took.
dataset_dir = "gs://katana-demo-datasets/entity-resolution/synthetic"

start = timer()
print("Importing graph from CSV files...")
import_data.csv(
    graph,
    operation=Operation.Insert,
    # node_list.txt / edge_list.txt presumably enumerate the node and edge CSV
    # files found under `input_dir` -- confirm against the importer docs.
    input_node_path=f"{dataset_dir}/node_list.txt",
    input_edge_path=f"{dataset_dir}/edge_list.txt",
    input_dir=dataset_dir,
    files_have_headers=True,
    data_delimiter=",",
    schema_delimiter=",",
)
end = timer()
print(f"  import: {end - start:.1f} seconds")

In [None]:
# Print the graph's node and edge counts and time both lookups together.
start = timer()
print(f"  num nodes: {graph.num_nodes()}")
print(f"  num edges: {graph.num_edges()}")
end = timer()
# Despite the "count nodes" label, the elapsed time covers both calls above.
print(f"  count nodes: {end - start:.1f} seconds")

In [None]:
# test query
# Pairs of Account nodes (n, m) that both point at a common node p,
# capped at 200 rows.
test_query = """
match (n:Account)-->(p)<--(m:Account)
return * order by n limit 200
"""

start = timer()
result = graph.query(test_query)
end = timer()

print(f"  result: {result[0:2]} ... ")
print(f"  time: {end - start:.1f} seconds")
# Last expression of the cell: the notebook displays the visualization.
result.visualize()

In [None]:
# Score candidate duplicate accounts and show the result as a DataFrame.
#
# The query pairs Account nodes n and m that share at least one neighbour p
# (id(n) < id(m) keeps each unordered pair once) and computes:
#   w1 = 1 if the first names match, else 0
#   w2 = 1 if the last names match, else 0
#   w3 = number of shared neighbours
# It returns the 20 rows that sort last by l_account, with score = w1+w2+w3.
start = timer()
result = graph.query("""
match (n:Account)-->(p)<--(m:Account) where id(n) < id(m)
with distinct n,m, 
case when n.first_name = m.first_name then 1 else 0 end as w1,
case when n.last_name = m.last_name then 1 else 0 end as w2,
collect(p.id) as similar,
count(p) as w3
return distinct n.first_name+ " "+n.last_name+"("+n.id+")" as l_account, m.first_name+" "+m.last_name+"("+m.id+")" as r_account,
similar, w1+w2+w3 as score order by l_account desc limit 20 
""")
end = timer()

# Flatten each row once and concatenate in a single call: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
frames = [pd.json_normalize(row) for row in result]
# pd.concat raises on an empty list, so fall back to an empty frame.
totaldf = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(f"  time: {end - start:.1f} seconds")
# Last expression of the cell: display the table (index already 0..n-1).
totaldf

In [None]:
# For every pair of Account nodes sharing a neighbour, compute the same
# w1 + w2 + w3 score as the scoring cell and, when it exceeds 2, create a
# SIMILAR edge from n to m carrying that score.
similar_edges_query = """
match (n:Account)-->(p)<--(m:Account) where id(n) < id(m)
with distinct n,m, 
case when n.first_name = m.first_name then 1 else 0 end as w1,
case when n.last_name = m.last_name then 1 else 0 end as w2,
count(p) as w3
where w1+w2+w3 > 2
create (n)-[r:SIMILAR {score: w1+w2+w3}]->(m)
"""

start = timer()
result = graph.query(similar_edges_query)
end = timer()

print(f"  result: {result[0:2]}")
print(f"  time: {end - start:.1f} seconds")

In [None]:
# Fetch every SIMILAR edge between Account nodes and draw the result.
similar_pairs_query = """
match (n:Account)-[r:SIMILAR]->(m:Account) 
return *
"""
result = graph.query(similar_pairs_query)
# Last expression of the cell: the notebook displays the visualization.
result.visualize()

In [None]:
# Run Louvain clustering on the remote graph, weighting edges by their "score"
# property (the property set on SIMILAR edges elsewhere in this notebook) and
# storing the outcome under the "cluster" property name.
# NOTE(review): the function is imported from the `aio` package; if it is a
# coroutine function this call needs `await` to actually run -- confirm.
from katana.remote.aio.analytics import louvain_clustering
louvain_clustering(graph, result_property_name="cluster", edge_weight_property_name="score")
# Statistics reporting is currently disabled:
#stats = LouvainClusteringStatistics(graph, "value", "output")

#print(stats.n_clusters)
#print(stats.n_non_trivial_clusters)
#print(stats.largest_cluster_size)

In [None]:
# Count the distinct values of the "cluster" property across Account nodes.
# Grouping by n.cluster yields one row per cluster; count(*) counts those rows.
# The collect() expression is aliased because openCypher requires every
# non-variable expression in a WITH clause to carry an alias.
result = graph.query("""
match (n:Account)
with distinct n.cluster as cluster, collect(n.id) as members
return count(*)
""")
print(result[0:2])

In [None]:
# Go to local for jaccard
import katana.local
from katana.example_data import get_input
from katana.local import Graph

katana.local.initialize()

# Open the graph created earlier from its storage location, which is the
# bucket prefix below followed by the graph id.
rdg_prefix = "gs://katana-internal14/graph/"
input_rdg_path = rdg_prefix + graph_id
property_graph = Graph(input_rdg_path)

print(property_graph.num_nodes())
# Last expression of the cell: show the first entry of the "first_name"
# node property as a quick check of the load.
property_graph.get_node_property("first_name")[0]

In [None]:
from katana.local.analytics import JaccardStatistics
from katana.local.analytics import jaccard

# Node that the Jaccard cells of this notebook compare against.
compare_node = 19
# Property name handed to the Jaccard calls.
# NOTE(review): spelled "simiar" (sic); renaming it means updating every
# place that reads this property by name.
property_name = "simiar_to_19"

In [None]:
# Compute Jaccard similarity against `compare_node` and store it in the
# `property_name` node property, then summarise it. The jaccard() call must
# run first: no other cell in this notebook creates the property that
# JaccardStatistics is asked to summarise.
# NOTE(review): re-running this cell on the same in-memory graph may fail if
# the property already exists -- reload `property_graph` first in that case.
jaccard(property_graph, compare_node, property_name)
stats = JaccardStatistics(property_graph, compare_node, property_name)

print("Max Similarity:", stats.max_similarity)
print("Min Similarity:", stats.min_similarity)
print("Average Similarity:", stats.average_similarity)

In [None]:
# Show the similarity values. Read the property through `property_name`
# rather than repeating the literal, so the name is defined in one place.
property_graph.get_node_property(property_name)