# Module 3 - Apply Graph Analytics

This module has the following objectives:
1. Deduplicate definitions that are very similar
2. Use GDS to find useful patterns in the Graph

Import our usual suspects (and some more...)

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
import matplotlib.pyplot as plt
import seaborn as sns

# Setup

Load env variables

In [None]:
# Name of the dotenv file that is checked for and loaded in the next cell.
env_file = "ws.env"

In [None]:
# Load the dotenv file when it is present. When it is missing we still read
# the process environment, so the connection variables are always defined
# (previously a missing file left HOST/USERNAME/PASSWORD/DATABASE unassigned
# and the next cell failed with a NameError).
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# Fail early with an actionable message instead of an obscure driver error.
if not HOST:
    raise RuntimeError(
        f"NEO4J_URI is not set: provide it in {env_file} or in the environment."
    )

Setup connection to the database with the [Python Driver](https://neo4j.com/docs/python-manual/5/).

In [None]:
# Driver used by the query cells of this notebook; it connects to HOST
# with the USERNAME / PASSWORD pair as authentication.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

In case we want to split large files

In [None]:
def split_dataframe(df, chunk_size = 50_000):
    """Split ``df`` into consecutive row-wise chunks of at most ``chunk_size`` rows.

    Args:
        df: The DataFrame to split; rows are taken by position.
        chunk_size: Maximum number of rows per chunk. Must be positive.

    Returns:
        A list of DataFrame slices in the original row order. An empty
        ``df`` yields an empty list, and no empty trailing chunk is produced
        when ``len(df)`` is an exact multiple of ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Stepping by chunk_size visits exactly the start offsets that hold rows.
    return [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

Test the connection

In [None]:
# Smoke test: count all nodes to confirm that the connection works.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

## Extracted Definitions

In Notebook 1, we saw that some definitions are **very similar** to each other or are **occurring in many documents and chunks**. Let's analyse this and see if we can find solutions to deal with these points!

### Reminder notebook 1: what definitions are mentioned most frequently within chunks?

In [None]:
# Per definition: number of distinct chunks and documents that mention it,
# limited to the 25 definitions with the highest chunk count.
definition_count_df = driver.execute_query(
    """
    MATCH (doc:Document)<-[:PART_OF]-(chunk:Chunk)-[:MENTIONS]->(def:Definition)
    WITH DISTINCT def, COUNT(DISTINCT chunk) AS chunk_count, COUNT(DISTINCT doc) AS document_count
    RETURN def.term AS definition, def.description AS description, chunk_count, document_count ORDER BY chunk_count DESC LIMIT 25
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# Display the definition counts (the query above limits this to 25 rows).
definition_count_df

## Deduplicate definitions

As you can see, there are many similar/duplicate definitions that we would ideally merge together. That's what we'll try in this section!

Similar definitions based on [Levenshtein](https://neo4j.com/labs/apoc/4.1/overview/apoc.text/apoc.text.distance/) distance:

In [None]:
# Pair up Definition nodes whose lower-cased terms have an edit distance
# below 3; the elementId comparison keeps each pair only once.
levenshtein_definitions_df = driver.execute_query(
    """
    MATCH (a:Definition), (b:Definition)
    WHERE elementId(a) < elementId(b) AND apoc.text.distance(toLower(a.term), toLower(b.term)) < 3
    RETURN a.term, b.term
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# Preview the first 25 term pairs with a small edit distance.
levenshtein_definitions_df.head(25)

### Use embeddings

Levenshtein distance is not always ideal, as completely different terms can sometimes have a small edit distance.

Let's see if embeddings do a better job!

In [None]:
# List the indexes of the database; the vector index queried in the next
# step is expected to show up here.
schema_result_df = driver.execute_query(
    'SHOW INDEXES',
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)
schema_result_df.head()

Return similar definitions based on embeddings distance:

In [None]:
# For every definition, fetch its 10 nearest neighbours from the vector index
# and return each candidate pair once, together with the sorted, de-duplicated
# list of files each side occurs in. The two file lists are compared in pandas
# afterwards.
#
# The documents of both sides are collected independently. Reusing one `chunk`
# variable in both MATCH clauses (as before) restricted the result to
# definitions mentioned in the very same chunk, which made `source_files` and
# `target_files` trivially identical.
similar_definitions_df = driver.execute_query(
    """
    MATCH (n:Definition)
    CALL db.index.vector.queryNodes('definition-embeddings', 10, n.embedding)
    YIELD node AS similar, score
    WHERE elementId(n) < elementId(similar)

    // Documents in which the source definition is mentioned
    MATCH (n)<-[:MENTIONS]-(:Chunk)-[:PART_OF]->(source_doc:Document)
    WITH n, similar, score, apoc.coll.sort(COLLECT(DISTINCT source_doc.file_name)) AS source_files

    // Documents in which the candidate duplicate is mentioned
    MATCH (similar)<-[:MENTIONS]-(:Chunk)-[:PART_OF]->(target_doc:Document)
    RETURN n.term AS source, similar.term AS target, source_files, apoc.coll.sort(COLLECT(DISTINCT target_doc.file_name)) AS target_files, score
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_=Result.to_df,
)

In [None]:
# Display the candidate pairs returned by the vector-index query.
similar_definitions_df

In [None]:
# Preview the first 10 pairs with a score above 0.97 whose file lists are equal.
similar_definitions_df.loc[(similar_definitions_df["score"] > 0.97) & (similar_definitions_df["source_files"] == similar_definitions_df["target_files"])].head(10)

In [None]:
# Keep the pairs with a score above 0.97 whose file lists are equal;
# these are the merge candidates. The cell displays how many there are.
similar_terms = similar_definitions_df.loc[
    (similar_definitions_df["score"] > 0.97)
    & (similar_definitions_df["source_files"] == similar_definitions_df["target_files"])
]
len(similar_terms)

Number of relations between `Chunk` and `Definition` before deduplicating definitions

In [None]:
# Count all MENTIONS relationships (baseline before merging).
driver.execute_query(
    """
        MATCH p=()-[r:MENTIONS]->() RETURN COUNT(r) as count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

Number of `Definitions` before deduplicating definitions

In [None]:
# Count all Definition nodes (baseline before merging).
driver.execute_query(
    """
        MATCH (n:Definition) RETURN COUNT(n) as count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

**Merge nodes with similar embeddings**

In [None]:
def _group_similar_terms(pairs):
    """Group terms linked by similarity pairs into connected components.

    Args:
        pairs: Iterable of ``(source, target)`` term tuples.

    Returns:
        A list of term lists. Terms that are linked directly or through a
        chain of pairs end up in the same list.
    """
    parent = {}

    def find(term):
        parent.setdefault(term, term)
        while parent[term] != term:
            parent[term] = parent[parent[term]]  # path halving keeps lookups short
            term = parent[term]
        return term

    for source, target in pairs:
        parent[find(target)] = find(source)

    groups = {}
    for term in list(parent):
        groups.setdefault(find(term), []).append(term)
    return list(groups.values())


# Merge every group of similar definitions in a single call. Merging pair by
# pair is order dependent: with `properties: "overwrite"` the surviving node
# takes over the target's term, so a later pair that still refers to the old
# source term matches nothing and chains such as (A, B), (A, C) are only
# partially merged.
for terms in _group_similar_terms(zip(similar_terms['source'], similar_terms['target'])):
    driver.execute_query(
        """
        MATCH (d:Definition) WHERE d.term IN $terms
        WITH collect(d) AS nodes
        WHERE size(nodes) > 1
        CALL apoc.refactor.mergeNodes(nodes, {
            properties: "overwrite",
            mergeRels: true
        })
        YIELD node
        RETURN node
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        terms=terms,
    )

Number of relations between `Chunk` and `Definition` after deduplicating definitions

In [None]:
# Count all MENTIONS relationships again, now that the merge has run.
driver.execute_query(
    """
        MATCH p=()-[r:MENTIONS]->() RETURN COUNT(r) as count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

Number of `Definitions` after deduplicating definitions

In [None]:
# Count all Definition nodes again, now that the merge has run.
driver.execute_query(
    """
        MATCH (n:Definition) RETURN COUNT(n) as count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

Look once again at the most occurring definitions

In [None]:
# Recompute the 25 most frequently mentioned definitions (distinct chunk and
# document counts per definition) on the current state of the graph.
definition_count_df = driver.execute_query(
    """
    MATCH (doc:Document)<-[:PART_OF]-(chunk:Chunk)-[:MENTIONS]->(def:Definition)
    WITH DISTINCT def, COUNT(DISTINCT chunk) AS chunk_count, COUNT(DISTINCT doc) AS document_count
    RETURN def.term AS definition, def.description AS description, chunk_count, document_count ORDER BY chunk_count DESC LIMIT 25
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# Display the refreshed definition counts (at most 25 rows).
definition_count_df

---

### Analyse the Definition graph

How many definitions did we extract on average from Chunks?

In [None]:
# Per document: average, minimum and maximum number of definitions mentioned
# per chunk. Only chunks matched by the MENTIONS pattern are counted.
definitions_per_chunk_df = driver.execute_query(
    """
    MATCH (doc:Document)<-[:PART_OF]-(chunk:Chunk)-[:MENTIONS]->(def:Definition)
    WITH DISTINCT doc, chunk, COUNT(def) AS definition_count
    RETURN doc.file_name AS file_name, AVG(definition_count) AS avg_definitions_per_chunk_count, MIN(definition_count) AS min_definitions_per_chunk_count, MAX(definition_count) AS max_definitions_per_chunk_count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# Display the per-document definition statistics.
definitions_per_chunk_df

How many shared definitions do documents have? 

In [None]:
# Document whose definitions are compared against all other documents.
doc_name = "Payment and Online Services Terms Sept 2022.pdf"

# For every other document, count the distinct definitions it shares with
# `doc_name`, ordered from most to least shared.
shared_definitions_df = driver.execute_query(
    """
    MATCH (doc1:Document {file_name: $doc_name})<-[:PART_OF]-(:Chunk)-[:MENTIONS]->(def:Definition)<-[:MENTIONS]-(:Chunk)-[:PART_OF]->(doc2:Document)
    WHERE doc1 <> doc2
    WITH DISTINCT doc1, doc2, COLLECT(DISTINCT def) AS definitions
    RETURN doc1.file_name AS file_name_1, doc2.file_name AS file_name_2, SIZE(definitions) AS shared_definitions_count ORDER BY shared_definitions_count DESC
    """,
    doc_name=doc_name,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# Display the shared-definition counts per document pair.
shared_definitions_df

### Chunk Similarity

Definitions can be informative about the content of chunks and can for example be used to see which chunks are **similar**

We can define the similarity of chunks based on **overlapping definitions**. 

First, we start by setting the degree property on `Definition` nodes. Definitions occurring in many chunks are less informative, so we can use this property to filter those out!

In [None]:
# Store on every Definition the number of chunks that mention it
# (property `degree`); the cell shows how many definitions were updated.
driver.execute_query(
    """
    MATCH (chunk:Chunk)-[:MENTIONS]->(def:Definition)
    WITH DISTINCT def, COUNT(chunk) AS chunk_count
    SET def.degree = chunk_count
    RETURN COUNT(*) AS rows_processed
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

Can we find similar chunks?

In [None]:
# Definitions with a degree above this value are ignored when comparing chunks.
threshold = 30

# Pairs of chunks that share more than one definition (after the degree
# filter), with the shared terms and their count, most overlap first.
# NOTE(review): pairs are de-duplicated by comparing the nodes directly
# (`chunk_1 > chunk_2`), whereas other cells compare `elementId(...)`;
# confirm that this is intended.
similar_chunks_df = driver.execute_query(
    """
    MATCH (chunk_1:Chunk)-[:MENTIONS]->(def:Definition)<-[:MENTIONS]-(chunk_2:Chunk)
    WHERE chunk_1 > chunk_2 AND def.degree <= $threshold
    WITH DISTINCT chunk_1, chunk_2, COLLECT(DISTINCT def.term) AS definitions, COUNT(DISTINCT def) AS definition_count
    WHERE definition_count > 1
    RETURN chunk_1.id AS chunk_1_id ,chunk_1.chunk_eng AS chunk_1, chunk_2.id AS chunk_2_id, chunk_2.chunk_eng AS chunk_2, definitions, definition_count ORDER BY definition_count DESC

    """,
    threshold=threshold,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# Number of chunk pairs that share more than one definition.
len(similar_chunks_df)

In [None]:
# Display the similar chunk pairs with their shared definitions.
similar_chunks_df

We can load the overlapping definition count into the database as a **new relationship**

In [None]:
# Write one OVERLAPPING_DEFINITIONS relationship per similar chunk pair,
# in batches, storing the number of shared definitions as `overlap`.
for rows_df in split_dataframe(similar_chunks_df):
    if rows_df.empty:
        continue  # nothing to send for an empty batch
    driver.execute_query(
        """
        UNWIND $rows AS row
        // MATCH (not MERGE): the chunks already exist and must never be created here
        MATCH (p1:Chunk {id: row.chunk_1_id})
        MATCH (p2:Chunk {id: row.chunk_2_id})
        MERGE (p1)-[s:OVERLAPPING_DEFINITIONS]->(p2)
        SET s.overlap = row.definition_count
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        # Send only the columns the query reads, not the full chunk texts.
        rows=rows_df[["chunk_1_id", "chunk_2_id", "definition_count"]].to_dict('records'),
    )

Take a minute to explore the `OVERLAPPING_DEFINITIONS` network in the database

- `MATCH p=()-[:OVERLAPPING_DEFINITIONS]->() RETURN p LIMIT 50`

- `MATCH p=()-[r:OVERLAPPING_DEFINITIONS]->() WHERE r.overlap >= 5 RETURN p LIMIT 50`

- `MATCH p=()-[r:OVERLAPPING_DEFINITIONS]->() WHERE r.overlap >= 10 RETURN p LIMIT 50`

### Communities

Let's run some Graph Data Science based on Chunks and Definitions. Let's first setup the [Graph Data Science Client](https://neo4j.com/docs/graph-data-science-client/current/).

In [None]:
# Build the GDS client on top of the existing driver connection.
gds = GraphDataScience.from_neo4j_driver(driver=driver)
# Point the client at the same database as the queries above.
gds.set_database(DATABASE)
# Last expression of the cell: shows the version reported by the client.
gds.version()

Let's investigate the Chunks that are similar in the graph (based on the definitions they share). For that we first need to create a [Graph object](https://neo4j.com/docs/graph-data-science-client/current/graph-object/)

In [None]:
# Name passed to gds.graph.project for the projection.
graph_name = "chunk_similarity_projection"
# Only Chunk nodes are projected.
node_projection = ["Chunk"]
# OVERLAPPING_DEFINITIONS is projected as undirected, carrying the
# `overlap` property.
rel_projection = {
    "OVERLAPPING_DEFINITIONS": {
        "orientation": "UNDIRECTED",
        "properties": "overlap",
    },
}

In [None]:
# Drop a projection left over from an earlier (possibly interrupted) run;
# projecting under a name that already exists in the catalog would fail.
if gds.graph.exists(graph_name)["exists"]:
    gds.graph.get(graph_name).drop()

G, res = gds.graph.project(graph_name, node_projection, rel_projection)

In [None]:
# Show the result of gds.graph.list() to verify the projection.
gds.graph.list()

Run the [Leiden Algorithm](https://neo4j.com/docs/graph-data-science/current/algorithms/leiden/) for Community Detection

In [None]:
# Run Leiden on the projection, weighted by `overlap`, and write the result
# to the `leiden_community` property.
# NOTE(review): the fixed randomSeed with concurrency=1 is presumably there
# for reproducible results — confirm against the GDS documentation.
gds.leiden.write(
    G,
    relationshipWeightProperty='overlap',
    writeProperty='leiden_community',
    gamma=3,
    theta=0.0001,
    maxLevels=100,
    randomSeed=42,
    concurrency=1,
)

In [None]:
# Number of chunks per Leiden community.
communities_df = driver.execute_query(
    """
    MATCH (c:Chunk)
    RETURN c.leiden_community AS community, COUNT(*) as member_count
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# The 10 communities with the most members.
communities_df.sort_values(by='member_count', ascending=False).head(10)

In [None]:
# Number of communities that contain more than one chunk.
len(communities_df[communities_df['member_count'] > 1])

Check communities based on Chunks with high overlap

In [None]:
# Chunk pairs with an overlap above 5, together with the community of each
# side, highest overlap first.
community_check_df = driver.execute_query(
    """
    MATCH (c1:Chunk)-[s:OVERLAPPING_DEFINITIONS]->(c2:Chunk)
    WHERE s.overlap > 5
    RETURN s.overlap AS overlap, c1.chunk_eng AS chunk_1, c1.leiden_community AS community_1, c2.chunk_eng AS chunk_2, c2.leiden_community AS community_2 ORDER BY s.overlap DESC
    """,
    result_transformer_=Result.to_df,
    routing_=RoutingControl.READ,
    database_=DATABASE,
)

In [None]:
# Preview the 10 pairs with the highest overlap.
community_check_df.head(10)

Check some high definition occurrences in the communities

In [None]:
# Per community, the definitions mentioned more than 7 times by its chunks.
# The degree filter reuses `threshold` (defined with the chunk-similarity
# query) instead of repeating the literal, so both filters stay in sync.
communities_definitions_df = gds.run_cypher(
    '''
    MATCH (c:Chunk)-[:MENTIONS]->(def:Definition) WHERE (c.leiden_community) IS NOT NULL AND def.degree <= $threshold
    WITH c.leiden_community AS leiden_community, def.term as definition, count(*) as cnt
    WHERE cnt > 7
    RETURN *
    ORDER BY leiden_community, cnt DESC
    ''',
    params={"threshold": threshold},
)


In [None]:
# Preview the first 15 (community, definition, count) rows.
communities_definitions_df.head(15)

## Plot Communities with their Definition count

In [None]:


# Mention counts per (community, definition) used for the heatmap below.
# The degree filter reuses `threshold` (defined with the chunk-similarity
# query) instead of repeating the literal, so both filters stay in sync.
df = gds.run_cypher(
    """
    MATCH (c:Chunk)-[:MENTIONS]->(def:Definition) WHERE (c.leiden_community) IS NOT NULL AND def.degree <= $threshold
    WITH c.leiden_community AS leiden_community, def.term as definition, count(*) as cnt
    WHERE cnt > 7
    RETURN leiden_community, definition, cnt
    """,
    params={"threshold": threshold},
)

In [None]:
# Rows are definitions, columns are communities, cells hold the mention
# count (0 where a definition does not occur in a community).
pivot_table = (
    df.pivot(index="definition", columns="leiden_community", values="cnt")
    .fillna(0)
    .sort_index()
)

sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(pivot_table, cmap="Blues", yticklabels=True, linewidths=0.5, ax=ax)
ax.set_xlabel("Community")
ax.set_ylabel("Definition")
ax.set_title("Definition Distribution Heatmap per Community")
fig.tight_layout()
plt.show()

In [None]:
# Drop the graph projection referenced by G now that the analysis is done.
G.drop()