# LangChain with Markdown

In [None]:
!pip install langchain

In [None]:
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [None]:
# Load the single rulebook markdown file with UnstructuredMarkdownLoader.
rulebook_path = '/data-transfer/iihf/rulebook.md'
loader = UnstructuredMarkdownLoader(rulebook_path)
documents = loader.load()


In [132]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

In [133]:
# Pick up every markdown file below the IIHF data directory, read via TextLoader.
loader = DirectoryLoader(
    '/data-transfer/iihf',
    glob="**/*.md",
    loader_cls=TextLoader,
)

In [134]:
# Run the loader and keep its result; later cells index into it (documents[0]).
documents = loader.load()


In [135]:
# One (markdown marker, metadata key) pair per heading level 1-4:
# ("#", "header1") ... ("####", "header4").
headers_to_split_on = [("#" * level, f"header{level}") for level in range(1, 5)]

In [136]:
# Split the first loaded document on the configured markdown headers
# (strip_headers=True) and display the first resulting chunk.
# Note: only documents[0] is processed here.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=True,
)
first_document_text = documents[0].page_content
md_header_splits = markdown_splitter.split_text(first_document_text)
md_header_splits[0]

Document(page_content='No matter where ice hockey is played, the object of the game is the same – to put the puck into the opponent’s goal. Beyond that, ice hockey across the globe is subject to certain variations. This makes the rules of the game extremely important. These rules must be followed all times, in all countries, in all age categories, for the game to be enjoyed by everyone.  \nHockey’s speed is one of the qualities that makes it so exciting. But this skill and excitement must be balanced with fair play and respect.  \nIt is, therefore, important to make a clear separation between the purpose of all the elements of the game and to use these respectfully. These distinctions can be taught at an early age or whenever one begins to show interest in the game. And this is why hockey development begins with parents and coaches, those people most influential in guiding a person, old or young, into playing the game properly and within the rules.  \nThe IIHF Championship program enco

In [None]:
# Imported in this cell so it also runs on a fresh kernel: the notebook's only
# other import of this class sits in a later cell.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter with a separator list extended by zero-width and CJK
# punctuation. No chunk_size / chunk_overlap are passed, so the library
# defaults apply.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
)

In [138]:
# Char-level splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

chunk_size = 250
chunk_overlap = 30
# NOTE: the two values above are referenced only by the disabled splitter
# below; the `text_splitter` used for the split is defined in an earlier cell.
##text_splitter = RecursiveCharacterTextSplitter(
##    chunk_size=chunk_size, chunk_overlap=chunk_overlap
##)

# Split the header-level sections further, then print each metadata value of
# the seventh resulting chunk as a spot check.
splits = text_splitter.split_documents(md_header_splits)
sample_metadata = splits[6].metadata
for header_key in sample_metadata:
    print(sample_metadata[header_key])

IIHF Official Rulebook 2023/24
PLAYING AREA
RULE 1 RINK
1.6 DIVISION OF ICE SURFACE


In [139]:
import ollama
from ollama import Client
# Ollama client bound to the server at 192.168.1.102:11434.
client = Client(host='http://192.168.1.102:11434')


In [83]:
from langchain_community.embeddings import OllamaEmbeddings


## Create a LangChain Neo4jVector instance for similarity search with cosine similarity

In [140]:
from langchain.vectorstores import Neo4jVector
from langchain_community.embeddings import OllamaEmbeddings

import os

# Neo4j connection settings. Each value can be overridden through an
# environment variable of the same name; the literals are fallbacks that keep
# the current setup working unchanged.
# NOTE(review): drop the password fallback once NEO4J_PASSWORD is provided by
# the environment - credentials should not live in the notebook.
NEO4J_URL = os.environ.get("NEO4J_URL", "bolt://neo4j-1:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "abc123abc123")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "")

# Embed every split with the Ollama-served model and store the results in
# Neo4j as nodes labelled "Chunk".
neo4j_vector = Neo4jVector.from_documents(
    splits,
    embedding=OllamaEmbeddings(
        base_url="http://192.168.1.102:11434", model="mxbai-embed-large", temperature=0
    ),
    url=NEO4J_URL,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    node_label="Chunk",
)



In [112]:
from neo4j import GraphDatabase
import ollama

import os

# Neo4j connection settings. Each value can be overridden through an
# environment variable of the same name; the literals are fallbacks that keep
# the current setup working unchanged.
# For a local instance, set NEO4J_URL to "bolt://localhost:7687".
# NOTE(review): drop the password fallback once NEO4J_PASSWORD is provided by
# the environment - credentials should not live in the notebook.
NEO4J_URL = os.environ.get("NEO4J_URL", "bolt://neo4j-1:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "abc123abc123")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "")

# Name of the embedding model requested from the Ollama server.
EMBEDDING_MODEL = "mxbai-embed-large"

def get_embedding(client, text, model):
    """Return the embedding for ``text`` as produced by ``client``.

    Args:
        client: Object exposing ``embeddings(prompt=..., model=...)``.
        text: The text to embed.
        model: Name of the embedding model to request.

    Returns:
        The value stored under the ``"embedding"`` key of the response.
    """
    reply = client.embeddings(model=model, prompt=text)
    return reply["embedding"]

def LoadEmbedding(label, property):
    """Create an ``Embedding`` node for each ``label`` node that has ``property``.

    For every matching node the property text is embedded through the Ollama
    server, a new ``Embedding`` node holding the vector is created, and the
    source node is linked to it with a ``HAS_EMBEDDING`` relationship.

    Args:
        label: Node label to scan. It is interpolated into the Cypher text, so
            it must be a trusted identifier.
        property: Name of the node property holding the text to embed. It is
            also interpolated into the Cypher text and stored as the
            ``Embedding`` node's ``key``.

    Returns:
        The number of nodes for which an embedding was created.

    Note:
        The write uses ``CREATE``, so calling this again for the same label and
        property adds a second set of ``Embedding`` nodes.
    """
    ollama_client = Client(host='http://192.168.1.102:11434')

    # key property of Embedding node differentiates different embeddings
    cypher = (
        "CREATE (e:Embedding) SET e.key=$key, e.value=$embedding, e.model=$model"
        " WITH e MATCH (n) WHERE id(n) = $id CREATE (n) -[:HAS_EMBEDDING]-> (e)"
    )

    count = 0
    # Driver and session are both context-managed so their connections are
    # released even if an embedding call or a query raises.
    with GraphDatabase.driver(
        NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD), database=NEO4J_DATABASE
    ) as driver:
        with driver.session() as session:
            # Fetch node ids with the text to embed, and read the whole result
            # before issuing any write on the same session.
            result = session.run(f"MATCH (ch:{label}) RETURN id(ch) AS id, ch.{property} AS text")
            rows = [(record["id"], record["text"]) for record in result]

            for node_id, text in rows:
                if text is None:
                    # Node lacks the requested property: nothing to embed.
                    continue

                # Call the Ollama embedding API for this node's text.
                # For better performance, text can be batched.
                embedding = get_embedding(ollama_client, text, EMBEDDING_MODEL)

                # consume() waits for the write to finish so a failure surfaces
                # on the node that caused it.
                session.run(
                    cypher,
                    key=property,
                    embedding=embedding,
                    id=node_id,
                    model=EMBEDDING_MODEL,
                ).consume()
                count += 1

    print("Processed " + str(count) + " " + label + " nodes for property @" + property + ".")
    return count

In [113]:
# For smaller amount (<2000) of text data to embed
# Embeds the `text` property of every Chunk node; the value displayed by this
# cell is the count returned by LoadEmbedding.
LoadEmbedding("Chunk", "text")

Processed 17 Chunk nodes for property @text.


17

In [141]:
query = "when is a minor penalty ruled"

# Fetch the three best-matching chunks together with their scores and print
# each one between two divider lines.
docs_with_score = neo4j_vector.similarity_search_with_score(query, k=3)
divider = "-" * 80
for doc, score in docs_with_score:
    print(divider)
    print("Score: ", score)
    print(doc.page_content)
    print(divider)


--------------------------------------------------------------------------------
Score:  0.818839430809021
For a Minor Penalty, any Player, other than a Goalkeeper, shall be ruled off the ice for two (2) minutes during which time no substitute shall be permitted.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.7946059703826904
If the penalty to be imposed is a Minor Penalty and a goal is scored on the play by the non-offending side, the Minor Penalty shall not be imposed but Major Penalties shall be imposed in the normal manner regardless of whether or not a goal is scored.  
If two (2) or more Minor Penalties were to be imposed and a goal is scored on the play by the non-offending side, the Captain of the offending Team shall designate to the Referee which Minor Penalty(ies) will be assessed and which Minor Penalty will be washed out as a result of the scoring of

In [142]:
# List the constraints currently defined in the database.
neo4j_vector.query("SHOW CONSTRAINTS")

[{'id': 5,
  'name': 'constraint_1dc138a',
  'type': 'UNIQUENESS',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['id'],
  'ownedIndex': 'constraint_1dc138a',
  'propertyType': None}]

In [143]:
# List only the vector indexes, with the columns needed to check what each one
# covers and how it is configured.
neo4j_vector.query(
    """SHOW INDEXES
       YIELD name, type, labelsOrTypes, properties, options
       WHERE type = 'VECTOR'
    """
)

[{'name': 'vector',
  'type': 'VECTOR',
  'labelsOrTypes': ['Chunk'],
  'properties': ['embedding'],
  'options': {'indexProvider': 'vector-2.0',
   'indexConfig': {'vector.dimensions': 1024,
    'vector.similarity_function': 'cosine'}}}]

In [144]:
# Print the node label and the embedding property name the vector store uses.
for attribute in ("node_label", "embedding_node_property"):
    print(getattr(neo4j_vector, attribute))

Chunk
embedding


In [148]:
import os

# Neo4j connection settings. Each value can be overridden through an
# environment variable of the same name; the literals are fallbacks that keep
# the current setup working unchanged.
# For a local instance, set NEO4J_URL to "bolt://localhost:7687".
# NOTE(review): drop the password fallback once NEO4J_PASSWORD is provided by
# the environment - credentials should not live in the notebook.
NEO4J_URL = os.environ.get("NEO4J_URL", "bolt://neo4j-1:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "abc123abc123")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "")


# Parent retriever
# For the nodes matched by the vector search, follow HAS_CHILD back to the
# parent, keep the best score per parent and return a single parent's text.
parent_query = """
MATCH (node)<-[:HAS_CHILD]-(parent)
WITH parent, max(score) AS score // deduplicate parents
RETURN parent.text AS text, score, {} AS metadata LIMIT 1
"""

# NOTE(review): no cell visible in this notebook creates an index named
# "parent_document" or any HAS_CHILD relationship - confirm both exist in the
# target database before relying on this store.
parent_vectorstore = Neo4jVector.from_existing_index(
    embedding=OllamaEmbeddings(
        base_url="http://192.168.1.102:11434", model="mxbai-embed-large", temperature=0
    ),
    url=NEO4J_URL,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    index_name="parent_document",
    retrieval_query=parent_query,
)