In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific `google.colab` package).
# Later cells read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Extract commit history from Git

In [None]:
# Repository to analyse and the credential passed to ExtractDataFromGit in the next cell.
repo_owner="nwiizo"
repo_name="tfmcp"
# Placeholder value: supply a GitHub token at run time and avoid saving a real one in the notebook.
access_token="<ACCESS_TOKEN>"


In [None]:
import json
import requests
class ExtractDataFromGit:
    """Fetch the commit history of a GitHub repository through the REST API.

    :param repo_owner: owner (user or organisation) segment of the repository URL
    :param repo_name: repository name segment of the repository URL
    :param access_token: token sent as ``Authorization: Bearer <token>``
    """

    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, repo_owner, repo_name, access_token):
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.access_token = access_token

    def _get(self, url, headers):
        """GET ``url`` and return the response, raising on an HTTP error status."""
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Without this check an error payload (a dict such as {"message": ...})
        # would be iterated as if it were the commit list and fail later with a
        # confusing TypeError on commit['sha'].
        response.raise_for_status()
        return response

    def fetchData(self, output_path='/content/drive/MyDrive/git_llm/commitsAll.json', max_commits=None):
        """Fetch every commit of the repository, with per-commit details, and save them as JSON.

        The commit list endpoint is read page by page (following the ``next``
        link of each response), then the detail endpoint is called once per
        commit. The detailed records are written to ``output_path`` and returned.

        :param output_path: file the JSON result is written to; missing parent
            directories are created
        :param max_commits: optional cap on the number of commits fetched
            (most recent first, in the order the API lists them); ``None`` means no cap
        :return: list with one detailed commit record (parsed JSON) per commit
        :raises requests.HTTPError: if any request returns an error status
        """
        import os  # local import: only needed here to create the output directory

        getAllCommitsEndpoint = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/commits"
        getEachCommitEndpoint = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/commits/"
        print(getAllCommitsEndpoint)
        headers = {"Authorization": f"Bearer {self.access_token}"}

        # Walk the paginated commit list; a single request only returns one page.
        allCommits = []
        pageUrl = getAllCommitsEndpoint
        while pageUrl and (max_commits is None or len(allCommits) < max_commits):
            pageResponse = self._get(pageUrl, headers)
            allCommits.extend(pageResponse.json())
            pageUrl = pageResponse.links.get("next", {}).get("url")
        if max_commits is not None:
            allCommits = allCommits[:max_commits]
        print(len(allCommits))

        # The list endpoint omits file/stat details, so each commit is fetched individually.
        allCommitsResult = [
            self._get(getEachCommitEndpoint + commit['sha'], headers).json()
            for commit in allCommits
        ]

        outputDir = os.path.dirname(output_path)
        if outputDir:
            os.makedirs(outputDir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as file:
            json.dump(allCommitsResult, file, ensure_ascii=False, indent=4)
        return allCommitsResult

# Run the extraction for the repository configured in the cell above; fetchData()
# also writes the result to the JSON file that the summarisation cell loads.
if __name__ == "__main__":
    extractData = ExtractDataFromGit(repo_name=repo_name, repo_owner=repo_owner, access_token=access_token)
    extractData.fetchData()

https://api.github.com/repos/nwiizo/tfmcp/commits
16


# Convert JSON to Unstructured text
* Now that the Git commit data has been extracted, we convert the JSON into unstructured text before passing it on, so that the LLM can understand and summarize the information.

### Install Required Libraries

In [None]:
!pip install langchain-experimental langchain-community langchain networkx langchain-google-genai langchain-core json-repair



In [None]:
# Configure Gemini API Key
# The key is read from the GEMINI_API_KEY environment variable, falling back to an
# interactive prompt, so that no secret is stored in the notebook itself.
# Any key that was previously saved in this cell should be treated as leaked and revoked.
import os
from getpass import getpass

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")

In [None]:
import os
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_google_genai import GoogleGenerativeAI
import networkx as nx
from langchain.chains import GraphQAChain, GraphCypherQAChain
from langchain_core.documents import Document
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
import json


# Create GenerativeAI instance with gemini 2.0 flash model and API_KEY
llm = GoogleGenerativeAI(model='gemini-2.0-flash', google_api_key=GEMINI_API_KEY)

# Prompt asking the LLM to turn one commit's JSON into a readable paragraph;
# the trailing `{}` is filled with that commit's JSON.
prompt = "Convert the given json object into unstructured, readable paragraph. Output the final paragraph. It should have information like commit id, message, committed date and time, author, committer, files stats like insertion, deletion, changes in json: {}"

# Upper bound on LLM calls per run: sending more than 14 requests resulted in
# "resource exhausted" errors from the API.
MAX_COMMITS_TO_SUMMARIZE = 14

with open('/content/drive/MyDrive/git_llm/commitsAll.json', 'r', encoding='utf-8') as file:
    commits = json.load(file)

# One paragraph per commit. The commit is serialised with json.dumps so the model
# receives real JSON (the prompt says "json") instead of a Python dict repr, and
# llm.invoke() replaces the deprecated direct call llm(...).
summaries = []
for commit in commits[:MAX_COMMITS_TO_SUMMARIZE]:
    commit_json = json.dumps(commit, ensure_ascii=False)
    summaries.append(llm.invoke(prompt.format(commit_json)))

# Same layout as before: every paragraph is preceded by a newline.
text = "".join("\n" + summary for summary in summaries)

print(text)




The commit with ID b0fcc5c32f4bf0a71f373dd18891ce9faf36f09b was made on March 9, 2025, at 01:23:19 UTC. The author and committer were both nwiizo (syu.m.5151@gmail.com). The commit message was "Update latest release version to v0.1.1". There was one file modified: README.md, with 1 insertion and 1 deletion, resulting in a total of 2 changes.
The commit with ID `6581651b970375c4929dce18dcc6edc33268baf1` has the message "Fix unnecessary whitespace in contract_tilde function". It was committed on March 8, 2025, at 23:10:23 UTC. The author and committer are both `nwiizo` with the email `syu.m.5151@gmail.com`. The commit includes changes to one file, `src/shared/utils/path.rs`, which was modified. The file had a total of 4 changes, with 2 insertions and 2 deletions.
The commit with ID `b7acff2dfa42b8e5c724246fd897bf12bccf26a5` was authored and committed by nwiizo (syu.m.5151@gmail.com) on 2025-03-08 at 17:13:02 UTC. The commit message is "Fix path handling for cross-platform compatibility"

# Create Knowledge Graph from Unstructured Data using LLM
* `LLMGraphTransformer` takes documents containing unstructured text and produces a knowledge graph made up of nodes and relationships.

In [None]:
# Wrap the Gemini LLM in a transformer; it is used below to turn documents into graph documents.
llm_transformer = LLMGraphTransformer(llm=llm)

In [None]:
# Wrap the generated commit summaries in a single Document and extract a graph from it.
documents = [Document(page_content=text)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)
# Only one document was passed in, so index 0 is inspected; this raises IndexError
# if the transformer returns an empty list.
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

Nodes:[Node(id='terraform/main.tf', type='File', properties={}), Node(id='.github/workflows/rust.yml', type='File', properties={}), Node(id='Fix path handling for cross-platform compatibility', type='Message', properties={}), Node(id='15b7f370e2509e9ffeae0ec6d23bfa65b8cc733b', type='Commit', properties={}), Node(id='Fix unnecessary whitespace in contract_tilde function', type='Message', properties={}), Node(id='b566ec77aff49fc0844bb2fec5229ea69a7cf7fb', type='Commit', properties={}), Node(id='.github/images/tfmcp-demo.gif', type='File', properties={}), Node(id='6581651b970375c4929dce18dcc6edc33268baf1', type='Commit', properties={}), Node(id='example', type='Directory', properties={}), Node(id='b0fcc5c32f4bf0a71f373dd18891ce9faf36f09b', type='Commit', properties={}), Node(id='rules/general', type='Directory', properties={}), Node(id='.cursor/rules/rust.json', type='File', properties={}), Node(id='.terraform-version', type='File', properties={}), Node(id='42b0eb1b8ded679dfb851e40c27910d

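# Store the Knowledge Graph in a Graph Database (Neo4j)
* The graph documents produced above are written to a Neo4j instance so that they can be queried with Cypher.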

In [None]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.28.1


In [None]:
from langchain_community.graphs import Neo4jGraph

# provide connection details like url, username and password
# In my case I am using Neo4j Aura DB instance ( Free edition )
# "<NEO4J_URL>" and "<PASSWORD>" are placeholders that must be filled in before running;
# avoid saving real credentials in the notebook.
# NOTE(review): refresh_schema=False presumably skips schema introspection when
# connecting — confirm against the Neo4jGraph documentation.

NEO4J_GRAPH = Neo4jGraph(
    url="<NEO4J_URL>",
    username="neo4j",
    password="<PASSWORD>",
    refresh_schema=False
)

In [None]:
# store graph document in Neo4j
# Sends the graph documents extracted by the LLM transformer to the connected database.
NEO4J_GRAPH.add_graph_documents(graph_documents)

# Execute Queries on Neo4j using LLM
* Here the LLM converts a natural-language question into a Cypher query, which is then run against the graph stored in Neo4j to obtain the answer.

In [None]:
# Use GraphCypherQAChain to convert the natural language queries into Cypher query which is then queried on Neo4j to get the appropriate response
# NOTE: `cypher_chain` is reassigned in the next code cell with a custom Cypher prompt,
# so this default-prompt chain is not the one the question cells end up using.
# NOTE(review): allow_dangerous_requests=True is an explicit opt-in; LLM-generated Cypher
# is executed against the database, so presumably the credentials should be narrowly scoped — confirm.
cypher_chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=NEO4J_GRAPH,
    verbose=True,
    allow_dangerous_requests=True
)

In [None]:
from langchain.prompts import PromptTemplate

# Prompt that tells the LLM the graph schema and asks it for a Cypher query.
# `{question}` is the only placeholder. Labels are written exactly as they appear in
# the graph: the HAS_PARENT target is `Commit` (it was previously spelled `COMMIT`,
# which is a different, non-existent label because Neo4j labels are case-sensitive).
CYPHER_GENERATION_TEMPLATE = """
You are an expert Neo4j Cypher query generator specializing in Git commit history analysis.

The Neo4j graph contains Git commit data with the following schema:
- Commit nodes with properties: id, message, committed_date, authored_date
- Person nodes with properties: id
- File nodes with properties: id
- Message nodes with properties: id
File nodes are case sensitive, check properly and obtain closest match to file name when querying
- Relationships:
  - (Commit)-[:AUTHORED_BY]->(Person)
  - (Commit)-[:COMMITTED_BY]->(Person)
  - (Commit)-[:MODIFIES]->(File) with properties: changes and modification
  - (Commit)-[:ADDS]->(File) with properties: addition or introduced
  - (Commit)-[:INTRODUCES]->(File) with properties: addition or introduced
  - (Commit)-[:HAS_MESSAGE]->(Message)
  - (Commit)-[:HAS_PARENT]->(Commit)

Based on this schema and the user's question, generate a Cypher query to answer it accurately.

User question: {question}

Cypher query:
"""


# Created Custom prompt template
# `question` is the only placeholder used in CYPHER_GENERATION_TEMPLATE.
custom_prompt = PromptTemplate(
    input_variables=["question"],
    template=CYPHER_GENERATION_TEMPLATE,
)

# Create the chain with custom prompt
# This rebinds `cypher_chain`, replacing the chain built earlier without a custom prompt.
cypher_chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=NEO4J_GRAPH,
    cypher_prompt=custom_prompt,
    verbose=True,
    allow_dangerous_requests=True
)

In [None]:
# working prompt 1
question="""what is the commit id when we add build.rs file"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[:ADDS]->(f:File)
WHERE f.id = "build.rs"
RETURN c.id
[0m
Full Context:
[32;1m[1;3m[{'c.id': '64fcd1323e43a2774a0171e9800a1f648c1963e3'}][0m

[1m> Finished chain.[0m


'The commit ID when adding the build.rs file is 64fcd1323e43a2774a0171e9800a1f648c1963e3.'

In [None]:
# working prompt 2 ( not working if the case of Release.sh is changed )
question="""what is the commit id when we add Release.sh file"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[:ADDS]->(f:File {id: "Release.sh"})
RETURN c.id AS CommitId
[0m
Full Context:
[32;1m[1;3m[{'CommitId': '64fcd1323e43a2774a0171e9800a1f648c1963e3'}][0m

[1m> Finished chain.[0m


'The commit ID when adding the Release.sh file is 64fcd1323e43a2774a0171e9800a1f648c1963e3.'

In [None]:
# working prompt 3
question="""what is the commit id when file example/demo/main.tf is modified"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]





[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[:MODIFIES]->(f:File)
WHERE f.id = "example/demo/main.tf"
RETURN c.id
[0m
Full Context:
[32;1m[1;3m[{'c.id': 'be096a63bcc731c92faf691e4345bdf5da0f3f3a'}][0m

[1m> Finished chain.[0m


'The commit ID when file example/demo/main.tf is modified is be096a63bcc731c92faf691e4345bdf5da0f3f3a.'

In [None]:
# working prompt 4
question="""list of the commit id's when file README.md is added and modified"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[:ADDS|MODIFIES]->(f:File)
WHERE f.id = "README.md"
RETURN c.id AS CommitID
[0m
Full Context:
[32;1m[1;3m[{'CommitID': 'b0fcc5c32f4bf0a71f373dd18891ce9faf36f09b'}, {'CommitID': '64fcd1323e43a2774a0171e9800a1f648c1963e3'}, {'CommitID': 'be096a63bcc731c92faf691e4345bdf5da0f3f3a'}, {'CommitID': '21359e4f48b26a2b83b38f18e0f989805beef5a2'}][0m

[1m> Finished chain.[0m


'The commit IDs when file README.md is added and modified are: b0fcc5c32f4bf0a71f373dd18891ce9faf36f09b, 64fcd1323e43a2774a0171e9800a1f648c1963e3, be096a63bcc731c92faf691e4345bdf5da0f3f3a, and 21359e4f48b26a2b83b38f18e0f989805beef5a2.'

In [None]:
# working prompt 5
question="""list of the commit id's when file rules/README.md is introduced"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[:INTRODUCES]->(f:File)
WHERE f.id = "rules/README.md"
RETURN c.id
[0m
Full Context:
[32;1m[1;3m[{'c.id': 'c8a2da38edc2c0f02ff29ff07d791e4c3068d26b'}][0m

[1m> Finished chain.[0m


'c8a2da38edc2c0f02ff29ff07d791e4c3068d26b'

In [None]:
# working prompt 6
question="""list of commit id's authored by nwiizo?"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Person {id: "nwiizo"})<-[:AUTHORED_BY]-(c:Commit)
RETURN c.id AS CommitId
[0m
Full Context:
[32;1m[1;3m[{'CommitId': '15b7f370e2509e9ffeae0ec6d23bfa65b8cc733b'}, {'CommitId': 'b566ec77aff49fc0844bb2fec5229ea69a7cf7fb'}, {'CommitId': '6581651b970375c4929dce18dcc6edc33268baf1'}, {'CommitId': 'b0fcc5c32f4bf0a71f373dd18891ce9faf36f09b'}, {'CommitId': '42b0eb1b8ded679dfb851e40c27910de8493ae98'}, {'CommitId': '64fcd1323e43a2774a0171e9800a1f648c1963e3'}, {'CommitId': 'be096a63bcc731c92faf691e4345bdf5da0f3f3a'}, {'CommitId': '7b0db5cd882ab119697ccd84639703c9eb7ac43e'}, {'CommitId': '2e78eb7d37bd47baae328fef071dfdec12ad403e'}, {'CommitId': 'c8a2da38edc2c0f02ff29ff07d791e4c3068d26b'}][0m

[1m> Finished chain.[0m


'The commit IDs are: 15b7f370e2509e9ffeae0ec6d23bfa65b8cc733b, b566ec77aff49fc0844bb2fec5229ea69a7cf7fb, 6581651b970375c4929dce18dcc6edc33268baf1, b0fcc5c32f4bf0a71f373dd18891ce9faf36f09b, 42b0eb1b8ded679dfb851e40c27910de8493ae98, 64fcd1323e43a2774a0171e9800a1f648c1963e6, be096a63bcc731c92faf691e4345bdf5da0f3f3a, 7b0db5cd882ab119697ccd84639703c9eb7ac43e, 2e78eb7d37bd47baae328fef071dfdec12ad403e, c8a2da38edc2c0f02ff29ff07d791e4c3068d26b.'

In [None]:
# working prompt 7
question="""what are the parent commits starting from 42b0eb1b8ded679dfb851e40c27910de8493ae98"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]




[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit {id: '42b0eb1b8ded679dfb851e40c27910de8493ae98'})-[:HAS_PARENT]->(parent:Commit)
RETURN parent
[0m
Full Context:
[32;1m[1;3m[{'parent': {'id': '64fcd1323e43a2774a0171e9800a1f648c1963e3'}}][0m

[1m> Finished chain.[0m


'The parent commit starting from 42b0eb1b8ded679dfb851e40c27910de8493ae98 is 64fcd1323e43a2774a0171e9800a1f648c1963e3.'

In [None]:
# working prompt 8
question="""Number of commits committed by nwiizo?"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:Person {id: "nwiizo"})<-[:COMMITTED_BY]-(c:Commit)
RETURN count(c) AS NumberOfCommits
[0m
Full Context:
[32;1m[1;3m[{'NumberOfCommits': 14}][0m

[1m> Finished chain.[0m


'The number of commits committed by nwiizo is 14.'

In [None]:
# working prompt 9
question="""what is the commit id when we add or modify build.rs file"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[r]->(f:File)
WHERE f.id = "build.rs" AND (type(r) = "MODIFIES" OR type(r) = "ADDS" OR type(r) = "INTRODUCES")
RETURN c.id
[0m
Full Context:
[32;1m[1;3m[{'c.id': '64fcd1323e43a2774a0171e9800a1f648c1963e3'}][0m

[1m> Finished chain.[0m


'The commit ID is 64fcd1323e43a2774a0171e9800a1f648c1963e3.'

In [None]:
# working prompt 10
question="""list one man who authored rEADME md file"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[:AUTHORED_BY]->(p:Person)
MATCH (c)-[:MODIFIES]->(f:File)
WHERE f.id = "README.md"
RETURN p.id
LIMIT 1
[0m
Full Context:
[32;1m[1;3m[{'p.id': 'nwiizo'}][0m

[1m> Finished chain.[0m


'nwiizo authored the rEADME md file.'

In [None]:
# Testcase 11
question="""list one man who added, introduced or modified LICENSE file"""

# Chain.run() is deprecated; invoke() takes the question under the chain's "query"
# key and returns a dict whose "result" entry is the answer text.
cypher_chain.invoke({"query": question})["result"]



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:Commit)-[r:ADDS|INTRODUCES|MODIFIES]->(f:File)
WHERE f.id = "LICENSE"
WITH c
MATCH (c)-[:AUTHORED_BY]->(p:Person)
RETURN p.id AS PersonId, count(c) as Commits
ORDER BY Commits DESC
LIMIT 1
[0m
Full Context:
[32;1m[1;3m[{'PersonId': 'nwiizo', 'Commits': 1}][0m

[1m> Finished chain.[0m


'nwiizo is one man who added, introduced or modified LICENSE file.'

# Visualize created Knowledge graph using LLM

In [None]:
!sudo apt-get install graphviz graphviz-dev
#pip install pygraphviz


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'libgraphviz-dev' instead of 'graphviz-dev'
graphviz is already the newest version (2.42.2-6ubuntu0.1).
The following additional packages will be installed:
  libgail-common libgail18 libgtk2.0-0 libgtk2.0-bin libgtk2.0-common
  libgvc6-plugins-gtk librsvg2-common libxdot4
Suggested packages:
  gvfs
The following NEW packages will be installed:
  libgail-common libgail18 libgraphviz-dev libgtk2.0-0 libgtk2.0-bin
  libgtk2.0-common libgvc6-plugins-gtk librsvg2-common libxdot4
0 upgraded, 9 newly installed, 0 to remove and 34 not upgraded.
Need to get 2,434 kB of archives.
After this operation, 7,681 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libgtk2.0-common all 2.24.33-2ubuntu2.1 [125 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libgtk2.0-0 amd64 2.24.33-2ubuntu2.1 [2,038 kB]
Get:3 http://a

In [None]:
!pip install pygraphviz

Collecting pygraphviz
  Downloading pygraphviz-1.14.tar.gz (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.0/106.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pygraphviz
  Building wheel for pygraphviz (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pygraphviz: filename=pygraphviz-1.14-cp311-cp311-linux_x86_64.whl size=169713 sha256=31767ee2aba320ef52a35f158c282e729a2b4811b1aaae74b744d9e193eacd6b
  Stored in directory: /root/.cache/pip/wheels/9c/5f/df/6fffd2a4353f26dbb0e3672a1baf070c124a1d74a5f9318279
Successfully built pygraphviz
Installing collected packages: pygraphviz
Successfully installed pygraphviz-1.14


In [None]:
# visualize the graph
# No `graph` object is created in any of the visible cells above, so the original
# call failed with NameError on a fresh kernel. The graph is therefore built here
# from the relationships the LLM transformer extracted.
# NOTE(review): nodes that take part in no relationship are not added — confirm
# whether isolated nodes should be drawn as well.
from langchain_community.graphs.networkx_graph import KnowledgeTriple

graph = NetworkxEntityGraph()
for graph_document in graph_documents:
    for relationship in graph_document.relationships:
        graph.add_triple(
            KnowledgeTriple(
                subject=str(relationship.source.id),
                predicate=relationship.type,
                object_=str(relationship.target.id),
            )
        )

# Lay the graph out with Graphviz "dot" and write the drawing to graph.svg.
graph.draw_graphviz(prog="dot", path="graph.svg")