In [None]:
!pip install PyGithub docarray qdrant-client 

In [43]:
import os
import dotenv
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import json
import time

# Chat model name constant. No cell visible in this notebook reads it: the ChatOpenAI
# instance created further down is built without a model name.
OPEN_AI_MODEL_NAME="gpt-4o-mini"
# Path of the dotenv file that supplies the environment variables read below.
CONFIG_FILE_PATH = '.7ytrepmnt'
dotenv.load_dotenv(CONFIG_FILE_PATH)
# Echo the Qdrant endpoint as a quick check that the variables are available.
print(os.getenv("QDRANT_HOST_URL"))
# NOTE: the variable is spelled "GITTHUB" (double T) and is referenced with that exact
# spelling by the GitHubIssuesLoader cell; rename both together if the typo is fixed.
GITTHUB_TOKEN  = os.getenv("GITHUB_TOKEN")

localhost:6333


## Load GitHub Issues

In [44]:
import requests

# Repository to analyse, in "<owner>/<name>" form.
repo = "spring-projects/spring-boot"
url = f"https://api.github.com/repos/{repo}"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Surface HTTP errors (rate limit, 404, ...) here instead of as a confusing
# KeyError on 'owner' a few lines below.
response.raise_for_status()
data = response.json()

# Login of the account that owns the repository.
creator = data['owner']['login']
print(f"The creator of the repository is: {creator}")


The creator of the repository is: spring-projects


In [45]:
from langchain_community.document_loaders import GitHubIssuesLoader
# Browser-style URL of the repository, built from the "<owner>/<name>" string in `repo`.
repo_url = f"https://github.com/{repo}"

In [46]:
# Configure the LangChain GitHub issues loader for the repository selected above.
# NOTE(review): `creator` holds the repository owner's login. If the loader's `creator`
# argument filters by the account that opened each issue (presumably, as in the GitHub
# issues API), only issues opened by that account would be returned - confirm the intent.
ghLoader = GitHubIssuesLoader(
    repo=repo,
    access_token=GITTHUB_TOKEN,  # delete/comment out this argument if you've set the access token as an env var.
    creator=creator,
    include_prs=False,
)

In [31]:
# Load the issues through the loader configured above (presumably a GitHub API call).
# `ghDocs` is not read by any other cell visible in this notebook.
ghDocs = ghLoader.load()

## Helper function to get issues page by page and save them

In [32]:
def download_github_issues_as_dict(repo_url, issue_state, token):
    '''
    Look up a GitHub repository with PyGithub and return its issue listing.

    Despite the historical name, no dictionary is built here: the value returned is
    whatever ``repo.get_issues(state=...)`` produces. The rest of the notebook pages
    through it with ``get_page`` and reads ``totalCount`` from it.

    :param repo_url: the full URL of the repo, e.g. "https://github.com/owner/repo"
        (don't include the trailing "/").
    :param issue_state: forwarded unchanged as the ``state`` argument of ``get_issues``;
        the notebook uses "open", "closed" or "all".
    :param token: a GitHub Personal Access Token (create from GitHub itself)
    :return: the repository's issue listing filtered by ``issue_state``.
    :raises AssertionError: if ``repo_url`` or ``token`` fails the basic sanity checks.
    :raises ValueError: if ``repo_url`` does not reduce to exactly "owner/repo".
    '''
    assert isinstance(repo_url, str) and not repo_url.endswith("/") and "/" in repo_url, "need nice repo_url"
    assert isinstance(token, str), "need nice token"

    # Validate the URL shape up front so a malformed URL fails with a clear message
    # before any client is created or any request is made.
    parts = repo_url.replace("https://github.com/", "").split("/")
    if len(parts) != 2 or not all(parts):
        raise ValueError(
            f"repo_url must look like https://github.com/owner/repo, got {repo_url!r}"
        )
    user_str, repo_str = parts

    import github  # pip install PyGithub
    g = github.Github(token)
    user = g.get_user(user_str)
    repo = user.get_repo(repo_str)
    return repo.get_issues(state=issue_state)

def get_issues_pageCount(issues, coun_per_page=30):
    """
    Return how many pages are needed to cover every entry of an issue listing.

    Args:
        issues: Object exposing an integer ``totalCount`` attribute.
        coun_per_page: Number of entries per page (30 by default).

    Returns:
        ``ceil(totalCount / coun_per_page)``; 0 when the listing is empty.

    Raises:
        ValueError: If ``coun_per_page`` is not positive.
    """
    if coun_per_page <= 0:
        raise ValueError("coun_per_page must be a positive integer")
    # Integer ceiling division. The previous ``round(x) + 1`` over-counted by one page
    # whenever the total was an exact multiple of the page size or the fractional part
    # rounded up (e.g. 540 -> 19 instead of 18), producing an empty trailing page.
    return -(-issues.totalCount // coun_per_page)
    
def get_issues_by_page(issues,page_num):
    """
    Collect one page of an issue listing into a plain dictionary.

    Entries whose ``pull_request`` attribute is truthy are left out.

    :param issues: object exposing ``get_page(page_num)``.
    :param page_num: page index handed straight to ``issues.get_page``.
    :return: mapping of issue number to a dict with the keys "title", "body",
        "state" and "comments" (the list of comment bodies).
    """
    def _as_record(issue):
        # Build the per-issue record; comment bodies are collected last.
        return {
            "title": issue.title,
            "body": issue.body,
            "state": issue.state,
            "comments": [comment.body for comment in issue.get_comments()],
        }

    return {
        issue.number: _as_record(issue)
        for issue in issues.get_page(page_num)
        if not issue.pull_request
    }
    

In [33]:
def write_issues_to_file(file, page_number, issues_source=None):
    """
    Writes GitHub issues from a specified page to an already opened file.

    Entries whose ``pull_request`` attribute is truthy are skipped. Each remaining
    issue is written as a begin marker, title, body, one line per comment body and an
    end marker.

    Args:
        file: An open, writable text file handle.
        page_number: The page number of issues to retrieve.
        issues_source: Optional issue listing exposing ``get_page``. When omitted, the
            notebook-level ``issues`` variable is used, which keeps the original
            two-argument behaviour. Pass a listing explicitly (for example the
            closed-issue listing) so the export does not depend on whatever
            ``issues`` currently refers to.

    Returns:
        None
    """
    if issues_source is None:
        # Backward-compatible fallback to the global the original implementation read.
        issues_source = issues
    for iss in issues_source.get_page(page_number):
        if iss.pull_request:
            continue
        file.write(f"***Begining of Issue number:{iss.number}***\n")
        file.write(f"Issue Title:{iss.title}\n")
        file.write(f"Issue Body:{iss.body}\n")
        file.write("Comments:\n")
        for comment in iss.get_comments():
            file.write(f"{comment.body}\n")
        file.write(f"***End of Issue number:{iss.number}***\n")


## Fetch issues for configured repo

In [34]:
# Same value that an earlier cell already assigned to repo_url.
repo_url = f"https://github.com/{repo}"
token = os.getenv("GITHUB_TOKEN")
issue_state = "open" # other values - "closed", "all"
# Despite the helper's name, `issues` is the object returned by repo.get_issues(...),
# not a dict. write_issues_to_file reads this notebook-level variable when called
# with two arguments.
issues = download_github_issues_as_dict(repo_url,issue_state,token)

In [35]:
# Report the size of the open-issue listing and the page count derived from it.
print(f"Total Issues count is: {issues.totalCount}")
# Upper bound used by the export loop further down.
issues_page_count = get_issues_pageCount(issues)
print(f"Total pages for navigating all issues: {issues_page_count}") 

Total Issues count is: 541
Total pages for navigating all issues: 19


In [38]:
# Dictionary form (keyed by issue number) of page index 1.
# `issues_dict` is not read by any other cell visible in this notebook.
issues_dict = get_issues_by_page(issues,1)

In [39]:
# Preview: print the first entry of page index 1 that is not a pull request, then stop.
issuePage1 = issues.get_page(1)
for iss in issuePage1:
    if iss.pull_request:
        continue  # pull requests are not previewed
    print(
        f"***Begining of Issue number:{iss.number}***",
        f"Issue Title:{iss.title}",
        f"Issue Body:{iss.body}",
        "Comments:",
        sep="\n",
    )
    for comment in iss.get_comments():
        print(f"{comment.body}")
    print(f"***End of Issue number:{iss.number}***")
    break  # one issue is enough for a preview

***Begining of Issue number:43079***
Issue Title:Investigate ClientHttpConnector builders for WebClient with a similar design to ClientHttpRequestFactoryBuilder
Issue Body:<!--
Thanks for raising a Spring Boot issue. Please take the time to review the following
categories as some of them do not apply here.

🙅 "Please DO NOT Raise an Issue" Cases
- Question
STOP!! Please ask questions about how to use something, or to understand why something isn't
working as you expect it to, on Stack Overflow using the spring-boot tag.
- Security Vulnerability
STOP!! Please don't raise security vulnerabilities here. Head over to https://spring.io/security-policy to learn how to disclose them responsibly.
- Managed Dependency Upgrade
You DO NOT need to raise an issue for a managed dependency version upgrade as there's a semi-automatic process for checking managed dependencies for new versions before a release. BUT pull requests for upgrades that are more involved than just a version property change are

## Save all issues page by page as text file

In [40]:
# open() does not create missing directories; make sure the target exists so the
# cell also works on a fresh checkout.
os.makedirs("gh_issues", exist_ok=True)
for page_number_to_write in range(issues_page_count):
    file_name = f"gh_issues/github_issues_{page_number_to_write}.txt"
    # Explicit UTF-8: issue bodies contain non-ASCII characters (e.g. emoji) that a
    # platform-default encoding is not guaranteed to be able to write.
    # Mode "w" replaces any existing file for this page.
    with open(file_name, "w", encoding="utf-8") as issues_file:
        write_issues_to_file(issues_file, page_number_to_write)

    print(f"GitHub issues from page {page_number_to_write} appended to '{file_name}'")

GitHub issues from page 0 appended to 'gh_issues/github_issues_0.txt'
GitHub issues from page 1 appended to 'gh_issues/github_issues_1.txt'
GitHub issues from page 2 appended to 'gh_issues/github_issues_2.txt'
GitHub issues from page 3 appended to 'gh_issues/github_issues_3.txt'
GitHub issues from page 4 appended to 'gh_issues/github_issues_4.txt'
GitHub issues from page 5 appended to 'gh_issues/github_issues_5.txt'
GitHub issues from page 6 appended to 'gh_issues/github_issues_6.txt'
GitHub issues from page 7 appended to 'gh_issues/github_issues_7.txt'
GitHub issues from page 8 appended to 'gh_issues/github_issues_8.txt'
GitHub issues from page 9 appended to 'gh_issues/github_issues_9.txt'
GitHub issues from page 10 appended to 'gh_issues/github_issues_10.txt'
GitHub issues from page 11 appended to 'gh_issues/github_issues_11.txt'
GitHub issues from page 12 appended to 'gh_issues/github_issues_12.txt'
GitHub issues from page 13 appended to 'gh_issues/github_issues_13.txt'
GitHub issue

## Load the text files

In [41]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

# Load the exported text files matched by gh_issues/*.txt relative to the working directory.
# NOTE(review): silent_errors=True presumably skips files that fail to load instead of
# raising - confirm, since that would hide e.g. encoding problems.
loader = DirectoryLoader('./', glob="gh_issues/*.txt", loader_cls=TextLoader, silent_errors=True)
docs = loader.load()

In [42]:
# Sanity check: number of documents returned by the loader.
len(docs)


19

## Querying via default Index engine

In [None]:

from langchain.vectorstores import DocArrayInMemorySearch
from langchain.indexes import VectorstoreIndexCreator

# Build an index from the loader's documents using the DocArray in-memory vector store.
# NOTE(review): no embedding is passed, so VectorstoreIndexCreator falls back to its
# default - confirm which embedding model / credentials that requires.
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

In [None]:
# query ="I am using custom arguments in PromptTemplate and GraphCypherQAChain and getting missing key error. what could be reason"
query1 =  "Facing issues with common ForkJoinPool used in ExecutableJar"
# NOTE: this rebinds `response`, which an earlier cell used for the requests.get(...) result.
response = index.query(query1)
print(response)

## Using QA Chain Retriever and LLM

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Reload the exported files and split them on newlines into chunks of up to 1000
# characters (measured with len) with an overlap setting of 200.
# NOTE: `texts` and `embeddings` are not read by any later cell visible in this
# notebook (the imported FAISS store is never instantiated).
documents = loader.load()
text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
texts = text_splitter.split_documents(documents)
embeddings = OpenAIEmbeddings()


## Use Existing Qdrant VectorStore

In [47]:
# https://python.langchain.com/v0.1/docs/integrations/vectorstores/qdrant/
qdrant_url = os.getenv("QDRANT_HOST_URL")
# qdrant_api_key = os.getenv("QDRANT_API_KEY")
print(qdrant_url)

localhost:6333


In [48]:
from qdrant_client import QdrantClient
from langchain.vectorstores import Qdrant
from langchain.embeddings import OpenAIEmbeddings

def get_existing_vector_store(collection_identity):
    """
    Wrap a Qdrant collection in a LangChain ``Qdrant`` vector store.

    The server URL is taken from the ``QDRANT_HOST_URL`` environment variable and no
    API key is supplied to the client. Embeddings come from a freshly constructed
    ``OpenAIEmbeddings`` instance.

    Args:
        collection_identity: Name of the collection, passed as ``collection_name``.

    Returns:
        The ``Qdrant`` vector store bound to that collection.
    """
    client = QdrantClient(url=os.getenv("QDRANT_HOST_URL"))
    return Qdrant(
        client=client,
        collection_name=collection_identity,
        embeddings=OpenAIEmbeddings(),
    )

In [49]:
# Collection queried in the cells below.
# NOTE: a later cell defines QDRANT_COLLECTION_NAME with the same value.
QDRANT_COLL_NAME = "github_issues_coll"
qdrant = get_existing_vector_store(QDRANT_COLL_NAME)

  warn_deprecated(


In [50]:
query = "Facing issues with common ForkJoinPool used in ExecutableJar"
# Similarity search with the store's default arguments.
found_docs = qdrant.similarity_search(query)
print(f"Total docs found:{len(found_docs)}")
# Indexing [0] assumes at least one hit; an empty result list would raise IndexError.
print(found_docs[0].page_content)

Total docs found:4
***End of Issue number:39928***
***Begining of Issue number:39843***
Issue Title:Incorrect classloader used by common ForkJoinPool when using Executable Jar
Issue Body:# Context
I created this issue as a bug report or enhancement proposal - depending on how would you classify current behaviour.
I have a spring application that I am building using "org.springframework.boot gradle" plugin. This plugin builds Executable Jar and War as described in documentation:
https://docs.spring.io/spring-boot/docs/current/reference/html/executable-jar.html
# Problem
Executable Jar uses custom class loader: `org.springframework.boot.loader.launch.LaunchedClassLoader` when running the application. 
This class loader is not propagated to the common ForkJoinPool, which uses system class loader by default.
Take a code like that:
```
IntStream.rangeClosed(0, 4)
    .parallel()
    .forEach(i -> System.out.println(Thread.currentThread().getName() + " " + Thread.currentThread().getContextCl

In [51]:
from langchain.chains import RetrievalQA

# Retriever using the "mmr" search type. Presumably "k" is the number of documents
# returned and "fetch_k" the size of the candidate pool - confirm against the LangChain docs.
retriever = qdrant.as_retriever(search_type="mmr",search_kwargs={"k": 2, "fetch_k": 4})

In [52]:
# Run the retriever on the `query` string defined in an earlier cell.
results = retriever.invoke(query)

In [53]:
print(f"Total docs found:{len(results)}")
# Indexing [0] assumes the retriever returned at least one document.
print(results[0].page_content)

Total docs found:2
***End of Issue number:39928***
***Begining of Issue number:39843***
Issue Title:Incorrect classloader used by common ForkJoinPool when using Executable Jar
Issue Body:# Context
I created this issue as a bug report or enhancement proposal - depending on how would you classify current behaviour.
I have a spring application that I am building using "org.springframework.boot gradle" plugin. This plugin builds Executable Jar and War as described in documentation:
https://docs.spring.io/spring-boot/docs/current/reference/html/executable-jar.html
# Problem
Executable Jar uses custom class loader: `org.springframework.boot.loader.launch.LaunchedClassLoader` when running the application. 
This class loader is not propagated to the common ForkJoinPool, which uses system class loader by default.
Take a code like that:
```
IntStream.rangeClosed(0, 4)
    .parallel()
    .forEach(i -> System.out.println(Thread.currentThread().getName() + " " + Thread.currentThread().getContextCl

In [54]:
#  https://stackoverflow.com/questions/78399709/limit-context-token-on-document-retrieval-chains
from langchain_openai import ChatOpenAI

# Initialize a chat model from OpenAI with no randomness in responses (temperature=0)
# NOTE(review): no model name is passed, so the library default is used; the
# OPEN_AI_MODEL_NAME constant from the first code cell is not applied here - confirm intent.
llm = ChatOpenAI(temperature=0)

# The two commented-out statements below are a disabled contextual-compression setup.
# Create a document compressor using the initialized chat model
# compressor = LLMChainExtractor.from_llm(llm)

# Create a retriever that uses contextual compression
# compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)


In [60]:
# Question-answering chain over the MMR retriever and the chat model defined above.
# "stuff" presumably means the retrieved documents are placed together into a single
# prompt - confirm against the LangChain docs.
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)
query2 =  "Facing issues with common ForkJoinPool used in ExecutableJar. How to resolve this"
# NOTE: rebinds `response`; earlier cells used the same name for other results.
response = qa_stuff.run(query2)




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [61]:
# Show the chain's answer to query2.
print(response)

To resolve the issue with the common ForkJoinPool not using the custom class loader in an Executable Jar, you can try the following approaches:

1. **Custom ForkJoinPool**: Wrap your application entry points in a custom ForkJoinPool that uses the desired class loader. This way, you can ensure that the ForkJoinPool created within your application context uses the correct class loader.

2. **Alternative Jar Solutions**: Consider building your Spring application jar without using the Executable Jar format. You can explore alternative methods mentioned in the Spring Boot documentation under "Alternative Single Jar Solutions." This may involve building a fat jar using tools like Gradle Shadow Plugin or other solutions that allow you to control the class loading behavior.

3. **Unpack Fat Jar**: If building a fat jar for your Spring application is challenging, you can try unpacking the fat jar and configuring the class loading behavior manually. This approach may require additional configura

In [62]:
# Second question, run through the same chain; overwrites the previous `response`.
query3 = "JPA DDL properties are difficult to use with auto-configuration"
response = qa_stuff.run(query3)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [63]:
# Show the chain's answer to query3.
print(response)

Yes, it seems that there are challenges with using JPA DDL properties alongside auto-configuration in Spring Boot, as mentioned in the context provided. The behavior described indicates that the `spring.jpa.generate-ddl` flag may not work as expected when Hibernate auto-configuration is active. This discrepancy could lead to unexpected schema updates when upgrading to newer versions of Spring Boot.


## Ingest some Closed issues as well

In [65]:
# Same value that earlier cells already assigned to repo_url.
repo_url = f"https://github.com/{repo}"
token = os.getenv("GITHUB_TOKEN")
issue_state = "closed" # other values - "open", "all"
# Stored separately from `issues`, which holds the open-issue listing.
issues_closed = download_github_issues_as_dict(repo_url,issue_state,token)

In [68]:
# Report the closed-issue listing. This line previously read `issues.totalCount`,
# i.e. the open-issue listing fetched earlier in the notebook.
print(f"Total Issues count is: {issues_closed.totalCount}")
issues_closed_page_count = get_issues_pageCount(issues_closed)
# Only the first 20 pages (at most) of closed issues are exported and indexed.
max_pages_to_index = min(20, issues_closed_page_count)
print(f"Total pages for navigating all clsoed issues: {issues_closed_page_count}")
print(f"Total pages for clsoed issues that will be indexed: {max_pages_to_index}")

Total Issues count is: 42246
Total pages for navigating all clsoed issues: 1409
Total pages for clsoed issues that will be indexed: 20


In [69]:
# Closed issues must come from `issues_closed`. A two-argument call to
# write_issues_to_file(file, page) reads the notebook-level `issues` (the open
# listing), so the records are written here directly from
# get_issues_by_page(issues_closed, ...) using the same text layout.
os.makedirs("gh_issues/closed", exist_ok=True)  # open() does not create directories
for page_number_to_write in range(max_pages_to_index):
    file_name = f"gh_issues/closed/github_issues_{page_number_to_write}.txt"
    closed_page = get_issues_by_page(issues_closed, page_number_to_write)
    # Explicit UTF-8 so the files can be read back by get_chunked_text, which opens
    # them with encoding="utf-8".
    with open(file_name, "w", encoding="utf-8") as issues_file:
        for issue_number, issue in closed_page.items():
            issues_file.write(f"***Begining of Issue number:{issue_number}***\n")
            issues_file.write(f"Issue Title:{issue['title']}\n")
            issues_file.write(f"Issue Body:{issue['body']}\n")
            issues_file.write("Comments:\n")
            for comment_body in issue["comments"]:
                issues_file.write(f"{comment_body}\n")
            issues_file.write(f"***End of Issue number:{issue_number}***\n")

    print(f"GitHub issues from page {page_number_to_write} appended to '{file_name}'")

GitHub issues from page 0 appended to 'gh_issues/closed/github_issues_0.txt'
GitHub issues from page 1 appended to 'gh_issues/closed/github_issues_1.txt'
GitHub issues from page 2 appended to 'gh_issues/closed/github_issues_2.txt'
GitHub issues from page 3 appended to 'gh_issues/closed/github_issues_3.txt'
GitHub issues from page 4 appended to 'gh_issues/closed/github_issues_4.txt'
GitHub issues from page 5 appended to 'gh_issues/closed/github_issues_5.txt'
GitHub issues from page 6 appended to 'gh_issues/closed/github_issues_6.txt'
GitHub issues from page 7 appended to 'gh_issues/closed/github_issues_7.txt'
GitHub issues from page 8 appended to 'gh_issues/closed/github_issues_8.txt'
GitHub issues from page 9 appended to 'gh_issues/closed/github_issues_9.txt'
GitHub issues from page 10 appended to 'gh_issues/closed/github_issues_10.txt'
GitHub issues from page 11 appended to 'gh_issues/closed/github_issues_11.txt'
GitHub issues from page 12 appended to 'gh_issues/closed/github_issues_1

In [74]:
from langchain_text_splitters import CharacterTextSplitter
def get_chunked_text(file_name):
    """
    Read a UTF-8 text file and split its contents into chunks.

    Splitting is done by ``CharacterTextSplitter`` on newlines with
    ``chunk_size=1000``, ``chunk_overlap=150`` and ``len`` as the length function.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The value of ``split_text`` for the file contents.
    """
    with open(file_name, "r", encoding="utf-8") as handle:
        contents = handle.read()

    splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,
    )
    return splitter.split_text(contents)

In [75]:
# Target collection for ingestion; same value as QDRANT_COLL_NAME defined in an earlier cell.
QDRANT_COLLECTION_NAME="github_issues_coll"

In [76]:
qdrant_collection = get_existing_vector_store(QDRANT_COLLECTION_NAME)
# Iterate over exactly the pages that were exported (max_pages_to_index) rather than
# a hard-coded 20: with fewer closed-issue pages the fixed range would try to open
# files that were never written.
# NOTE(review): add_texts presumably appends new points on every call, so re-running
# this cell would insert the same chunks again - confirm before re-executing.
for page_num in range(max_pages_to_index):
    chunked_text = get_chunked_text(f"gh_issues/closed/github_issues_{page_num}.txt")
    if not chunked_text:
        # Nothing to embed for an empty export file.
        continue
    vector_ids = qdrant_collection.add_texts(chunked_text)

Created a chunk of size 1015, which is longer than the specified 1000
Created a chunk of size 2709, which is longer than the specified 1000
Created a chunk of size 2709, which is longer than the specified 1000
Created a chunk of size 2709, which is longer than the specified 1000
