<a href="https://colab.research.google.com/github/donghuna/AI-Expert/blob/main/Project/retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain
!pip install tiktoken
!pip install langchain-openai
!pip install faiss-cpu
!pip install langchain-community

Collecting langchain
  Downloading langchain-0.2.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.30 (from langchain)
  Downloading langchain_core-0.2.30-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.99-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.30->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4

In [2]:
import os
from pathlib import Path
import subprocess
import shutil
from langchain.text_splitter import TokenTextSplitter

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

from google.colab import userdata

In [3]:
def clone_git_repository(repo_url, clone_dir):
    """
    주어진 GitHub 저장소를 지정된 디렉토리에 클론합니다.
    """
    if os.path.exists(clone_dir):
        print(f"Directory {clone_dir} already exists. Deleting and recloning the repository.")
        shutil.rmtree(clone_dir)
    subprocess.run(['git', 'clone', repo_url, clone_dir], check=True)
    print(f"Repository cloned to {clone_dir}")

def read_code_files(directory, extensions=['.py']):
    """
    지정된 디렉토리에서 주어진 확장자를 가진 모든 코드 파일을 읽어서 반환합니다.
    """
    code_texts = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if any(file.endswith(ext) for ext in extensions):
                file_path = Path(root) / file
                with open(file_path, 'r', encoding='utf-8') as f:
                    code_texts.append(f.read())
    return code_texts

def split_text_with_overlap(text, chunk_size=1000, overlap=200):
    """
    TokenTextSplitter를 사용하여 텍스트를 오버랩을 가진 청크로 나눕니다.
    """
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        disallowed_special=()  # 모든 특수 토큰을 허용하지 않음
    )
    return text_splitter.split_text(text)

def process_code_directory(directory, extensions=['.py', '.cpp', '.h'], chunk_size=1000, overlap=200):
    """
    디렉토리 내의 모든 코드 파일을 읽고, TokenTextSplitter를 사용하여 오버랩이 있는 청크로 나눠서 반환합니다.
    """
    code_texts = read_code_files(directory, extensions)
    chunks = []
    for text in code_texts:
        chunks.extend(split_text_with_overlap(text, chunk_size, overlap))
    return chunks

In [5]:
# def main():


# if __name__ == "__main__":
#     main()


Repository cloned to ./llama_cpp


In [20]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever
from langchain.tools.retriever import create_retriever_tool
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

repo_url = 'https://github.com/ggerganov/llama.cpp.git'
clone_dir = './llama_cpp'
openai_organization_id = userdata.get('MY_OPENAI_ORG_ID')
openai_key = userdata.get('OPEN-AI_KEY')

# GitHub 저장소 클론
# clone_git_repository(repo_url, clone_dir)

# 클론한 디렉토리 내의 코드 파일 처리
chunks = process_code_directory(clone_dir, extensions=['.py', '.cpp', '.h'], chunk_size=1000, overlap=200)

# 임시코드
chunks = chunks[:30]

# 청크 출력
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:")
    print(chunk)
    print("\n" + "-"*80 + "\n")

# VectorStore를 생성합니다.
embeddings = OpenAIEmbeddings(openai_api_key=openai_key, organization=openai_organization_id)
vector = FAISS.from_texts(chunks, embeddings)

# Retriever를 생성합니다.
# base_retriever = vector.as_retriever()

model_name = "gpt-3.5-turbo"
os.environ["OPENAI_API_KEY"] = userdata.get('OPEN-AI_KEY')
llm=ChatOpenAI(model=model_name, temperature=0.4, organization=openai_organization_id, max_tokens=500)

# 프롬프트 템플릿 작성
prompt_template = """
다음 질문에 대한 관련 정보를 검색하고 답변을 제공합니다:

질문: {question}

관련 정보: {retrieved_content}

답변을 간결하고 명확하게 작성하세요.
"""

# 프롬프트를 생성합니다.
prompt = PromptTemplate(
    input_variables=["question", "retrieved_content"],
    template=prompt_template,
)

# LLMChain 생성
qa_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)


Chunk 1:
#!/usr/bin/env python3
from __future__ import annotations

import logging
import argparse
import os
import struct
import sys
from enum import IntEnum
from pathlib import Path

import numpy as np

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

logger = logging.getLogger("ggml-to-gguf")


class GGMLFormat(IntEnum):
    GGML = 0
    GGMF = 1
    GGJT = 2


class GGMLFType(IntEnum):
    ALL_F32              = 0
    MOSTLY_F16           = 1
    MOSTLY_Q4_0          = 2
    MOSTLY_Q4_1          = 3
    MOSTLY_Q4_1_SOME_F16 = 4
    MOSTLY_Q8_0          = 7
    MOSTLY_Q5_0          = 8
    MOSTLY_Q5_1          = 9
    MOSTLY_Q2_K          = 10
    MOSTLY_Q3_K_S        = 11
    MOSTLY_Q3_K_M        = 12
    MOSTLY_Q3_K_L        = 13
    MOSTLY_Q4_K_S        = 14
    MOSTLY_Q4_K_M        = 15
    MOSTLY_Q5_K_S        = 16
    MOSTLY_Q5_K_M        = 17
    MOSTLY_Q6_K          = 18


class Hyperparameters:
    def __init_

In [25]:
import numpy as np

def get_answer_with_retrieval(question):
    # 검색 쿼리에 대한 임베딩 계산
    query_embedding = embeddings.embed_query(question)

    # query_embedding을 NumPy 배열로 변환
    query_embedding = np.array(query_embedding).reshape(1, -1)

    # FAISS에서 유사도 검색 (유사도 점수와 함께)
    scores, indices = vector.index.search(query_embedding.reshape(1, -1), k=3)

    # 검색된 문서(청크)와 유사도 점수를 함께 가져옵니다.
    retrieved_docs_with_scores = [(chunks[i], scores[0][j]) for j, i in enumerate(indices[0])]

    # 각 청크와 유사도 점수를 출력합니다.
    for content, score in retrieved_docs_with_scores:
        print(f"Chunk: {content[:500]}...")  # 청크 내용 일부 출력
        print(f"Similarity Score: {score}")
        print("-" * 80)

    # 검색된 내용을 텍스트로 결합합니다.
    retrieved_content = "\n".join([content for content, score in retrieved_docs_with_scores])

    # 검색된 정보를 포함한 프롬프트로 LLM에 전달하여 답변을 생성합니다.
    answer = qa_chain.run({
        "question": question,
        "retrieved_content": retrieved_content,
    })

    return answer

question = "GGMLFType에서 MOSTLY_F16 값이 뭘까"
answer = get_answer_with_retrieval(question)

print("Question:", question)
print("Answer:", answer)

Chunk: #!/usr/bin/env python3
from __future__ import annotations

import logging
import argparse
import os
import struct
import sys
from enum import IntEnum
from pathlib import Path

import numpy as np

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

logger = logging.getLogger("ggml-to-gguf")


class GGMLFormat(IntEnum):
    GGML = 0
    GGMF = 1
    GGJT = 2


class GGMLFType(IntEnum):
    ALL_F32              = 0
    MOSTLY_F16           = 1
    MOSTLY_Q4_0          = 2
    MOSTLY_Q4_1          = 3
    MOSTLY_Q4_1_SOME_F16 = 4
    MOSTLY_Q8_0          = 7
    MOSTLY_Q5_0          = 8
    MOSTLY_Q5_1          = 9
    MOSTLY_Q2_K          = 10
    MOSTLY_Q3_K_S        = 11
    MOSTLY_Q3_K_M        = 12
    MOSTLY_Q3_K_L        = 13
    MOSTLY_Q4_K_S        = 14
    MOSTLY_Q4_K_M        = 15
    MOSTLY_Q5_K_S        = 16
    MOSTLY_Q5_K_M        = 17
    MOSTLY_Q6_K          = 18


class Hyperparameters:
    def __init__(