## Install dependencies in Jupyter NB

In [1]:
import sys
# Invoke pip through `sys.executable` (the interpreter running this kernel) so the
# packages listed in requirements.txt are installed for this kernel's Python rather
# than for whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install -r requirements.txt





### Imports

In [4]:
import os
import uuid
import subprocess
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.document_loaders import DirectoryLoader, NotebookLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import clean_and_tokenize
import unicodedata
from dotenv import load_dotenv

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

[nltk_data] Downloading package punkt to /Users/eeilstein/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Config

In [332]:
# --- config.py ---

# Retrieval / model settings.
NUM_SOURCE_DOCS = 8
model_name = "gpt-3.5-turbo-16k"  # VERIFIED
# Other candidates, with the author's status notes:
#   "gpt-3.5-turbo"  -> VERIFIED
#   "gpt-4-32k"      -> NOPE
#   "gpt-4"          -> NOPE

# ANSI escape sequences used to colour the text printed by the notebook.
RESET_COLOR = "\033[0m"
WHITE = "\033[37m"
GREEN = "\033[32m"
PURPLE = "\033[35m"


### load_and_index_files

In [361]:

def load_and_index_files(repo_path, extensions=None, excluded_dir='archive'):
    """Load source files from a local checkout and split them into chunks.

    For every extension the matching files under ``repo_path`` are loaded,
    files living inside a directory named ``excluded_dir`` are dropped, each
    remaining document gets its ``source`` metadata rewritten to a path
    relative to ``repo_path`` plus a fresh ``file_id`` (UUID4), and the
    documents are split with a ``RecursiveCharacterTextSplitter``
    (``chunk_size=4_000``, ``chunk_overlap=200``).

    Args:
        repo_path: Directory of the cloned repository.
        extensions: Iterable of file extensions without the leading dot.
            Defaults to ``['py', 'sql']``. ``'ipynb'`` is routed through
            ``NotebookLoader``; every other extension goes through
            ``DirectoryLoader``.
        excluded_dir: Name of a directory whose contents are skipped.

    Returns:
        A list with the chunks of all successfully loaded extensions. An
        extension whose loading raises is reported on stdout and skipped.
    """
    if extensions is None:
        extensions = ['py', 'sql']
    if not extensions:
        return []

    # The splitter settings do not depend on the extension, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=4_000, chunk_overlap=200)

    def _is_excluded(file_path):
        # Test only the components below repo_path: a parent directory of the
        # checkout that happens to be called `excluded_dir` must not hide everything.
        relative_parts = os.path.relpath(file_path, repo_path).split(os.sep)
        return excluded_dir in relative_parts

    texts = []
    for ext in extensions:
        glob_pattern = f'**/*.{ext}'
        try:
            loaded_documents = []
            if ext == 'ipynb':
                # Notebooks are loaded one file at a time with NotebookLoader.
                notebook_files = glob.glob(f'{repo_path}/**/*.{ext}', recursive=True)
                for file in notebook_files:
                    if _is_excluded(file):
                        continue
                    loader = NotebookLoader(file, include_outputs=True, max_output_length=20, remove_newline=True)
                    loaded_documents += loader.load()
            else:
                loader = DirectoryLoader(repo_path, glob=glob_pattern, loader_kwargs={"content_type": "text/plain"})
                loaded_documents = [
                    doc for doc in loader.load()
                    if not _is_excluded(doc.metadata['source'])
                ]

            if loaded_documents:
                print(f'[LOG] {ext} loaded!')
                for doc in loaded_documents:
                    # Metadata is rewritten before splitting, as in the original flow.
                    doc.metadata['source'] = os.path.relpath(doc.metadata['source'], repo_path)
                    doc.metadata['file_id'] = str(uuid.uuid4())
                texts += text_splitter.split_documents(loaded_documents)
        except Exception as e:
            # Best effort: one failing file type must not abort the whole indexing run.
            print(f"Error loading files with pattern '{glob_pattern}': {e}")
            continue

    return texts


#### repo funcs

In [362]:
def extract_repo_name(repo_url):
    """Return the repository name contained in a Git remote URL.

    The name is the last ``/``-separated segment of ``repo_url`` with a
    trailing ``.git`` removed. Trailing slashes are ignored, so
    ``https://host/user/repo/`` yields ``repo`` instead of an empty string.

    Args:
        repo_url: Remote URL of the repository.

    Returns:
        The repository name as a string.
    """
    # Without the rstrip a URL ending in '/' would produce an empty last segment.
    repo_name = repo_url.rstrip('/').split('/')[-1]
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-4]  # drop the '.git' suffix
    return repo_name



def is_repo_cloned(repo_url, path_dir):
    """Tell whether a directory named after the repository exists in ``path_dir``.

    The directory name is derived from ``repo_url`` with ``extract_repo_name``;
    only the directory's existence is checked.
    """
    expected_checkout = os.path.join(path_dir, extract_repo_name(repo_url))
    return os.path.isdir(expected_checkout)

def clone_github_repo(github_url, local_path):
    """Clone ``github_url`` into ``local_path`` by running ``git clone``.

    Args:
        github_url: Remote URL handed to ``git clone``.
        local_path: Destination directory handed to ``git clone``.

    Returns:
        True when the command exits successfully. False when git exits with
        a non-zero status or the command cannot be started at all (for
        example because no ``git`` executable is found); the reason is
        printed in both cases.
    """
    try:
        subprocess.run(['git', 'clone', github_url, local_path], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable git binary, which
        # subprocess raises before any exit status exists.
        print(f"Failed to clone repository: {e}")
        return False
    return True

## Start the Script!

In [363]:
# Load variables from a local .env file, then read the API key from the environment.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Repository to index. Alternative target kept for reference:
#   https://github.com/cmooredev/RepoReader
github_url = r"https://github.com/Lightricks/dwh-data-model-transforms"

# Parent directory that holds the local checkouts, and the checkout's own name.
local_path = 'stat_path_repos'
repo_name = extract_repo_name(github_url)

### Clone repo

In [364]:
_is_repo_cloned = is_repo_cloned(github_url, local_path)
print(f'[LOG] is repo {repo_name} already cloned? {_is_repo_cloned}')

# Reuse an existing checkout in the static path; clone only when it is missing.
if _is_repo_cloned:
    repo_condition = True
else:
    repo_condition = clone_github_repo(github_url, os.path.join(local_path, repo_name))

# Stop here when the clone failed: the following cells would otherwise index a
# directory that does not exist and fail later with a less obvious error.
if not repo_condition:
    raise RuntimeError(f'Could not clone {github_url} into {os.path.join(local_path, repo_name)}')

[LOG] is repo dwh-data-model-transforms already cloned? True


### Process the repo from local dir

In [365]:

# Load the files of the local checkout and split them into chunks.
repo_dir = os.path.join(local_path, repo_name)
texts = load_and_index_files(repo_dir)

print("Done")

[LOG] py loaded!
[LOG] sql loaded!
Done


### Reset our Chroma Vector DB

In [366]:
try:
    # Clean up: delete the collection held by a `vectordb` left over from an earlier run.
    vectordb.delete_collection()
    vectordb.persist()
except NameError:
    # `vectordb` has not been created in this kernel yet, so there is nothing to reset.
    print("Passed")
except Exception as e:
    # Still best effort, but report the reason instead of hiding it. Unlike a bare
    # `except:`, this lets KeyboardInterrupt / SystemExit propagate.
    print(f"[LOG] Could not reset the existing collection: {e}")


# Directory the Chroma DB is persisted to, and the embedding function it uses.
persist_directory = f'db_{repo_name}'
embedding = OpenAIEmbeddings()


### [Optional] Create it from scratch

In [367]:

"""## create the DB"""

# Embed the chunks and store them in Chroma.
# Passing persist_directory keeps the embeddings on disk.
#
# OpenAI embeddings are used here; the plan is to swap to local embeddings later.

vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)


Using embedded DuckDB with persistence: data will be stored in: db_dwh-data-model-transforms


### Call it from the persisted DB (if already exists)

In [368]:

# Persist the DB to disk, then drop the in-memory handle so the object used
# below is the one reloaded from `persist_directory`.
vectordb.persist()
vectordb = None

# Now we can load the persisted database from disk, and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)

Using embedded DuckDB with persistence: data will be stored in: db_dwh-data-model-transforms


In [369]:
# Overrides the NUM_SOURCE_DOCS value assigned in the config cell.
NUM_SOURCE_DOCS = 5
retriever = vectordb.as_retriever(search_kwargs=dict(k=NUM_SOURCE_DOCS))

# Sample query against the retriever; the cell displays how many documents come back.
docs = retriever.get_relevant_documents("How much money did iOS raise?")

len(docs)

5

## Make a chain

In [370]:
# Question-answering chain over the retriever, using OpenAI() as the LLM.
# Note: `qa_chain` is rebound further down to a chain built on `turbo_llm`.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)


### Process answer func

In [371]:
def process_llm_response(llm_response):
    """Print the chain's answer and its source paths, then return the answer.

    Reads ``llm_response['result']`` (printed wrapped in the GREEN and
    RESET_COLOR codes) and the ``metadata['source']`` of every entry in
    ``llm_response['source_documents']`` (printed one per line).
    """
    answer_text = llm_response['result']
    banner = "".join([GREEN, '\nANSWER\n', answer_text, RESET_COLOR, '\n'])
    print(banner)

    print('\n\nSources:')
    for source_doc in llm_response["source_documents"]:
        print(source_doc.metadata['source'])

    return answer_text


## Call the LLM API

In [372]:
# Chat model for the QA chain; `model_name` is set in the config cell.
turbo_llm = ChatOpenAI(model_name=model_name, temperature=0.2)

### Integrate LLM API and source-docs in the Chain

In [373]:
# Build the question-answering chain on the chat model; source documents are
# returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)


### Set the prompt template

In [374]:
# Starts empty; the chat loop replaces it with the previous question/answer pair.
conversation_history = ""

# Prompt template filled with str.format; it uses the placeholders repo_name,
# github_url, conversation_history and question.
context =  """Repo: {repo_name} ({github_url}) | | Conversation history: {conversation_history}

            Instructions:
            1. Answer based on context/docs.
            2. Focus on repo/code.
            3. Consider:
                a. Purpose/features - describe.
                b. Functions/code - provide details/samples.
                c. Setup/usage - give instructions.
            4. SQL Syntax is Bigquery
            5. Unsure? Say "I am not sure".


    Question: {question}
    Answer:
"""

# Values for the template. The last three keys have no placeholder in the
# template, so str.format ignores them.
kw = dict(
    repo_name=repo_name,
    github_url=github_url,
    conversation_history=conversation_history,
    numbered_documents="numbered_documents",
    file_type_counts="file_type_counts",
    filenames="filenames",
)

# Start chatting!

In [375]:

# Interactive loop: read a question, run it through the chain, print the answer and
# remember the exchange as context for the next question.
while True:
    query = input(PURPLE + "\nAsk a question about the repository, BE SPECIFIC ('exit' to quit): ")
    print(RESET_COLOR)

    command = query.strip()
    # Quit only on the exact command. A prefix check would also end the session on
    # a real question that merely starts with "exit".
    if command.lower() == "exit":
        break
    # Nothing was asked: do not spend an LLM call on an empty question.
    if not command:
        continue
    print('Thinking...')

    # Rebuilt every turn so the prompt carries the latest conversation_history.
    kw = {"repo_name":repo_name, "github_url":github_url, "conversation_history":conversation_history, "numbered_documents":"numbered_documents", "file_type_counts":"file_type_counts", "filenames":"filenames"}
    llm_response = qa_chain(context.format(question=query, **kw))
    result = process_llm_response(llm_response)

    # Only the most recent exchange is kept as history.
    conversation_history = f'Last Question: {query} \nLast Answer: {result}'


Ask a question about the repository, BE SPECIFIC ('exit' to quit): do you see "unified_devices"? explain
[0m
Thinking...
[32m
ANSWER
Yes, I see the "unified_devices" model in the dwh-data-model-transforms repository. This model is used to create a table that combines device information from both iOS and Android applications. It includes fields such as platform, application, lt_defacto_id, icloud_id, install_time, first_launch, and more. The model uses the "unified_device_info_log" table as a source for the device information.[0m



Sources:
models/general/unified/unified_devices.sql
models/global/unified_devices/unified_devices_global.sql
models/general/attribution_and_unified_devices_sources/unified_device_info_log.sql
models/general/purchase_and_devices/manipulated_devices_table.sql
models/global/unified_device_info_log/unified_device_info_log_legacy.sql

Ask a question about the repository, BE SPECIFIC ('exit' to quit): what are its sources?
[0m
Thinking...
[32m
ANSWER
The "un