In [1]:
import os
import getpass

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake


In [2]:
embeddings = OpenAIEmbeddings(disallowed_special=())

In [3]:
import os
from langchain.document_loaders import TextLoader

root_dir = './DA_fm_analyze'
docs = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        try: 
            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception as e: 
            print(f"Could not load {dirpath}/{file}")
            print(e)

Could not load ./DA_fm_analyze/scripts/tesseract/images/d1.JPG
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Could not load ./DA_fm_analyze/scripts/tesseract/images/d2.JPG
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Could not load ./DA_fm_analyze/.git/index
'utf-8' codec can't decode byte 0xe5 in position 17: invalid continuation byte
Could not load ./DA_fm_analyze/.git/objects/pack/pack-f798eee2d00d3b196f8760341e72b1efde610a92.idx
'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Could not load ./DA_fm_analyze/.git/objects/pack/pack-f798eee2d00d3b196f8760341e72b1efde610a92.pack
'utf-8' codec can't decode byte 0x91 in position 11: invalid start byte
Could not load ./DA_fm_analyze/fmf/Important_Attrs_Pos.fmf
'utf-8' codec can't decode byte 0x81 in position 9: invalid start byte


In [4]:
docs

[Document(page_content='import argparse\nimport os\n\nfrom fmanalyze.attrs.instructions import *\nfrom fmanalyze.aggregate.main import create_dfs_for_basedir\n\npd.options.mode.chained_assignment = None\nimport yaml\nimport shutil\n\n\nif __name__ == \'__main__\':\n    parser = argparse.ArgumentParser()\n\n    parser.add_argument(\'--config\', help=\'Path to config file\', default=None, required=False)\n    args = parser.parse_args()\n    with open(args.config, \'r\') as confhandle:\n        config = yaml.safe_load(confhandle)\n\n    sourcedir = config.get("source_dir", None)\n    targetdir = config.get("target_dir", None)\n\n    if sourcedir is not None and targetdir is not None:\n        teams_dir = os.path.join(targetdir, \'teams\')\n\n        for rtf_filame in os.listdir(sourcedir):\n            print(f\'Processing {rtf_filame}...\')\n            dir_to_create = os.path.splitext(rtf_filame)[0]\n            basedir = os.path.join(teams_dir, dir_to_create)\n\n            os.makedirs(

In [5]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)


Created a chunk of size 1043, which is longer than the specified 1000
Created a chunk of size 1133, which is longer than the specified 1000
Created a chunk of size 1016, which is longer than the specified 1000
Created a chunk of size 1020, which is longer than the specified 1000
Created a chunk of size 1540, which is longer than the specified 1000
Created a chunk of size 1202, which is longer than the specified 1000
Created a chunk of size 1039, which is longer than the specified 1000
Created a chunk of size 1252, which is longer than the specified 1000


In [6]:
len(texts)

162

In [7]:
username = "diegoamicabile" # replace with your username from app.activeloop.ai
db = DeepLake(dataset_path=f"hub://{username}/da-fm-analyze", embedding_function=embeddings, public=True) #dataset would be publicly available
db.add_documents(texts)


Your Deep Lake dataset has been successfully created!
This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/diegoamicabile/da-fm-analyze
hub://diegoamicabile/da-fm-analyze loaded successfully.


Evaluating ingest: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:17<00:00
/

Dataset(path='hub://diegoamicabile/da-fm-analyze', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (162, 1536)  float32   None   
    ids      text     (162, 1)      str     None   
 metadata    json     (162, 1)      str     None   
   text      text     (162, 1)      str     None   


 

['a74c6a30-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6af8-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6b34-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6b52-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6b70-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6b84-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6ba2-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6bc0-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6bd4-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6bf2-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6c06-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6c24-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6c38-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6c56-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6c6a-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6c88-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6c9c-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6cba-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6cce-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6cec-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6d00-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6d1e-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6d32-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6d50-f29e-11ed-86b4-4ccc6ae1c359',
 'a74c6d64-f29e-

In [8]:
db = DeepLake(dataset_path="hub://diegoamicabile/da-fm-analyze", read_only=True, embedding_function=embeddings)

/

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/diegoamicabile/da-fm-analyze



\

hub://diegoamicabile/da-fm-analyze loaded successfully.

Deep Lake Dataset in hub://diegoamicabile/da-fm-analyze already exists, loading from the storage
Dataset(path='hub://diegoamicabile/da-fm-analyze', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape      dtype  compression
  -------   -------    -------    -------  ------- 
 embedding  generic  (162, 1536)  float32   None   
    ids      text     (162, 1)      str     None   
 metadata    json     (162, 1)      str     None   
   text      text     (162, 1)      str     None   


  

In [9]:
retriever = db.as_retriever()
retriever.search_kwargs['distance_metric'] = 'cos'
retriever.search_kwargs['fetch_k'] = 100
retriever.search_kwargs['maximal_marginal_relevance'] = True
retriever.search_kwargs['k'] = 10

In [12]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

model = ChatOpenAI(model_name='gpt-4') # switch to 'gpt-4'
qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)


In [13]:
questions = [
    "Can you describe what this repository does?",
] 
chat_history = []

for question in questions:  
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result['answer']))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")


-> **Question**: Can you describe what this repository does? 

**Answer**: I cannot provide a specific description of what this repository does, as the provided information does not include any details about the content or purpose of the code within the repository. The repository is named "DA_fm_analyze", but there is no description available to give more context. 



In [14]:
questions = [
    "Can you list the modules and the files in this repository ?",
] 
chat_history = []

for question in questions:  
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result['answer']))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")

-> **Question**: Can you list the modules and the files in this repository ? 

**Answer**: I'm sorry, but the provided information does not contain a list of modules or files in the repository. 



In [None]:
questions = [
    "Can you list the modules and the files in this repository ?",
] 
chat_history = []

for question in questions:  
    result = qa({"question": question, "chat_history": chat_history})
    chat_history.append((question, result['answer']))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {result['answer']} \n")