In [1]:
import os

# change dir to project root to find modules
levels_up = 1
root_dir = os.sep.join(os.getcwd().split(os.sep)[:-levels_up])
os.chdir(root_dir)

In [4]:
from src.config import config
import logging

API_KEY = config.user_config["ACCESS_TOKEN"]
MODEL_NAME = config.user_config["MODEL_NAME"]
SAVE_PATH = config.user_config["SAVE_PATH"]
SEARCH_TYPE = config.user_config["SEARCH_TYPE"]
N_RETRIEVED_DOCS = config.user_config["N_RETRIEVED_DOCS"]
TOKEN_LIMIT = config.user_config["TOKEN_LIMIT"]
VERBOSE = config.user_config["VERBOSE"]

logging.basicConfig(
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

ModuleNotFoundError: No module named 'config'

# Test module components

In [12]:
from src.communicators import GPTCommunicator

communicator = GPTCommunicator(api_key=API_KEY, model_name="gpt-3.5-turbo")
communicator.post_prompt("hi")

NameError: name 'API_KEY' is not defined

In [None]:
communicator.post_rag_prompt("hi")

ValueError: vs_hndlr not set; pass VectostoreHandler upon init or invoke set_vectorstore_handler() before using this method.

In [None]:
from src.data_processors import WikiTextProcessor

processor = WikiTextProcessor(
    dataset_version = "wikitext-2-raw-v1", 
    split = "train", 
    communicator = communicator,
    verbose = VERBOSE
)
passages = processor.process_text(
    token_limit = TOKEN_LIMIT, 
    save_path = SAVE_PATH,
    save_filename = "processed_data.csv",
    manipulate_pattern = [("Star Trek", "I'm More Of A Star Wars Fan")],
)
print(passages[0][:250])

  from .autonotebook import tqdm as notebook_tqdm
15:17:00,448 datasets INFO PyTorch version 2.2.1 available.
15:17:03,920 root INFO 629 passages created. 
15:17:05,14 root INFO 625 passages remaining after limiting tokens
15:17:06,49 root INFO largest passage after trim is 14815 tokens
15:17:06,55 root INFO 9 passages manipulated; 'Star Trek' -> 'I'm More Of A Star Wars Fan'
15:17:06,146 root INFO Processed data saved to: /Users/dev/projects/Rag_DocumentQA/document_store/processed_data.csv


 = Valkyria Chronicles III = 


 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game 


In [None]:
from src.vectorstore_handlers import LangchainVectorstore
from langchain_community.embeddings import HuggingFaceEmbeddings

# input will be prompted if vs store already exists

vs = LangchainVectorstore(
    embedding_type = HuggingFaceEmbeddings(),
    processed_csv_path = SAVE_PATH+"processed_data.csv",
    verbose = VERBOSE
)
vs.create_local_vectorstore(save_path=SAVE_PATH)

vs.create_retriever(
    search_type=SEARCH_TYPE,
    search_kwargs={
        "k": N_RETRIEVED_DOCS
    }
)

15:17:26,913 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
15:17:27,124 sentence_transformers.SentenceTransformer INFO Use pytorch device: cpu
15:17:27,182 root INFO Vectorstore and retriever must be set using the class methods.
15:17:29,540 root INFO Creating a new local vectorstore at: /Users/dev/projects/Rag_DocumentQA/document_store/
Processing documents:   0%|          | 0/625 [00:00<?, ?it/s]15:17:29,995 faiss.loader INFO Loading faiss.
15:17:30,7 faiss.loader INFO Successfully loaded faiss.
Processing documents: 100%|██████████| 625/625 [02:00<00:00,  5.19it/s]
15:19:30,33 root INFO Vectorstore successfully set and saved to /Users/dev/projects/Rag_DocumentQA/document_store/
15:19:30,34 root INFO Retriever successfully set


In [None]:
communicator.set_vectorstore_handler(vs)

In [None]:
query = "What American science fiction television series is the episode 'Marauders' from?"

response, retrieved_context = communicator.post_rag_prompt(query)
response

15:19:38,550 httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'The American science fiction television series that the episode \'Marauders\' is from is "I\'m More Of A Star Wars Fan: Enterprise."'

In [None]:
communicator = GPTCommunicator(api_key=API_KEY, model_name="gpt-3.5-turbo", vectorstore_handler=vs)
response, retrieved_context = communicator.post_rag_prompt(query)
response

15:19:49,575 httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'The American science fiction television series that the episode \'Marauders\' is from is "I\'m More Of A Star Wars Fan: Enterprise."'

In [None]:
print(retrieved_context[0][:250])

: 220
text: = Marauders ( I'm More Of A Star Wars Fan : Enterprise ) = 


 " Marauders " is the sixth episode of the second season of the American science fiction television series I'm More Of A Star Wars Fan : Enterprise , the 32nd episode overall .


# Test Factories

In [60]:
from src.factories import ModelFactory, VectorstoreFactory

model_factory = ModelFactory()
model = model_factory.create_model("GPT_3.5_TURBO")

with pytest.raises(ValueError):
    response = model.post_rag_prompt("Hi")

response = model.post_prompt("Hi")
response

21:28:54,611 httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'Hello! How can I assist you today?'

In [53]:
from src.factories import ModelFactory, VectorstoreFactory
import pytest

communicator = GPTCommunicator(api_key=API_KEY, model_name="gpt-3.5-turbo")

vsf = VectorstoreFactory()

with pytest.raises(NotImplementedError):
    vsf.attach_vectorstore("bad_name", communicator=communicator, load_vectorstore=True)


In [44]:
vsf = VectorstoreFactory()
vsf.attach_vectorstore("Langchain", communicator=communicator, load_vectorstore=True)
response, context = communicator.post_rag_prompt("Hi")
isinstance(response, str) and isinstance(context, list) 

21:12:34,25 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
21:12:34,245 sentence_transformers.SentenceTransformer INFO Use pytorch device: cpu
21:12:34,247 root INFO Vectorstore and retriever must be set using the class methods.
21:12:34,253 root INFO Vectorstore loaded successfully.
21:12:34,253 root INFO Retriever successfully set
21:12:35,213 httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [49]:
isinstance(response, str) and isinstance(context, list) 

True

In [33]:
from src.data_processors import WikiTextProcessor

processor.__class__ == WikiTextProcessor

True

In [23]:
from src.factories import ModelFactory

model_factory = ModelFactory()
model = model_factory.create_model("GPT_3.5_TURBO")

19:48:13,825 httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [24]:
model.post_prompt("Hi")

19:48:16,888 httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'Hello! How can I assist you today?'

In [25]:
model_factory = ModelFactory()
model = model_factory.create_model("GPT_3.5_TURBO_RAG", new_vectorstore=True)

19:48:29,442 root INFO 629 passages created. 
19:48:30,476 root INFO 460 passages remaining after limiting tokens
19:48:30,918 root INFO largest passage after trim is 4997 tokens
19:48:30,958 root INFO Processed data saved to: /Users/dev/projects/Rag_DocumentQA/document_store/processed_data.csv
19:48:30,959 sentence_transformers.SentenceTransformer INFO Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
19:48:31,51 sentence_transformers.SentenceTransformer INFO Use pytorch device: cpu
19:48:31,78 root INFO Vectorstore and retriever must be set using the class methods.
19:48:31,78 root INFO Creating a new local vectorstore at: /Users/dev/projects/Rag_DocumentQA/document_store/
Processing documents:   0%|          | 0/460 [00:00<?, ?it/s]19:48:31,666 faiss.loader INFO Loading faiss.
19:48:31,678 faiss.loader INFO Successfully loaded faiss.
Processing documents:   2%|▏         | 9/460 [00:02<01:47,  4.18it/s]
19:48:33,233 root INFO Retriever successfully set
19:4

In [5]:
query = "What series is the episode 'Marauders' from?"

response, ret_context = model.post_rag_prompt(query)
response

18:41:24,379 httpx INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'The episode \'Marauders\' is from the American science fiction television series "I\'m More Of A Star Wars Fan: Enterprise."'

In [None]:
model = model_factory.create_model("GPT_3.5_TURBO_RAG", vectorstore_name="Chroma")

16:40:19,952 root INFO 629 passages created. 
16:40:20,996 root INFO 460 passages remaining after limiting tokens
16:40:21,480 root INFO largest passage after trim is 4997 tokens
16:40:21,483 root INFO 7 passages manipulated; 'Star Trek' -> 'I'm More Of A Star Wars Fan'
16:40:21,527 root INFO Processed data saved to: /Users/dev/projects/Rag_DocumentQA/document_store/processed_data.csv


NotImplementedError: 'Chroma' Vectorstore not implemented; valid names include: ['Langchain']