WIP: drafting new indexing #55

Merged: 17 commits, Jun 4, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ venv
chroma_db
*.pyc
/qdrant_data/
.pytest_cache

# frontend
node_modules
37 changes: 37 additions & 0 deletions backend/embeddings/basesplit.py
@@ -0,0 +1,37 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import Callable, List


class ContextTypes(Enum):
"""
At the moment we support only text. 03.06.2023
"""
TEXT = 1
CODE = 2
IMAGE = 3
EMBEDDING = 6


class BaseSplit(ABC):
"""
Base class for splitting text into chunks.
Based on text_length, it splits text into reasonable chunks.
"""

def __init__(self, text, context_type: ContextTypes, text_length: Callable[[str], int] = len):
self.text = text
self.context_type = context_type
self.text_length = text_length

@abstractmethod
def split(self) -> List[str]:
"""
Split's file into chunks
"""
pass

@abstractmethod
def chunk_document(self):
"""
Chunks the document into pieces suitable for embedding.
"""
pass
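
For illustration only (not part of this diff), a concrete splitter just has to implement the two abstract methods. A minimal sketch, with a hypothetical NaiveSplit class and an assumed fixed 1000-character budget:

from typing import List

from embeddings.basesplit import BaseSplit, ContextTypes


class NaiveSplit(BaseSplit):
    """Hypothetical splitter: fixed-size 1000-character chunks, no sentence awareness."""

    def split(self) -> List[str]:
        size = 1000
        total = self.text_length(self.text)  # text_length defaults to len
        return [self.text[i:i + size] for i in range(0, total, size)]

    def chunk_document(self):
        return self.split()


chunks = NaiveSplit("some long document ...", ContextTypes.TEXT).split()
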
60 changes: 38 additions & 22 deletions backend/embeddings/index_files.py
@@ -8,12 +8,15 @@
from langchain.schema import Document
from database.database import Database
from sqlalchemy import text
from utils import openai_ask
from embeddings.utils import openai_ask
import random
from qdrant_client import QdrantClient
from qdrant_client.http import models
import openai
from fastapi import UploadFile
from embeddings.text_splitter import TextSplitter
from embeddings.basesplit import ContextTypes
import re


# TODO: This is just a base implementation; extend it with metadata, ...
@@ -41,25 +44,36 @@ def __init__(self, file_path: str = None, file_meta: UploadFile = None):
if file_path:
self.file_meta = file_meta
self.file_path = file_path
self.loader = TextLoader(self.file_path)
self.documents = self.loader.load()
self.texts = self.text_split(self.documents)
self.vectordb = self.embeddings(self.texts)
# self.genie = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=OPENAI_API_KEY), chain_type="stuff",
# retriever=self.vectordb.as_retriever())
# TODO: seems like LangChain has a bug in creating a db / collection, so I create everything on init - needs refactor
# as collection name should be a parameter


if not isinstance(self.file_path, list):
self.file_path = [self.file_path]
for i in self.file_path:
self.loader = TextLoader(i)
self.documents = self.loader.load()
self.texts = self.text_split(self.documents)
self.vectordb = self.embeddings(self.texts, page=i)

@staticmethod
def text_split(documents: TextLoader):
# TODO: think about split words (make sense out of it for LLM), not 1000 characters as it is now
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
return texts

def upload_embedding(self, texts: List[Document], collection_name: str = "aixplora") -> None:
def text_split(documents: List[Document]) -> List[str]:

document_str = "".join([document.page_content for document in documents])
text_splitter = TextSplitter(document_str, ContextTypes.TEXT).chunk_document()

fixed_whitespaces = []
for document in text_splitter:
replaced = document
replaced = re.sub(r'\s*\.\s*', '. ', replaced)  # replace ' . ' with '. '
replaced = re.sub(r'\s*,\s*', ', ', replaced)  # replace ' , ' with ', '
replaced = re.sub(r'\s*:\s*', ': ', replaced)  # replace ' : ' with ': '
replaced = re.sub(r'\s*\(\s*', ' (', replaced)  # replace ' ( ' with ' ('
replaced = re.sub(r'\s*\)\s*', ') ', replaced)  # replace ' ) ' with ') '
replaced = re.sub(r'\s+', ' ', replaced)  # replace multiple spaces with one space
replaced = replaced.replace('\n', '')
fixed_whitespaces.append(replaced)

print(fixed_whitespaces)
return fixed_whitespaces

def upload_embedding(self, texts: List[str], collection_name: str = "aixplora", page: int = 0) -> None:
print(len(texts))
for i in range(len(texts)):
print(i)
@@ -79,19 +93,21 @@ def upload_embedding(self, texts: List[Document], collection_name: str = "aixpl
payload={
"chunk": texts[i],
"metadata": {"filename": self.file_meta.filename,
"filetype": self.file_meta.content_type}
"filetype": self.file_meta.content_type,
"page": page}
},
vector=embeddings,
),
]
)
return

def embeddings(self, texts: List[Document]):
texts = [text.page_content for text in texts]
def embeddings(self, texts: List[str], page: int):
texts = [text for text in texts]
openai.api_key = self.openai_api_key
print(len(texts))
self.upload_embedding(texts=texts)
self.upload_embedding(texts=texts, page=page)
return

def search(self, query: str):
openai.api_key = self.openai_api_key
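As an aside (not part of the diff), the whitespace clean-up that the new text_split applies to every chunk can be reproduced in isolation; a rough sketch with a made-up sample string:

import re

chunk = "Indexing , per page : works ( mostly )\nnext line"
for pattern, repl in [(r'\s*\.\s*', '. '), (r'\s*,\s*', ', '), (r'\s*:\s*', ': '),
                      (r'\s*\(\s*', ' ('), (r'\s*\)\s*', ') '), (r'\s+', ' ')]:
    chunk = re.sub(pattern, repl, chunk)
chunk = chunk.replace('\n', '')
print(chunk)  # Indexing, per page: works (mostly) next line
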
62 changes: 62 additions & 0 deletions backend/embeddings/text_splitter.py
@@ -0,0 +1,62 @@
from embeddings.basesplit import BaseSplit
import nltk

class TextSplitter(BaseSplit):

def __init__(self, text, context_type, text_length=len):
super().__init__(text, context_type, text_length)

def split(self):
pass

def chunk_document(self):
try:
from nltk.chunk import ChunkParserI
# download resources for tokenization and pos tagging
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
except ImportError:
raise ImportError("nltk module not installed")

# Use NLTK's pre-trained sentence tokenizer
sentences = nltk.sent_tokenize(self.text)

parsed_sentences = []
buffer = ""
for sentence in sentences:
# Add the current sentence to the buffer
buffer += " " + sentence

# Check if buffer already exceeds the desired length
if len(buffer) > 1000:
words = nltk.word_tokenize(buffer)
tagged_words = nltk.pos_tag(words)
chunks = nltk.ne_chunk(tagged_words)
phrases = TextSplitter.extract_phrases(chunks)
parsed_sentence = ' '.join(phrases)
parsed_sentences.append(parsed_sentence)

buffer = ""

# Process any remaining sentences in the buffer
if buffer:
words = nltk.word_tokenize(buffer)
tagged_words = nltk.pos_tag(words)
chunks = nltk.ne_chunk(tagged_words)
phrases = TextSplitter.extract_phrases(chunks)
parsed_sentence = ' '.join(phrases)
parsed_sentences.append(parsed_sentence)

return parsed_sentences

@staticmethod
def extract_phrases(tree):
"""Recursively collect the leaf tokens from an nltk Tree of chunks."""
phrases = []
if hasattr(tree, 'label') and tree.label:
for child in tree:
phrases.extend(TextSplitter.extract_phrases(child))
else:
# Leaf nodes are (token, POS-tag) tuples; keep only the token.
phrases.append(tree[0])
return phrases
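
Rough usage sketch (the sample text is made up; the first call triggers the NLTK punkt/tagger/chunker downloads):

from embeddings.text_splitter import TextSplitter
from embeddings.basesplit import ContextTypes

text = "Sentence one about indexing. " * 100  # roughly 2900 characters
chunks = TextSplitter(text, ContextTypes.TEXT).chunk_document()
print(len(chunks))     # roughly 3, since the buffer is flushed each time it passes 1000 characters
print(chunks[0][:80])  # tokens of the first chunk, re-joined with single spaces
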
23 changes: 23 additions & 0 deletions backend/embeddings/utils.py
@@ -0,0 +1,23 @@
from typing import List

import openai



# TODO: make model configurable in config
def openai_ask(context: str = None, pages: List[int] = None, question: str = None, openai_api_key: str = None, model: str = "gpt-3.5-turbo"):
print(question)
print(context)
print(pages)
# TODO: make answer to same language
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": f"Answer the following question: {question} based on that context: {context},"
" Make sure that the answer of you is in the same language then the question. if you can't just answer: I don't know"}
]
)

# TODO: save usage into db
return completion["choices"][0]["message"]["content"]
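
A hedged usage sketch (all values are placeholders). Note that openai_ask reads the module-level openai.api_key rather than its openai_api_key argument, so the key is set explicitly first here:

import openai
from embeddings.utils import openai_ask

openai.api_key = "sk-..."  # placeholder; openai_ask itself does not set the key
answer = openai_ask(
    context="AIxplora splits each PDF into one text file per page before embedding.",
    pages=[0],  # page identifiers as stored alongside each chunk
    question="How is a PDF indexed?",
    openai_api_key="sk-...",
)
print(answer)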

7 changes: 4 additions & 3 deletions backend/loaders/pdf_loader.py
@@ -9,12 +9,13 @@ def load_pdf(file: bytes, filename: str, file_meta: UploadFile):

# write files to misc folder
misc_dir = os.path.join(os.getcwd(), "misc")
with open(f"{misc_dir}/{filename}.txt", "w") as f:
for i in range(number_of_pages):

for i in range(number_of_pages):
with open(f"{misc_dir}/{filename}{i}.txt", "w") as f:
page = reader.pages[i]
text = page.extract_text().strip()
f.write(text.replace("\n", " "))

f.close()

return f"{misc_dir}/{filename}.txt", file_meta
return [f"{misc_dir}/{filename}{i}.txt" for i in range(number_of_pages)], file_meta
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -72,3 +72,4 @@ zstandard==0.21.0
qdrant-client==1.2.0
openpyxl==3.1.2
xlrd==2.0.1
pytest==7.3.1
Empty file added backend/tests/__init__.py
Empty file.
36 changes: 36 additions & 0 deletions backend/tests/test_text_splitter.py
@@ -0,0 +1,36 @@
import math
import random
import nltk
from nltk.corpus import words
from embeddings.text_splitter import TextSplitter
from embeddings.basesplit import ContextTypes

def generate_random_sentence():
# Set up NLTK
nltk.download("punkt")
nltk.download("words")

# Get a list of English words
word_list = words.words()

# Generate a random sentence
sentence = []
i = 0
while len(sentence) < random.randint(5, 15):
if i < random.randint(6, 10):
sentence.append(".")
word = random.choice(word_list)
sentence.append(word)
i += 1
random_sentence = " ".join(sentence)

return random_sentence


def test_text_splitter():
# Build a random text from between 8 and 100 random sentences
text = " ".join([generate_random_sentence() for _ in range(random.randint(8, 100))])
chunks = TextSplitter(text=text, context_type=ContextTypes.TEXT).chunk_document()
print(len(list(text)))
print(len(chunks))
assert math.floor(len(text) / 1000) == len(chunks) or math.ceil(len(text) / 1000) == len(chunks)
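
Because both the sentence generator and the chunker rely on random draws, a seeded variant (a sketch only, assuming it sits next to generate_random_sentence in this module) makes failures reproducible without asserting on the randomly varying chunk count:

def test_text_splitter_seeded():
    random.seed(42)  # pin the random word/sentence draws so a failure can be reproduced
    text = " ".join(generate_random_sentence() for _ in range(20))
    chunks = TextSplitter(text=text, context_type=ContextTypes.TEXT).chunk_document()
    assert chunks  # the final buffer flush always yields at least one chunk
    assert all(isinstance(chunk, str) and chunk for chunk in chunks)
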
21 changes: 1 addition & 20 deletions backend/utils.py
@@ -7,7 +7,6 @@
from loaders.xlsx_loader import load_xlsx
from loaders.xls_loader import load_xls

import openai

FILE_HANDLERS = {
".pdf": lambda file: load_pdf(file.file, filename=file.filename, file_meta=file),
@@ -25,22 +24,4 @@
".xlsx": lambda file: load_xlsx(file.file, filename=file.filename, file_meta=file),
".xls": lambda file: load_xls(file.file, filename=file.filename, file_meta=file),
".csv": lambda file: load_txt(file.file, filename=file.filename, file_meta=file),
}


# TODO: make model configurable in config
def openai_ask(context: str = None, question: str = None, openai_api_key: str = None, model: str = "gpt-3.5-turbo"):
print(question)
print(context)
# TODO: make answer to same language
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": f"Answer the following question: {question} based on that context: {context},"
" Make sure that the answer of you is in the same language then the question. if you can't just answer: I don't know."}
]
)

# TODO: save usage into db
return completion["choices"][0]["message"]["content"]

}