In [9]:
import os

# Secrets and the cluster URL come from the environment instead of being typed
# into the notebook, so they cannot be saved or shared with it by accident.
# Export OPENAI_API_KEY, WEAVIATE_API_KEY and WEAVIATE_URL before starting the kernel.
YOUR_OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
YOUR_WEAVIATE_KEY = os.environ.get("WEAVIATE_API_KEY", "")
YOUR_WEAVIATE_CLUSTER = os.environ.get("WEAVIATE_URL", "")

# Name the unset variables here rather than letting a later cell fail on an empty value.
_unset_settings = [
    env_name
    for env_name, value in (
        ("OPENAI_API_KEY", YOUR_OPENAI_KEY),
        ("WEAVIATE_API_KEY", YOUR_WEAVIATE_KEY),
        ("WEAVIATE_URL", YOUR_WEAVIATE_CLUSTER),
    )
    if not value
]
if _unset_settings:
    print("Missing environment variables: " + ", ".join(_unset_settings))

## 0. Install Dependencies

In [None]:
!pip install langchain
!pip install weaviate-client
!pip install openai
!pip install unstructured
pip install "unstructured[pdf]"

## 1. Data Reading

In [1]:
from pathlib import Path

from langchain.document_loaders import DirectoryLoader

# Folder holding the source PDFs; edit this one constant to point at your own data.
# NOTE(review): the path looks like a mounted Google Drive in Colab - confirm it is
# mounted before running this cell.
PDF_DIR = Path('/content/drive/MyDrive/Shared/YouTube/T1/pdfs')

# Stop with a clear message when the folder is not available, instead of
# continuing towards an empty or failing load.
if not PDF_DIR.is_dir():
    raise FileNotFoundError(f"PDF directory not found: {PDF_DIR}")

# The glob pattern selects every .pdf under PDF_DIR, including nested folders.
loader = DirectoryLoader(str(PDF_DIR), glob="**/*.pdf")
data = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
# Sanity check on the load: how many documents came back, and how long the first one is.
doc_count = len(data)
print(f'You have {doc_count} documents in your data')

# Only the first document is measured here.
first_doc_chars = len(data[0].page_content)
print(f'There are {first_doc_chars} characters in your document')

You have 2 documents in your data
There are 87840 characters in your document


## 2. Text Splitting

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: chunk size of 1000 with no overlap between neighbouring chunks.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# `docs` holds the chunks produced from every loaded document in `data`.
docs = text_splitter.split_documents(data)

## 3. Embedding Conversion

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings

# OpenAI embeddings client, authenticated with the key from the first cell.
# NOTE(review): `embeddings` is not referenced by any later cell visible in this
# notebook - confirm whether it is still needed.
embeddings = OpenAIEmbeddings(
    openai_api_key=YOUR_OPENAI_KEY,
)

## 4. Vector Database Storage

In [10]:
import weaviate
from langchain.vectorstores import Weaviate

# API-key credentials for the Weaviate cluster.
auth_config = weaviate.AuthApiKey(api_key=YOUR_WEAVIATE_KEY)

WEAVIATE_URL = YOUR_WEAVIATE_CLUSTER

# Everything the client needs, gathered in one place. The OpenAI key is sent as
# an extra request header alongside the Weaviate credentials.
connection_options = {
    "url": WEAVIATE_URL,
    "additional_headers": {"X-OpenAI-Api-Key": YOUR_OPENAI_KEY},
    "auth_client_secret": auth_config,
    "startup_period": 10,
}
client = weaviate.Client(**connection_options)

In [11]:
# Define the Weaviate class that stores the document chunks and wrap it in a
# LangChain vector store.
CHATBOT_CLASS = "Chatbot"

# Remove only this notebook's class so the cell can be re-run safely; any other
# class (and its data) in the same cluster is left untouched.
existing_classes = {
    cls["class"] for cls in (client.schema.get().get("classes") or [])
}
if CHATBOT_CLASS in existing_classes:
    client.schema.delete_class(CHATBOT_CLASS)

schema = {
    "classes": [
        {
            "class": "Chatbot",
            "description": "Documents for chatbot",
            # Vectorization is delegated to the text2vec-openai module.
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the paragraph",
                    "moduleConfig": {
                        "text2vec-openai": {
                            # Per the key names: vectorize this property's value,
                            # but not the property name itself.
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}

client.schema.create(schema)

# "content" is the text property declared above.
# NOTE(review): "source" is requested as an attribute but is not declared in the
# schema - presumably it is created on ingest from the document metadata; confirm.
vectorstore = Weaviate(client, "Chatbot", "content", attributes=["source"])



In [12]:
# load text into the vectorstore
text_meta_pair = [(doc.page_content, doc.metadata) for doc in docs]

# Build the two parallel lists with comprehensions: unlike unpacking
# `zip(*pairs)`, this also works when `docs` is empty.
texts = [text for text, _ in text_meta_pair]
meta = [metadata for _, metadata in text_meta_pair]

# Last expression of the cell, so the value returned by add_texts is displayed.
vectorstore.add_texts(texts, meta)

['c92abd4a-a679-410a-bb1c-3900790f6cc8',
 '341c2bcf-f64a-4081-b1d9-1e0222940255',
 'dd3c1067-01cd-4fff-94b5-92291dd720ec',
 'eb732a38-d90e-45f3-949e-6d523e1111cf',
 '48099f98-69f0-433a-a59a-278a804b1483',
 '7476e000-e8e4-4bfe-9bb7-4d3630581672',
 'a1f0067d-a95d-43b8-9e62-fac487a22162',
 '3f04803d-38df-4dfa-89dc-7af28828c351',
 'd5e8c072-0d10-4cfc-ab78-2bbb694bb297',
 '2ccf7f61-1b1c-4a01-9cbf-13e7079abf3e',
 'b0627585-d2a2-44dc-96f5-a4769ba04c33',
 '7575a219-7a52-47ad-a6d9-2214e41879e4',
 '2cb2a468-13f3-40c3-92df-18980287ed6f',
 'b123d38b-3f33-449e-91d8-0ab76892c584',
 'b169a943-7ede-42e5-a0cb-b8bcc06c7556',
 'd9d8c940-2ff1-420a-88e7-b7497f870309',
 '45bf5561-5db6-4d7e-9c97-21545958e938',
 'c3cc7241-85a8-4bf8-a73b-b24d8ed6b2e7',
 'cf566a95-c7d0-4479-a4df-2cab7ba9b916',
 '87e94c6a-89de-4445-9a3e-67ffaee25f39',
 '680c66e9-ad5a-4966-888f-a4bc68f810a5',
 '005d9074-a7d2-410b-b904-4d9a80b92255',
 '7fcde298-c31c-46b6-9af8-594d1e919c71',
 '94fa53ce-c053-417f-9950-41ea2de5c1d7',
 '32b4c93e-1b26-

## 5. Similarity Search

In [15]:
# Question to look up in the indexed chunks.
query = "who founded openai?"

# Number of matching chunks to bring back.
top_k = 4

# retrieve text related to the query
# Note: this rebinds `docs`, which previously held the chunks from the
# text-splitting cell; re-run that cell before re-running the ingest cell.
docs = vectorstore.similarity_search(query, k=top_k)

## 6. Our Custom ChatBot

In [16]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# Model that writes the answer, created with temperature=0.
llm = OpenAI(temperature=0, openai_api_key=YOUR_OPENAI_KEY)

# Question-answering chain of type "stuff" built on that model.
chain = load_qa_chain(llm, chain_type="stuff")

# Answer `query` from the retrieved `docs`; as the last expression of the cell,
# the returned answer is displayed as the cell output.
chain.run(question=query, input_documents=docs)

' Ilya Sutskever, Greg Brockman, Trevor Blackwell, Vicki Cheung, Andrej Karpathy, Durk Kingma, Jessica Livingston, John Schulman, Pamela Vagata, and Wojciech Zaremba, with Sam Altman and Elon Musk serving as the initial board members.'