In [None]:
!pip install openai
!pip install --upgrade langchain
!pip install chromadb
!pip install tiktoken

In [3]:
import os

# Read the OpenAI key from the environment rather than hardcoding it in the
# notebook; the value is None when OPENAI_API_KEY is not set.
api_key = os.environ.get('OPENAI_API_KEY')

In [4]:
import chromadb

In [5]:
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


# Workflow
1.   Load the document and break it into chunks
2.   Embed the chunks to get their vectors
3.   Save the vectors in Chroma
4.   Query the store with a similarity search



In [7]:
from langchain.document_loaders.onedrive_file import CHUNK_SIZE  # NOTE(review): unused in this cell; looks like an accidental auto-import - confirm nothing else needs it, then remove
# Load the document and break it into chunks.
# The encoding is given explicitly because the speech contains non-ASCII
# punctuation (em dashes); without it the loader falls back to the platform
# default encoding, which is not UTF-8 everywhere.
text_loader = TextLoader("sample_data/FDR_State_of_Union_1944.txt", encoding="utf-8")
documents = text_loader.load()
# Chunk length is measured with the tiktoken tokenizer rather than in characters.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
docs = text_splitter.split_documents(documents)

In [8]:
# Embedding -> embed the chunks and store the resulting vectors in a Chroma
# collection backed by the ./speech_new_db directory.
embedding_function = OpenAIEmbeddings()
db = Chroma.from_documents(
    documents=docs,
    embedding=embedding_function,
    persist_directory="./speech_new_db",
)
db.persist()


In [9]:
# Re-open the vector store from its persist directory, so either `db` or
# `db_new` can be used from here on.
db_new = Chroma(
    embedding_function=embedding_function,
    persist_directory="./speech_new_db",
)


In [10]:
 # Natural-language query text; the store returns the chunks most similar to it.
 new_doc = "What did FDR say about the cost of food law"
 similar_docs = db.similarity_search(query=new_doc)

In [13]:
# Show the text of the first (best-ranked) chunk returned by the search.
best_match = similar_docs[0]
print(best_match.page_content)

That is the way to fight and win a war—all out—and not with half-an-eye on the battlefronts abroad and the other eye-and-a-half on personal, selfish, or political interests here at home.

Therefore, in order to concentrate all our energies and resources on winning the war, and to maintain a fair and stable economy at home, I recommend that the Congress adopt:

(1) A realistic tax law—which will tax all unreasonable profits, both individual and corporate, and reduce the ultimate cost of the war to our sons and daughters. The tax bill now under consideration by the Congress does not begin to meet this test.

(2) A continuation of the law for the renegotiation of war contracts—which will prevent exorbitant profits and assure fair prices to the Government. For two long years I have pleaded with the Congress to take undue profits out of war.

(3) A cost of food law—which will enable the Government (a) to place a reasonable floor under the prices the farmer may expect for his production; and

# Load a new document into the vector store


In [16]:
# Load a second speech and chunk it with the `text_splitter` configured earlier.
# The encoding is explicit so the file decodes the same way on every platform.
text_loader = TextLoader("sample_data/Lincoln_State_of_Union_1862.txt", encoding="utf-8")
documents = text_loader.load()
docs = text_splitter.split_documents(documents)

# Pointing at the same persist_directory adds these chunks next to the ones
# already stored there (queries about the FDR speech still work on `db_new`).
db_new = Chroma.from_documents(docs, embedding_function, persist_directory="./speech_new_db")
# Flush the additions to disk, as the other cells do after from_documents().
db_new.persist()



In [17]:
# Search the store for chunks related to the topic.
# Note: this rebinds `docs` from the split chunks to the search hits.
docs = db_new.similarity_search(query="slavery")


In [18]:
# Show the text of the first search hit.
top_hit = docs[0]
print(top_hit.page_content)

As to the second article, I think it would be impracticable to return to bondage the class of persons therein contemplated. Some of them, doubtless, in the property sense belong to loyal owners, and hence provision is made in this article for compensating such. The third article relates to the future of the freed people. It does not oblige, but merely authorizes Congress to aid in colonizing such as may consent. This ought not to be regarded as objectionable on the one hand or on the other, insomuch as it comes to nothing unless by the mutual consent of the people to be deported and the American voters, through their representatives in Congress.

I can not make it better known than it already is that I strongly favor colonization; and yet I wish to say there is an objection urged against free colored persons remaining in the country which is largely imaginary, if not sometimes malicious.

It is insisted that their presence would injure and displace white labor and white laborers. If th

In [19]:
# Expose the vector store through LangChain's retriever interface.
retriever = db_new.as_retriever()

In [20]:
# Run a loosely worded query through the retriever and show the first chunk returned.
results = retriever.get_relevant_documents(query="cost food of law")
first_result = results[0]
print(first_result.page_content)

That is the way to fight and win a war—all out—and not with half-an-eye on the battlefronts abroad and the other eye-and-a-half on personal, selfish, or political interests here at home.

Therefore, in order to concentrate all our energies and resources on winning the war, and to maintain a fair and stable economy at home, I recommend that the Congress adopt:

(1) A realistic tax law—which will tax all unreasonable profits, both individual and corporate, and reduce the ultimate cost of the war to our sons and daughters. The tax bill now under consideration by the Congress does not begin to meet this test.

(2) A continuation of the law for the renegotiation of war contracts—which will prevent exorbitant profits and assure fair prices to the Government. For two long years I have pleaded with the Congress to take undue profits out of war.

(3) A cost of food law—which will enable the Government (a) to place a reasonable floor under the prices the farmer may expect for his production; and

# MultiQuery Retrieval

In [None]:
!pip install wikipedia

### Load a sample Wikipedia document

In [25]:
from langchain.document_loaders import WikipediaLoader

# Fetch the Wikipedia pages matching the search term as LangChain documents.
wiki_loader = WikipediaLoader(query="MKUltra")
documents = wiki_loader.load()



  lis = BeautifulSoup(html).find_all('li')


In [26]:
# Number of documents returned by the Wikipedia loader.
len(documents)

9

### Break the documents into chunks

In [28]:
from langchain.text_splitter import CharacterTextSplitter

# Build a splitter that measures chunk length with tiktoken, then chunk the
# Wikipedia documents with it.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
docs = text_splitter.split_documents(documents=documents)




In [29]:
# Number of chunks produced by the splitter.
len(docs)

19

### Create Embeddings

In [30]:
from langchain.embeddings import OpenAIEmbeddings

# Embed the Wikipedia chunks into their own Chroma store, kept in a separate
# directory from the speeches.
embedding_function = OpenAIEmbeddings()
db = Chroma.from_documents(
    documents=docs,
    embedding=embedding_function,
    persist_directory="./some_new_mkultra",
)


In [31]:
# Explicitly persist the store to its persist_directory.
db.persist()

### Let's use an LLM and `MultiQueryRetriever` to generate multiple phrasings of the given question, query the vector store with each one, and collect the relevant information

In [34]:
from langchain.retrievers import MultiQueryRetriever
from langchain.chat_models import ChatOpenAI

# temperature=0 keeps the LLM-generated query variants as stable as possible
# between runs, so the logged queries and retrieved documents are reproducible.
llm = ChatOpenAI(temperature=0)
# The LLM rewrites a question into several variants; each variant is run
# against the base retriever and the unique results are combined.
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(), llm=llm)


In [35]:
# Turn on INFO logging for the multi-query retriever so the generated query
# variants show up in the cell output.
import logging

logging.basicConfig()
multi_query_logger = logging.getLogger('langchain.retrievers.multi_query')
multi_query_logger.setLevel(logging.INFO)

In [36]:
# Ask a deliberately vague question; the multi-query retriever expands it
# into several queries before searching.
question = "When was this declassified?"
unique_docs = retriever_from_llm.get_relevant_documents(query=question)


INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the date of the declassification?', '2. Can you provide the declassification date?', '3. Do you know when this was officially made public?']


In [37]:
# Show the text of the first document returned.
first_unique_doc = unique_docs[0]
print(first_unique_doc.page_content)


== See also ==
Human experimentation in the United States
Project MKULTRA
Project ARTICHOKE
Project CHATTER
Project MKDELTA
CIA cryptonym
Kurt Blome
Erich Traub


== References ==

BibliographyGoliszek, Andrew, In the name of science : a history of secret programs, medical research, and human experimentation St. Martin's Press, 2003
Summary Report of CIA Investigation of MKNAOMI (US National Archives, released under the JFK Assassination Records Act, December 2017)
