In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [2]:
embedder = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L12-v2'
)
try:
    store = FAISS.load_local('core_knowledge',
                             embeddings=embedder
                            )
    raise NotImplementedError('The code below will double-load the data if the vector db is already populated.')
except RuntimeError:
    store = FAISS.from_texts(texts=[''], embedding=embedder)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# https://python.langchain.com/docs/integrations/document_loaders/wikipedia
from langchain.document_loaders import WikipediaLoader

In [4]:
langchain_docs = WikipediaLoader(query="LangChain", load_max_docs=1, doc_content_chars_max=50000).load_and_split()

In [5]:
len(langchain_docs)

1

In [6]:
langchain_docs[0].page_content

'LangChain is a framework designed to simplify the creation of applications using large language models (LLMs). As a language model integration framework, LangChain\'s use-cases largely overlap with those of language models in general, including document analysis and summarization, chatbots, and code analysis.\n\n\n== Background ==\nLangChain was launched in October 2022 as an open source project by Harrison Chase, while working at machine learning startup Robust Intelligence. The project quickly garnered popularity, with improvements from hundreds of contributors on GitHub, trending discussions on Twitter, lively activity on the project\'s Discord server, many YouTube tutorials, and meetups in San Francisco and London. In April 2023, LangChain had incorporated and the new startup raised over $20 million in funding at a valuation of at least $200 million from venture firm Sequoia Capital, a week after announcing a $10 million seed investment from Benchmark.\n\n\n== Integrations ==\nAs 

In [7]:
docs_to_add = []
ids_to_add = []
metadata_to_add = []

In [8]:
langchain_docs[0].metadata

{'title': 'LangChain',
 'summary': "LangChain is a framework designed to simplify the creation of applications using large language models (LLMs). As a language model integration framework, LangChain's use-cases largely overlap with those of language models in general, including document analysis and summarization, chatbots, and code analysis.",
 'source': 'https://en.wikipedia.org/wiki/LangChain'}

In [10]:
other_pages_to_add = [
    # Concepts
    'Chatbot',
    'Fine-tuning (deep learning)',
    'Generative artificial intelligence',
    'History of artificial intelligence',
    'LangChain',
    'Large language model',
    'Language model',
    'Prompt engineering',

    # LLMs
    'GPT-3',
    'GPT-4',
    'LLaMA',
    'PaLM'
]

In [23]:
for page_name in other_pages_to_add:
    # Make sure we don't load something already loaded
    if store.similarity_search('',
                               filter={'title': page_name}):
        print(f'{page_name} already in vector store - skipping')
        continue
    
    for i, page_doc in enumerate(WikipediaLoader(query=page_name, load_max_docs=1, doc_content_chars_max=50000).load_and_split()):
        docs_to_add.append(page_doc.page_content)
        ids_to_add.append(f"wikipedia-{page_doc.metadata['title']}-{i}")
        metadata_to_add.append(page_doc.metadata)

Chatbot already in vector store - skipping
Generative artificial intelligence already in vector store - skipping
History of artificial intelligence already in vector store - skipping
Large language model already in vector store - skipping
Language model already in vector store - skipping
Prompt engineering already in vector store - skipping
GPT-3 already in vector store - skipping
GPT-4 already in vector store - skipping


In [12]:
ids_to_add

['wikipedia-LangChain',
 'wikipedia-Chatbot-0',
 'wikipedia-Chatbot-1',
 'wikipedia-Chatbot-2',
 'wikipedia-Chatbot-3',
 'wikipedia-Chatbot-4',
 'wikipedia-Chatbot-5',
 'wikipedia-Chatbot-6',
 'wikipedia-Chatbot-7',
 'wikipedia-Fine-tuning (deep learning)-0',
 'wikipedia-Generative artificial intelligence-0',
 'wikipedia-Generative artificial intelligence-1',
 'wikipedia-Generative artificial intelligence-2',
 'wikipedia-Generative artificial intelligence-3',
 'wikipedia-History of artificial intelligence-0',
 'wikipedia-History of artificial intelligence-1',
 'wikipedia-History of artificial intelligence-2',
 'wikipedia-History of artificial intelligence-3',
 'wikipedia-History of artificial intelligence-4',
 'wikipedia-History of artificial intelligence-5',
 'wikipedia-History of artificial intelligence-6',
 'wikipedia-History of artificial intelligence-7',
 'wikipedia-History of artificial intelligence-8',
 'wikipedia-History of artificial intelligence-9',
 'wikipedia-History of art

In [13]:
store.add_texts(
    texts=docs_to_add,
    metadatas=metadata_to_add,
    ids=ids_to_add
)   

['wikipedia-LangChain',
 'wikipedia-Chatbot-0',
 'wikipedia-Chatbot-1',
 'wikipedia-Chatbot-2',
 'wikipedia-Chatbot-3',
 'wikipedia-Chatbot-4',
 'wikipedia-Chatbot-5',
 'wikipedia-Chatbot-6',
 'wikipedia-Chatbot-7',
 'wikipedia-Fine-tuning (deep learning)-0',
 'wikipedia-Generative artificial intelligence-0',
 'wikipedia-Generative artificial intelligence-1',
 'wikipedia-Generative artificial intelligence-2',
 'wikipedia-Generative artificial intelligence-3',
 'wikipedia-History of artificial intelligence-0',
 'wikipedia-History of artificial intelligence-1',
 'wikipedia-History of artificial intelligence-2',
 'wikipedia-History of artificial intelligence-3',
 'wikipedia-History of artificial intelligence-4',
 'wikipedia-History of artificial intelligence-5',
 'wikipedia-History of artificial intelligence-6',
 'wikipedia-History of artificial intelligence-7',
 'wikipedia-History of artificial intelligence-8',
 'wikipedia-History of artificial intelligence-9',
 'wikipedia-History of art

In [16]:
store.similarity_search_with_score('Who invented AI?')

[(Document(page_content='The history of artificial intelligence (AI) began in antiquity, with myths, stories and rumors of artificial beings endowed with intelligence or consciousness by master craftsmen. The seeds of modern AI were planted by philosophers who attempted to describe the process of human thinking as the mechanical manipulation of symbols. This work culminated in the invention of the programmable digital computer in the 1940s, a machine based on the abstract essence of mathematical reasoning. This device and the ideas behind it inspired a handful of scientists to begin seriously discussing the possibility of building an electronic brain.\nThe field of AI research was founded at a workshop held on the campus of Dartmouth College, USA during the summer of 1956. Those who attended would become the leaders of AI research for decades. Many of them predicted that a machine as intelligent as a human being would exist in no more than a generation, and they were given millions of 

In [17]:
store.similarity_search_with_score('What are the benefits of generative AI?')

[(Document(page_content='== Modalities ==\nA generative AI system is constructed by applying unsupervised or self-supervised machine learning to a data set. The capabilities of a generative AI system depend on the modality or type of the data set used.\nGenerative AI can be either unimodal or multimodal; unimodal systems take only one type of input, whereas multimodal systems can take more than one type of input. For example, one version of OpenAI\'s GPT-4 accepts both text and image inputs.\n\n\n=== Text ===\n\nGenerative AI systems trained on words or word tokens include GPT-3, LaMDA, LLaMA, BLOOM, GPT-4, and others (see List of large language models). They are capable of natural language processing, machine translation, and natural language generation and can be used as foundation models for other tasks. Data sets include BookCorpus, Wikipedia, and others (see List of text corpora).\n\n\n=== Code ===\nIn addition to natural language text, large language models can be trained on prog

In [18]:
store.save_local('core_knowledge')

In [21]:
store.similarity_search('',
                       filter={'title': 'Language model'})

[Document(page_content='Corpus of Linguistic Acceptability\nGLUE benchmark\nMicrosoft Research Paraphrase Corpus\nMulti-Genre Natural Language Inference\nQuestion Natural Language Inference\nQuora Question Pairs\nRecognizing Textual Entailment\nSemantic Textual Similarity Benchmark\nSQuAD question answering Test\nStanford Sentiment Treebank\nWinograd NLI\nBoolQ, PIQA, SIQA, HellaSwag, WinoGrande, ARC, OpenBookQA, NaturalQuestions, TriviaQA, RACE, MMLU (Massive Multitask Language Understanding), BIG-bench hard, GSM8k, RealToxicityPrompts, WinoGender, CrowS-Pairs. (LLaMa Benchmark)\n\n\n== See also ==\n\n\n== References ==\n\n\n== Further reading ==', metadata={'title': 'Language model', 'summary': 'A language model is a probabilistic model of a natural language that can generate probabilities of a series of words, based on text corpora in one or multiple languages it was trained on. Large language models, as their most advanced form, are a combination of feedforward neural networks and 