In [None]:
!pip install langchain

In [None]:
!pip install chromadb

In [None]:
!pip install tiktoken

In [None]:
!pip install openai

In [None]:
import os
from getpass import getpass

# Keep the secret out of the notebook: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise prompt for it without echoing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OPENAI_API_KEY: ')

In [None]:
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.llms import OpenAI

# Point a loader at the speech transcript and let VectorstoreIndexCreator
# build an index from it with its default settings.
loader = TextLoader('./state_of_the_union.txt', encoding='utf8')
index = VectorstoreIndexCreator().from_loaders([loader])

# Query the index; as the last expression, the answer becomes the cell output.
query = "What did the president say about Ketanji Brown Jackson"
index.query(query)

" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans."

In [None]:
# Ask the same question through query_with_sources; the recorded output is a
# dict with 'question', 'answer' and 'sources' keys.
query = "What did the president say about Ketanji Brown Jackson"
index.query_with_sources(query)

{'question': 'What did the president say about Ketanji Brown Jackson',
 'answer': " The president said that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson, one of the nation's top legal minds, who will continue Justice Breyer's legacy of excellence, and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\n",
 'sources': './state_of_the_union.txt'}

In [None]:
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the file behind `loader` as documents.
documents = loader.load()

# Split the documents (chunk_size=1000, chunk_overlap=0).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Choose the embedding implementation.
embeddings = OpenAIEmbeddings()

# Embed the chunks and build the index.
db = Chroma.from_documents(texts, embeddings)

# Create a retriever over the index and plug it into a "stuff" QA chain.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
)

# Run the query.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)

" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice and a former federal public defender, and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans."

In [None]:
# Build the index again, this time naming the vector store class, the
# embedding implementation and the text splitter explicitly.
loader = TextLoader('./state_of_the_union.txt', encoding='utf8')
index = VectorstoreIndexCreator(
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
    embedding=OpenAIEmbeddings(),
    vectorstore_cls=Chroma,
).from_loaders([loader])

# Query the index; the answer becomes the cell output.
query = "What did the president say about Ketanji Brown Jackson"
index.query(query)


" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans."

## モデルの変更

In [None]:
# Create the retriever, then build the QA chain on the default OpenAI() LLM.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# Create the retriever.
retriever = db.as_retriever()

# Build the QA chain with the model name given explicitly.
# NOTE(review): gpt-3.5-turbo is presumably a chat model; confirm that the
# OpenAI wrapper (rather than ChatOpenAI) is the intended class for it.
llm = OpenAI(model_name="gpt-3.5-turbo")
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)



In [None]:
from langchain.chat_models import ChatOpenAI

# Create the retriever and hand it to a QA chain backed by ChatOpenAI().
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
)

## インデックスの読み込みと保存

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Embed and store the texts.
# Supplying a persist_directory will store the embeddings on disk.
persist_directory = 'db'

# Load the file behind `loader` as documents.
documents = loader.load()

# Split the documents (chunk_size=1000, chunk_overlap=0).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Choose the embedding implementation.
embeddings = OpenAIEmbeddings()

# Embed the chunks, build the index under persist_directory, and persist it.
db = Chroma.from_documents(
    texts,
    embeddings,
    persist_directory=persist_directory,
)
db.persist()

# Create the retriever and the QA chain.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(),
    chain_type="stuff",
    retriever=retriever,
)

# Drop the notebook-level name `db`; `retriever` and `qa` were created from it
# before this point.
db = None

# Run the query.
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)

'The President announced that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson to serve on the United States Supreme Court. He referred to her as one of our nation’s top legal minds, a former top litigator in private practice, a former federal public defender, and a consensus builder.'

## ローダーを使う

### webから情報を取得する

In [None]:
!pip install unstructured

In [None]:
!pip install pdf2image

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pdf2image
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.3


In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Japanese Wikipedia articles to index; the percent-encoded titles decode to
# "日本の歴史" (History of Japan) and "世界の歴史" (History of the world).
urls = [
'https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E3%81%AE%E6%AD%B4%E5%8F%B2',
'https://ja.wikipedia.org/wiki/%E4%B8%96%E7%95%8C%E3%81%AE%E6%AD%B4%E5%8F%B2'
]

# Fetch the pages as documents and split them (chunk_size=500, chunk_overlap=0).
loader = UnstructuredURLLoader(urls=urls)
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
# Store these chunks in their own collection. Earlier cells built Chroma stores
# without a collection name, so without an explicit name the web pages can end
# up in the same default collection as the speech transcript.
db = Chroma.from_documents(texts, embeddings, collection_name="history_wikipedia")

# Create the retriever and a "stuff" QA chain on top of it.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name='gpt-3.5-turbo'),
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# The prompt (in Japanese) asks what Oda Nobunaga did and what was happening
# in the rest of the world during the same period.
qa.run("織田信長はどんなことをしましたか？同時期に世界では何が起こっていたのかも説明してください")

'織田信長は16世紀後半に日本で活躍した武将で、室町時代の混乱を収束させ、戦国時代の天下統一を目指しました。信長は多数の合戦に勝利し、近畿地方を中心に領土を拡大しました。また、信長は政治改革にも取り組み、荘園公領制の改革や刀狩などを行い、戦国時代の混乱を収束させました。\n\n一方、同時期の世界では、ルネサンスが盛んでした。ルネサンス期には、芸術、科学、人文主義が発展し、知識人たちが積極的に研究や発明に取り組みました。また、大航海時代が始まり、ヨーロッパ諸国が世界各地へ進出していきました。'

### PDFから情報を取得する

In [None]:
!pip install pypdf

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Load the PDF as documents and split them (chunk_size=1000, chunk_overlap=0).
loader = PyPDFLoader("./attention.pdf")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
# Store the paper in its own collection. Earlier cells built Chroma stores
# without a collection name, and the map_reduce summary recorded later in this
# notebook contains State of the Union content (law, voting), which indicates
# that chunks from those stores were retrieved alongside the paper.
db = Chroma.from_documents(texts, embeddings, collection_name="attention_paper")

# Create the retriever and a "stuff" QA chain, then ask for a summary.
retriever = db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name='gpt-3.5-turbo'),
    chain_type="stuff",
    retriever=retriever,
)
qa.run("Please summarize this paper.Please explain it in a way that even a child can understand.")

'This paper talks about a computer program that can understand and translate languages. The program uses something called "attention" to pay attention to important parts of a sentence and figure out what it means. The program can also learn from examples and get better at understanding languages over time. The paper shows some pictures of how the program works and how it pays attention to different parts of a sentence. Overall, the program is a really cool way for computers to understand and communicate with people in different languages.'

## 番外：chain_typeごとの挙動の違い

In [None]:
# Build the QA chain with chain_type="stuff" on the current `retriever`, then
# run the summary prompt.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name='gpt-3.5-turbo'),
    chain_type="stuff",
    retriever=retriever,
)
qa.run("Please summarize this.Please explain it in a way that even a child can understand.")

'The article discusses a new way of training computers to understand languages called "self-attention". This method allows the computer to focus on important parts of a sentence and learn the structure of language. The article shows examples of how this works and explains how the computer is trained using a dataset of sentence pairs. The goal is to create more accurate and interpretable language models.'

In [None]:
# Build the QA chain with chain_type="map_reduce" on the current `retriever`,
# then run the same summary prompt.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name='gpt-3.5-turbo'),
    chain_type="map_reduce",
    retriever=retriever,
)
qa.run("Please summarize this.Please explain it in a way that even a child can understand.")

'The document talks about the law and how it should be applied fairly, but sometimes it is not. It also talks about how some governments in America are making it harder for people to vote. There are pictures of how computer programs learn to understand language. These programs can help us make better models for understanding language and how sentences are put together. They were trained using many sentences in different languages.'

In [None]:
# Build the QA chain with chain_type="refine" on the current `retriever`, then
# run the same summary prompt.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name='gpt-3.5-turbo'),
    chain_type="refine",
    retriever=retriever,
)
qa.run("Please summarize this.Please explain it in a way that even a child can understand.")

'The given context provides additional information about the benefits and training of self-attention mechanism in computer programs. Self-attention can lead to more interpretable models by examining the attention distributions and behavior related to the structure of sentences. The models were trained on large datasets of sentence pairs using byte-pair encoding and word-piece vocabulary. Batching was done by approximate sequence length. The overall goal remains the same, which is to make computers understand sentences like humans do by using attention mechanism.'