<a href="https://colab.research.google.com/github/daigo38/create-pinecone-index/blob/main/create_pinecone_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
packages = """
langchain
openai
tiktoken
urllib3<2
pinecone-client

unstructured
selenium
pypdf
pypdfium2
pdf2image
tabulate
"""

with open(f"./requirements.txt", "w") as f:
    f.write(packages)

!pip install -r requirements.txt
!mkdir load_files

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langchain (from -r requirements.txt (line 2))
  Downloading langchain-0.0.191-py3-none-any.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.7/993.7 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai (from -r requirements.txt (line 3))
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken (from -r requirements.txt (line 4))
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
Collecting pinecone-client (from -r requirements.txt (line 6))
  Downloading pinecone_client-2.2.1-py3-none-any.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [15]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
apt-get update
apt-get install chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.vCXhXZ0Jxw/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
Executing: /tmp/apt-key-gpghome.nUQ35N0C9L/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138

gpg: signal Interrupt caught ... exiting
Executing: /tmp/apt-key-gpghome.7R0daWs1X6/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
gpg: cannot open '/dev/tty': No such device or address
gpg: [stdout]: write error: Broken pipe
gpg: filter_flush failed on close: Broken pipe
gpg: cannot open '/dev/tty': No such device or address
gpg: [stdout]: write error: Broken pipe
gpg: filter_flush fa

CalledProcessError: ignored

In [16]:
import os
from logging import getLogger
logger = getLogger(__name__)
import pinecone

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import (
    SeleniumURLLoader,
    PyPDFLoader,
    UnstructuredPDFLoader,
    CSVLoader,
    UnstructuredFileLoader,

)
from langchain.text_splitter import (
    MarkdownTextSplitter, 
    CharacterTextSplitter
)


extension_dict = {
    '.txt': 'text',
    '.csv': 'csv',
    '.json': 'json',
    '.pdf': 'pdf',
    '.md': 'markdown',
}
special_dirs_dict = {
    "NotionDB": "notion",
}

load_dir = "/content/load_files"


class CreateIndex:
    def __init__(self):
        self.embeddings = OpenAIEmbeddings()
        self.index_name = os.environ["INDEX_NAME"]
        self.docs = []

        # initialize pinecone
        pinecone.init(
            api_key=os.environ["PINECONE_API_KEY"],  # find at app.pinecone.io
            environment=os.environ["PINECONE_ENV"]  # next to api key in console
        )
    
    def _load_pdf(self, path):
        try:
            print("try PyPDFLoader")
            loader = PyPDFLoader(path)
            documents = loader.load()
        except Exception as e:
            logger.error(f"Exception occurred: {e}", exc_info=True)
            print("try UnstructuredPDFLoader")
            loader = UnstructuredPDFLoader(path)
            documents = loader.load()
        
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
        split_docs = text_splitter.split_documents(documents)
        return split_docs
    
    def _load_url(self, urls):
        loader = SeleniumURLLoader(urls)
        documents = loader.load()
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
        split_docs = text_splitter.split_documents(documents)
        return split_docs
      
    def _load_csv(self, path):
        print("try CSVLoader")
        loader = CSVLoader(path)
        documents = loader.load()
        return documents
    
    def _load_text(self, path):
        print("try UnstructuredFileLoader")
        loader = UnstructuredFileLoader(path)
        documents = loader.load()
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=0)
        split_docs = text_splitter.split_documents(documents)
        return split_docs
    
    def _entry_documents(self, type, name):
        self.docs.append(
            {"type": type, "name": name}
        )
    
    def load_documents(self, urls=None):
        # ウェブサイトを学習
        if urls:
          self._entry_documents("url", urls)

        for root, _, files in os.walk(load_dir):
            dir_name = os.path.basename(root)
            if dir_name in special_dirs_dict:
                self._entry_documents(special_dirs_dict[dir_name], root)
                continue
            for file in files:
                file_path = os.path.join(root, file)
                _, ext = os.path.splitext(file_path)
                file_type = extension_dict.get(ext, ext)

                print(f"File name: {file_path}, File type: {file_type}")
                self._entry_documents(file_type, file_path)
    
    def create_vectordb(self):
        documents = []
        source_list = []
        for doc in self.docs:
            try:
                document = None
                if doc["type"] == "pdf":
                    document = self._load_pdf(doc["name"])
                elif doc["type"] == "markdown":
                    document = self._load_pdf(doc["name"])
                elif doc["type"] == "csv":
                    document = self._load_csv(doc["name"])
                elif doc["type"] == "url":
                    document = self._load_url(doc["name"])
                if doc["type"] == "text":
                    document = self._load_text(doc["name"])
                elif doc["type"] == "notion":
                    document = self._load_pdf(doc["name"])
                
                if document:
                    documents.extend(document)
                    source_list.append(doc["name"])
            except Exception as e:
                print(f"Error: {e}")
                return doc["name"]
        
        if documents:
            # fix metadata
            for document in documents:
                source = document.metadata.get("source")
                if source:
                    document.metadata["source"] = source.replace(load_dir+"/", "")
            # vectordb = Pinecone.from_documents(documents, self.embeddings, index_name=self.index_name)
            print(documents)
        
        return source_list

In [3]:
os.environ["INDEX_NAME"] = "INDEX_NAME"
os.environ["PINECONE_API_KEY"] = "PINECONE_API_KEY"
os.environ["PINECONE_ENV"] = "PINECONE_ENV"
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

In [17]:
urls = [
]
create_index = CreateIndex()

create_index.load_documents(urls=urls)
sources = create_index.create_vectordb()

File name: /content/load_files/my.txt, File type: text
File name: /content/load_files/dir/6c44051d11b912a07a81cd12db9b5440.pdf, File type: pdf
File name: /content/load_files/dir/ごみ一覧表（作業用佐藤、渡部）.xlsx - Sheet1.csv, File type: csv
try UnstructuredFileLoader
try PyPDFLoader
try CSVLoader
[[Document(page_content='# デート\n天気良かったからお出かけに誘った\nテオヤンセン展やってる美術館か、植物園か\n天気いいから植物園にしよう\n道間違えて高速乗りかけたり、ナビが変な挙動なって紆余曲折ありながらなんとか五台山に到着\n新しくなってた\u3000昼来るの初めてだから新鮮だった\n景色綺麗だったし天気良くて気持ちよかった\nそのまま牧野植物園へ\n入る前から数えきれないくらいの植物があって序盤から足止めを食らった\n室内の資料館をちょっと見ていざこんこん山広場へ\n荘厳な開放感があるわけじゃないけど、めっちゃいろんな植物があった\n触り心地がもふもふだったり、急にふわっと香ったり\u3000不思議な形の植物がたくさんあって楽しかった\u3000まやかさんかわいかった\u3000ずっと笑ってた\n温室はより大きく変な植物がたくさんだった\n序盤で回りきれないことに気づいて急ぎめで回ったけど全部は見れなかった\n坂多くてまやかさん死にそうだった\u3000登山だこれは\n植物園行ったことなかったけどよかった\u3000今度はもっと時間に余裕がある時に行きたい\nスタンプラリーもしたいしベンチでまったりしたい\n17時から予約してた南国のダイハツへ\nリョーマが寝てる近くのパン屋さんでパンを買った\u300030分\u3000車の修理\nめっちゃお腹すいてるわけじゃないしどこ行こう\u3000あ、蔦屋行きたい\n市内にある綺麗な蔦屋に行った\u3000ちょっと店回った後2階の本屋へ\n色々と本を漁りながら互いにボケつつ知識や考えをひけらかしあった\n量子力学のこと語った