In [1]:
%%capture
!pip install -U sentence-transformers langchain pinecone-client python-dotenv cohere

In [2]:
from sentence_transformers import SentenceTransformer
import torch
import pandas as pd

In [3]:
from google.colab import drive

# Attach the user's Google Drive to the Colab VM under /gdrive.
drive.mount("/gdrive")

Mounted at /gdrive


In [4]:
!ln -s /gdrive/MyDrive /mydrive
%cd /mydrive/pinecone-hackaton

/gdrive/MyDrive/pinecone-hackaton


In [5]:
# Local sentence-transformers model used for the embedding smoke test below.
embedding_model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes: 0.00B [00:00, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md: 0.00B [00:00, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json: 0.00B [00:00, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py: 0.00B [00:00, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [6]:
# Peek at the first five lines of the raw export before parsing it.
!head -n 5 partition_001.json

{"doc-number":"11545508","country":"US","date":20230103,"abstract":"\n<p id=\"p-0001\" num=\"0000\">A display panel includes a first flexible substrate, and a metal wiring layer located on the first flexible substrate. The metal wiring layer includes at least one first power-supply line. At least one binding region is disposed on the side of the first flexible substrate away from the metal wiring layer. The display panel also includes a thin-film transistor layer, located on the side of the metal wiring layer away from the first flexible substrate and including a plurality of first thin-film transistors. Each first thin-film transistor includes a first electrode electrically connected to the first power-supply line. The display panel further includes a first conductive layer, including a plurality of conductive sections. The plurality of conductive sections is located in the binding region, and the first power-supply line is electrically connected to at least one conductive section of 

In [10]:
# Parse the line-delimited JSON export into a DataFrame (one record per line).
data_df = pd.read_json(
    'partition_001.json',
    orient='records',
    lines=True,
)
data_df.head()

Unnamed: 0,doc-number,country,date,abstract
0,11545508,US,20230103,"\n<p id=""p-0001"" num=""0000"">A display panel in..."
1,11545541,US,20230103,"\n<p id=""p-0001"" num=""0000"">A wiring line is p..."
2,11545702,US,20230103,"\n<p id=""p-0001"" num=""0000"">A support plate fo..."
3,11545612,US,20230103,"\n<p id=""p-0001"" num=""0000"">A pseudo-piezoelec..."
4,11545494,US,20230103,"\n<p id=""p-0001"" num=""0000"">A method for fabri..."


In [11]:
# Remove the <p ...> / </p> wrappers wherever they occur, then trim the
# surrounding newlines. Deleting the tags directly (instead of one greedy
# "<p>...</p>" capture) also cleans abstracts whose paragraph text spans several
# lines or that hold more than one paragraph on a line; a single-paragraph
# abstract ends up exactly as before.
data_df.loc[:, 'abstract'] = (data_df['abstract']
                              .str.replace(r'</?p(?:\s[^>]*)?>', '', regex=True)
                              .str.strip('\n')
                             )

In [12]:
# Expose the patent number as 'id' and discard rows that have no abstract.
data_df = (
    data_df
    .rename(columns={'doc-number': 'id'})
    .dropna(subset=['abstract'])
)
data_df.head(2)

Unnamed: 0,id,country,date,abstract
0,11545508,US,20230103,A display panel includes a first flexible subs...
1,11545541,US,20230103,"A wiring line is provided on a TFT layer, in w..."


In [13]:
# Smoke-test the local model on the rows labelled 0..5. encode() takes a str or
# a list of str, so the Series is converted to a plain list; passing the Series
# itself only works while its index labels happen to be 0..n-1.
test_embed = embedding_model.encode(
    sentences=data_df.loc[:5, 'abstract'].tolist(),
    show_progress_bar=True,
)
test_embed.shape

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(6, 768)

In [14]:
from langchain.document_loaders import DataFrameLoader

In [None]:
# Wrap the DataFrame so that 'abstract' supplies each document's page content.
doc_loader = DataFrameLoader(
    data_frame=data_df,
    page_content_column='abstract',
)

In [None]:
# Run the loader over data_df.
# NOTE(review): presumably yields one Document per row — confirm against DataFrameLoader.
docs = doc_loader.load()

In [41]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter

# Splitter configured for chunks of at most 400 characters with a 20-character
# overlap, trying paragraph, line, word and finally character boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=400,
    chunk_overlap=20,
)

# Preview the chunks produced for one sample abstract.
text_splitter.split_text(data_df.loc[1, 'abstract'])

['A wiring line is provided on a TFT layer, in which the wiring line is formed in the same layer and formed of the same material as those of a reflection electrode. The reflection electrode includes a plurality of metallic conductive layers made up of a low resistance metallic material, an oxide-based lower transparent conductive layer provided on a lower surface side of a lowermost metallic',
 'lowermost metallic conductive layer constituting a lowermost layer, an oxide-based upper transparent conductive layer having light reflectivity and provided on an upper surface side of an uppermost metallic conductive layer constituting an uppermost layer, and an oxide-based intermediate transparent conductive layer provided between the plurality of metallic conductive layers.']

In [26]:
from langchain.vectorstores import Pinecone
import pinecone
from dotenv import load_dotenv
import os

# Read API credentials from the .env file in the working directory.
load_dotenv(dotenv_path='.env')

True

In [19]:
# Confirm the Pinecone environment name was picked up from the environment.
print(os.getenv('PINECONE_ENV'))

asia-southeast1-gcp-free


In [47]:
# Authenticate the Pinecone client with credentials taken from the environment.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV'),
)
index_name = 'patentbot'

# Create the index only on first use. Its dimension has to match the length of
# the vectors that are upserted into it later.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, dimension=4096, metric='cosine')


In [40]:
from langchain.embeddings import HuggingFaceEmbeddings, CohereEmbeddings
# Local HuggingFace embedder (same checkpoint name as embedding_model).
hf_embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')

# Hosted Cohere embedder; the API key comes from the environment.
co_embeddings = CohereEmbeddings(
    model='embed-english-v2.0',
    truncate='END',
    cohere_api_key=os.getenv('COHERE_API_KEY'),
)


In [51]:
from tqdm.auto import tqdm
from uuid import uuid4
from langchain.embeddings.base import Embeddings
from typing import Optional

def upsert_vetor_index(
    data_df: pd.DataFrame,
    index_name: str,
    embedding_model: Embeddings,
    text_splitter: Optional[TextSplitter],
    batch_size:int = 100
    ):
    """Chunk, embed and upsert every abstract of ``data_df`` into a Pinecone index.

    Each row is split into chunks (or kept whole when ``text_splitter`` is
    falsy). Every chunk is embedded and stored under a fresh random UUID with
    the metadata keys ``chunk``, ``text``, ``patent-id``, ``country`` and
    ``date``.

    Args:
        data_df: Frame providing the columns ``id``, ``country``, ``date`` and
            ``abstract``.
        index_name: Name of the Pinecone index to write to.
        embedding_model: Object whose ``embed_documents(texts)`` returns one
            vector per text.
        text_splitter: Splitter whose ``split_text`` is applied to each
            abstract; pass ``None`` to upsert abstracts unsplit.
        batch_size: Maximum number of chunks embedded and upserted per request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    NOTE(review): ids are random (``uuid4``), so calling this twice on the same
    data stores every chunk under new ids rather than reusing the earlier ones.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    index = pinecone.Index(index_name)
    texts = []
    metadatas = []

    def _flush(count):
        """Embed and upsert the first ``count`` pending chunks, then drop them."""
        batch_texts = texts[:count]
        batch_metadatas = metadatas[:count]
        ids = [str(uuid4()) for _ in batch_texts]
        embeds = embedding_model.embed_documents(batch_texts)
        # Materialise the triples so the client receives a concrete list.
        index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))
        del texts[:count]
        del metadatas[:count]

    for record in tqdm(data_df.to_dict(orient='records')):
        # Metadata shared by every chunk of this record.
        metadata = {
            'patent-id': str(record['id']),
            'country': record['country'],
            'date': record['date']
        }

        if text_splitter:
            record_texts = text_splitter.split_text(record['abstract'])
        else:
            record_texts = [record['abstract']]

        texts.extend(record_texts)
        metadatas.extend(
            {"chunk": j, "text": text, **metadata}
            for j, text in enumerate(record_texts)
        )

        # One record can add several chunks at once, so send full batches of
        # exactly batch_size and carry the remainder over to the next record.
        while len(texts) >= batch_size:
            _flush(batch_size)

    # Send whatever is left after the last record.
    if texts:
        _flush(len(texts))

In [52]:
# Embed every abstract chunk with the Cohere model and write it to the index.
upsert_vetor_index(
    data_df=data_df,
    text_splitter=text_splitter,
    embedding_model=co_embeddings,
    index_name=index_name,
    batch_size=96,
)

  0%|          | 0/1516 [00:00<?, ?it/s]

In [53]:
# Sanity check after the upsert: show the index dimension and vector counts.
pinecone.Index(index_name).describe_index_stats()

{'dimension': 4096,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 3689}},
 'total_vector_count': 3689}

In [54]:
from langchain.vectorstores import Pinecone

# Metadata key under which each chunk's text was stored during the upsert.
text_field = "text"

# Wrap the populated index as a langchain vector store; queries are embedded
# with the same Cohere model that produced the stored vectors.
index = pinecone.Index(index_name)
vectorstore = Pinecone(index, co_embeddings.embed_query, text_field)

In [55]:
query = "what's a metallic conductive layer?"

# Retrieve the five stored chunks most similar to the query.
vectorstore.similarity_search(query, k=5)

[Document(page_content='lowermost metallic conductive layer constituting a lowermost layer, an oxide-based upper transparent conductive layer having light reflectivity and provided on an upper surface side of an uppermost metallic conductive layer constituting an uppermost layer, and an oxide-based intermediate transparent conductive layer provided between the plurality of metallic conductive layers.', metadata={'chunk': 1.0, 'country': 'US', 'date': 20230103.0, 'patent-id': '11545541'}),
 Document(page_content='a circuit pattern. The chemical-plating layer is applied over the surface of the metallic seed layer. The conductive fabric has improved conductivity and heat generation efficiency.', metadata={'chunk': 1.0, 'country': 'US', 'date': 20230103.0, 'patent-id': '11546974'}),
 Document(page_content='may include a copper or nickel foil. Portions of the metal current collector not covered by active material may be protected by an adhesive or inorganic layer.', metadata={'chunk': 2.0, 