# Create a Pinecone index from company descriptions + Twitter bios
We will use the index as the backend database for our search API.

## Install dependencies

In [None]:
pip install -qU sentence-transformers pinecone-client

## Load the Sentence Transformer model

## Test a few different transformer models
We recommend comparing models by cosine similarity. Feel free to try a few of the models below on example query/passage pairs and use the similarity scores you get to pick the right model.

In [None]:
from sentence_transformers import SentenceTransformer, util

# Candidate models, with the similarity noted for each on the sample
# query/passage pair below (presumably recorded from earlier runs):
#   msmarco-distilbert-base-v4              tensor([[0.2574]])
#   bert-base-nli-mean-tokens               tensor([[0.4602]])   <- selected
#   paraphrase-multilingual-mpnet-base-v2   tensor([[0.4039]])
#   all-MiniLM-L6-v2                        tensor([[0.3404]])
#   all-mpnet-base-v2                       tensor([[0.4056]])
#   all-roberta-large-v1                    tensor([[0.3546]])
# To evaluate another candidate, set `encoding` to its name and re-run.
encoding = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(encoding)

# A short search-style query and a longer company description to compare.
sample_query = 'Account-Based Marketing'
sample_passage = (
    '6sense is an AI powered account based orchestration platform that '
    'provides predictive intelligence to help B2B companies reach their '
    'ideal customers at the right time With 6senses solution marketers can '
    'identify and prioritize accounts that are most'
)

query_embedding = model.encode(sample_query)
passage_embedding = model.encode(sample_passage)

# Cosine similarity between the two embeddings.
similarity = util.cos_sim(query_embedding, passage_embedding)
print("Similarity:", similarity)

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers model used to embed the companies and,
# later, the search queries.
MODEL_NAME = 'bert-base-nli-mean-tokens'

model = SentenceTransformer(MODEL_NAME)

# Bare expression so the notebook displays the loaded model.
model

## Encode a set of company descriptions

In [None]:
# Load the company data from the CSV file and build the table we will embed:
# one row per company with its name and a combined description text.
import pandas as pd

# Upper bound on how many companies are embedded and indexed.
MAX_COMPANIES = 100_000

df_all = pd.read_csv('../data/tech_companies.csv')

# Replace NaN with empty strings so the text columns can be compared and
# concatenated safely.
df_all = df_all.fillna('')

# Drop rows without a name.
df_all = df_all[df_all['name'] != '']

# Drop rows that have neither a description nor a Twitter bio.
df_all = df_all[(df_all['description'] != '') | (df_all['twitter__bio'] != '')]

# Number of usable records.
print(len(df_all))

# Keep the first MAX_COMPANIES companies. The explicit copy makes `df`
# independent of `df_all`, so the column assignment below does not raise
# pandas' SettingWithCopyWarning.
df = df_all.iloc[:MAX_COMPANIES][['name', 'description', 'twitter__bio']].copy()

# Merge the description and the Twitter bio into a single `description`
# column. Strip the result so a row with only one of the two fields does not
# keep a stray leading/trailing space.
df['description'] = (df['description'] + ' ' + df['twitter__bio']).str.strip()
df = df[['name', 'description']]
df.head()


## Push embeddings to Pinecone to be accessed by our endpoint in the future

In [None]:
# Connect to Pinecone and (re)create the "companies" index used by the
# search API.
import os

from dotenv import find_dotenv, load_dotenv
from pinecone import Index, create_index, delete_index, init, list_indexes

# Load environment variables from a .env file.
load_dotenv(find_dotenv())

# API credentials => make sure the .env file defines both variables.
# A missing variable raises KeyError here.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_ENV = os.environ["PINECONE_ENV"]

# Connect to our Pinecone instance.
init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV,
)

INDEX_NAME = "companies"

# The index dimension has to match the size of the vectors we upsert, so read
# it from the loaded model instead of hard-coding it. This keeps the cell
# valid if a different model is selected in the cells above.
EMBEDDING_DIM = model.get_sentence_embedding_dimension()

# Create the Pinecone index. If it already exists, delete it first so we
# always start from an empty index.
if INDEX_NAME in list_indexes():
    delete_index(INDEX_NAME)
create_index(name=INDEX_NAME, metric="cosine", shards=1, dimension=EMBEDDING_DIM)

# Handle used by the following cells to upsert and query vectors.
index = Index(INDEX_NAME)


## Index the Vectors
Since we're likely going to run this on a fairly large dataset, we want to encode and upsert the records in chunks (batches).

In [None]:
from tqdm.auto import tqdm

# Number of records encoded and upserted per request.
batch_size = 128
total_rows = len(df)

for start in tqdm(range(0, total_rows, batch_size)):
    # Positional bounds of this batch (the last batch may be shorter).
    stop = min(start + batch_size, total_rows)
    chunk = df.iloc[start:stop]

    # Embed the description texts of the batch as plain Python lists.
    vectors = model.encode(chunk['description'].tolist()).tolist()

    # One metadata dict per row, built from the columns of `df`.
    records = chunk.to_dict(orient='records')

    # IDs are the row positions within `df`, as strings.
    row_ids = [str(position) for position in range(start, stop)]

    # (id, vector, metadata) tuples for the upsert call.
    payload = [
        (row_id, vector, record)
        for row_id, vector, record in zip(row_ids, vectors, records)
    ]
    index.upsert(vectors=payload)

# Show the index statistics to check that all vectors arrived.
index.describe_index_stats()

## Test the index by querying it

In [None]:
# Free-text description of the kind of company we are looking for.
query = (
    "platform is a web-based designing, prototyping, and collaboration tool, "
    "enabling developers, product managers, and marketers to simplify "
    "design workflows"
)

# Embed the query with the same model that produced the indexed vectors.
query_vector = model.encode(query).tolist()

# Ask for the three closest matches and include their stored metadata.
search_options = {"top_k": 3, "include_metadata": True}
top_matches = index.query(query_vector, **search_options)

# Bare expression so the notebook displays the response.
top_matches