# Manual Test with a company description

## Install dependencies

In [1]:
pip install -qU sentence-transformers pinecone-client

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


## Load the Sentence Transformer model
You could also use `all-mpnet-base-v2` or `all-roberta-large-v1`.

In [2]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint used to embed company descriptions.
MODEL_NAME = 'bert-base-nli-mean-tokens'

# Instantiate the encoder once; later cells reuse this `model` object.
model = SentenceTransformer(MODEL_NAME)

# Display the module stack (transformer + pooling) as the cell output.
model

  from .autonotebook import tqdm as notebook_tqdm


SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

## Encode a set of company descriptions

In [31]:
# sentences = [
#     "Madkudu is a predictive lead scoring platform that leverages machine learning to help B2B companies identify their best leads and drive revenue growth By analyzing data from marketing and sales systems Madkudu provides teams with actionable insights",
#         "Terminus is a leading account based marketing ABM platform enabling businesses to engage with target accounts and deliver tailored experiences It offers a suite of solutions including advertising web personalization CRM integration and analytic", 
#         "Endgameio is a cybersecurity company that offers nextgen endpoint protection and threat intelligence to protect enterprise networks from advanced threats Its products include a cloudnative endpoint detection and response platform malware analysis" , 
#         "6sense is an AI powered accoun tbased orchestration platform that provides predictive intelligence to help B2B companies reach their ideal customers at the right time With 6senses solution marketers can identify and prioritize accounts that are most"
#         ]

# load the data from the csv file
import pandas as pd

# Read the raw table and blank out missing values in one step, so that every
# cell (including descriptions) is a plain string rather than NaN.
df_all = pd.read_csv('../data/tech_companies.csv').fillna('')

# Work on the first 1000 companies only, keeping just the two columns used
# downstream: the company name and its free-text description.
df = df_all.loc[:, ['name', 'description']].head(1000)
df.head(10)


Unnamed: 0,name,description
0,10x,Your Data Is Your Busness
1,,RandomKeygen is a free mobilefriendly tool tha...
2,Funsales,Conhea todos os nossos apps Criador de Promoes...
3,Unitedprofile,Nordens kraftfullaste webbportal fr profil yrk...
4,Neurons-IT,Neuronsit is an professional website designin...
5,Church Base,Church base offers a complete church engagemen...
6,Pollicy,Pollicy is a feminist collective of technologi...
7,Doshaheen Solutions Pvt.,Doshaheen Solutions Pvt Ltd DSPL is a boutique...
8,MevoFit,Professional AllinOne Platform for Fitness Pro...
9,teamblau,Lass dich von unseren Referenzen inspirieren E...


## Push embeddings to Pinecone to be accessed by our endpoint in the future

In [5]:
# Read the Pinecone credentials from the environment and open the index.
import os

from dotenv import find_dotenv, load_dotenv
from pinecone import Index, create_index, init, list_indexes

# Load environment variables from .env file
load_dotenv(find_dotenv())

# Set API credentials as variables => Please make sure you have a .env file with the following variables
# (a missing variable raises KeyError here, before any network call is made)
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_ENV = os.environ["PINECONE_ENV"]

# connect to our pinecone instance
init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

# Size the index from the loaded model instead of hard-coding 768, so that
# swapping in a model with a different embedding width still produces a
# matching index. NOTE: an already-existing "companies" index keeps whatever
# dimension it was created with.
embedding_dim = model.get_sentence_embedding_dimension()

# Create a pinecone index (only if it does not exist yet)
if "companies" not in list_indexes():
    create_index(name="companies", metric="cosine", shards=1, dimension=embedding_dim)

index = Index("companies")


## Index the Vectors
Since we're likely to run this on fairly large datasets, we process the rows in chunks.

In [32]:
from tqdm.auto import tqdm

# we will use batches of 64
batch_size = 64
n_rows = len(df)

for start in tqdm(range(0, n_rows, batch_size)):
    # clamp the end of the window to the end of the frame
    stop = min(start + batch_size, n_rows)
    chunk = df.iloc[start:stop]
    # embed the descriptions of this chunk and convert to plain Python lists
    vectors = model.encode(chunk['description'].tolist()).tolist()
    # one metadata dict per row (name + description)
    records = chunk.to_dict(orient='records')
    # vector IDs are the row positions, as strings
    row_ids = [str(pos) for pos in range(start, stop)]
    # upsert (id, vector, metadata) triples to pinecone
    index.upsert(vectors=list(zip(row_ids, vectors, records)))

# check that we have all vectors in index
index.describe_index_stats()

100%|██████████| 16/16 [01:52<00:00,  7.03s/it]


{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1000}},
 'total_vector_count': 1000}

In [None]:
# Use our Pinecone index to find the k-nearest-neighbour companies based on the description of a new company
# (i.e. calculate the similarities between the new description and the embeddings in our index)




## Calculate the similarity matrix

In [45]:
import numpy as np
from sentence_transformers.util import cos_sim

# `sentences` and `embeddings` are not defined by any earlier cell, so build
# them here from the dataframe; without this the cell only works when stale
# kernel state happens to be around.
sentences = df['description'].tolist()
embeddings = model.encode(sentences)

# Full pairwise cosine-similarity matrix in a single call. The result is
# symmetric, so sim[i, j] is valid for any order of i and j (the previous
# column-by-column loop filled only the lower triangle and left the rest at 0).
# Cast to float64 to match the dtype the matrix had when built with np.zeros.
sim = cos_sim(embeddings, embeddings).cpu().numpy().astype(np.float64)

sim.shape

## Get me the similarity scores between 2 companies

In [39]:
# given 2 company names, return the similarity score
def get_similarity_score(company1, company2):
    """Return the cosine similarity between two companies, looked up by name.

    Reads the notebook-level dataframe ``df`` (to map names to rows) and the
    notebook-level matrix ``sim``.

    Args:
        company1: Name of the first company, matched exactly against ``df['name']``.
        company2: Name of the second company, matched the same way.

    Returns:
        The entry of ``sim`` for the two companies, or ``0`` if either name is
        not present in ``df``. When a name occurs more than once, its first
        occurrence is used.
    """
    rows1 = df.index[df['name'] == company1]
    rows2 = df.index[df['name'] == company2]
    # An unknown name yields an empty index; indexing it with [0] would raise
    # IndexError, so test for emptiness instead of comparing against -1.
    if len(rows1) == 0 or len(rows2) == 0:
        return 0
    index1, index2 = rows1[0], rows2[0]
    # Always read from the lower triangle (row >= column): that is the half
    # the matrix-building cell fills, and for a symmetric matrix it is
    # equivalent, so the result no longer depends on argument order.
    return sim[max(index1, index2), min(index1, index2)]

get_similarity_score('AccountPal', 'Ubiquity NZ')

0.75957190990448

## For a given company, return its k-nearest-neighbour companies and their descriptions

In [40]:
# for a given company, find the top 5 most similar companies
def find_similar_companies(company_name, sim, sentences, top_n=5, min_score=0.3):
    """Return the companies most similar to ``company_name``.

    Names are resolved through the notebook-level dataframe ``df``; row
    positions in ``df`` are assumed to line up with ``sentences`` and with the
    rows/columns of ``sim``.

    Args:
        company_name: Name to look up, matched exactly against ``df['name']``.
        sim: Square similarity matrix (NumPy array). It may be fully symmetric
            or have only its lower triangle (row >= column) filled.
        sentences: Sequence of descriptions, one per row of ``df``.
        top_n: Maximum number of results to return.
        min_score: Results scoring below this value are dropped.

    Returns:
        A list of ``(name, description, score)`` tuples in descending score
        order. The queried company itself is included (it scores highest
        against itself). Returns ``[]`` if the name is unknown or ``top_n``
        is not positive.
    """
    # Locate the company by row position rather than by searching for its
    # description text: descriptions are not unique (e.g. empty strings), so
    # a text search can land on a different company's row.
    matches = np.flatnonzero((df['name'] == company_name).to_numpy())
    if len(matches) == 0 or top_n <= 0:
        return []
    idx = int(matches[0])
    # Stitch the score vector from row idx (columns <= idx) and column idx
    # (rows > idx). Both pieces lie in the lower triangle, so this is correct
    # for a lower-triangular matrix and identical to sim[idx, :] for a
    # symmetric one.
    scores = np.concatenate((sim[idx, :idx + 1], sim[idx + 1:, idx]))
    top_n_idx = np.argsort(scores)[-top_n:][::-1]
    return [
        (df['name'].iloc[i], sentences[i], scores[i])
        for i in top_n_idx
        if scores[i] >= min_score
    ]

find_similar_companies('AccountPal', sim, sentences)

[('AccountPal', 'Account Centric Sales and Marketing on Salesforce', 1.0),
 ('54 North Limited',
  'Revenue Management Solutions for Retailers',
  0.8695738911628723),
 ('Bridgeway Solutions',
  'Comprehensive ID and access management software and hardware solution Sales integration service  support',
  0.8162314891815186),
 ('8C Partners', 'Marketing Consulting Finance Data', 0.8125531077384949),
 ('Aynsoft - India',
  'Automate the workflow process of Lead management Sales pipeline Marketing and Customer acquisition giving companies the ability to track performance and',
  0.8023368120193481)]

In [38]:
# search for a company based on a free-text sentence
def search_company(sentence, sim, sentences, top_n=5, min_score=0.3):
    """Return the companies whose descriptions best match a free-text query.

    Uses the notebook-level ``model`` to embed the query, the notebook-level
    ``embeddings`` (one vector per row of ``df``) to score it, and ``df`` to
    resolve company names.

    Args:
        sentence: Query text to embed and compare against every description.
        sim: Unused; kept so existing calls keep working.
        sentences: Sequence of descriptions, one per row of ``df``.
        top_n: Maximum number of results to return.
        min_score: Results scoring below this value are dropped.

    Returns:
        A list of ``(name, description, score)`` tuples in descending score
        order; ``[]`` if ``top_n`` is not positive.
    """
    if top_n <= 0:
        return []
    sentence_embedding = model.encode([sentence])
    # One vectorised call scores the query against all embeddings at once
    # (result shape is 1 x N, hence the [0]); cast to float64 to keep the
    # score dtype that the per-row loop produced via np.zeros.
    sim_score = cos_sim(sentence_embedding, embeddings)[0].cpu().numpy().astype(np.float64)
    top_n_idx = np.argsort(sim_score)[-top_n:][::-1]
    return [
        (df['name'].iloc[i], sentences[i], sim_score[i])
        for i in top_n_idx
        if sim_score[i] >= min_score
    ]

search_company('account based marketing', sim, sentences)


[('AccountPal',
  'Account Centric Sales and Marketing on Salesforce',
  0.8345026969909668),
 ('Ubiquity NZ', 'Customer Engagement Marketing Software', 0.7729628086090088),
 ('A Guy I Know',
  'Managed IT Fulfillment Administration and Infrastructure Services',
  0.7712785601615906),
 ('Apāto',
  'Decentralised realestate ownership through distributed ledger technology',
  0.7684563398361206),
 ('54 North Limited',
  'Revenue Management Solutions for Retailers',
  0.7599304914474487)]