In [66]:
import scann
import numpy as np
import pandas as pd
from google.cloud import storage, aiplatform
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

In [63]:
project="vtxdemos"
emb_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
storage_client = storage.Client(project=project)

In [4]:
storage_client.bucket("vtxdemos-vsearch-datasets").blob("stgwell_data/df3.csv").download_to_filename("df3.csv")
df = pd.read_csv("df3.csv")
df.fillna("", inplace=True)

  df.fillna("", inplace=True)


In [8]:
df.iloc[0,:]

Unnamed: 0                                                                                       0
id                                                                                               0
email                                                                          dan@goodstuff.co.uk
name                                                                                  Daniel Perry
first_name                                                                                  Daniel
last_name                                                                                    Perry
job_title                                                                   Group Digital Director
department                                                                    Digital & Publishing
company                                                            Goodstuff Holdings Limited (UK)
location                                                                                          
office_add

In [13]:
embeddings_list = []
for index, row in df.iterrows():
    context = f'''
     {row["linkedin_profile_skills"]}, and her/his education history is as follows: {row["linkedin_profile_education"]}, her/his linkedin profile show certifications as follows: {row["linkedin_profile_certifications"]}
    '''
    
    embeddings_list.append(emb_model.get_embeddings([TextEmbeddingInput(context, "SEMANTIC_SIMILARITY")])[0].values)
print(embeddings_list)

[[-0.017368998378515244, 0.0226878821849823, -0.10567767918109894, 0.037610094994306564, 0.04415411502122879, 0.02413182519376278, 0.004737356211990118, 0.03718046471476555, 0.05311634764075279, 0.0005294712609611452, 0.03813675418496132, 0.038345761597156525, 0.015297872945666313, 0.012060420587658882, 0.0056618619710206985, -0.002194652333855629, 0.06777491420507431, 0.04408126324415207, -0.07158999890089035, -0.05187494680285454, -0.02561826817691326, -0.037565965205430984, 0.046272486448287964, -0.007036986295133829, -0.025255324319005013, -0.04312749579548836, 0.03583209589123726, -0.05290699377655983, 0.021784374490380287, -0.03392893448472023, 0.050081945955753326, 0.056562140583992004, -0.03689282387495041, -0.04537061229348183, -0.01361285150051117, 0.02850787527859211, -0.042360953986644745, -0.05055897682905197, 0.03328142687678337, -0.07192029803991318, 0.003883384633809328, 0.0005784259992651641, -0.0286185871809721, 0.03966554254293442, -0.04359300807118416, -0.0784365758

In [16]:
df["embedding"] = embeddings_list

## Testing Locally with ScaNN

In [19]:
img = np.array([r["embedding"] for i, r in df.iterrows()])
k = int(np.sqrt(df.shape[0]))

if int(k/20) < 1:
    leave_search = 1
else:
    leave_search = int(k/20)

searcher = scann.scann_ops_pybind.builder(img, num_neighbors=5, distance_measure="squared_l2").tree(
    num_leaves=k, num_leaves_to_search=leave_search, training_sample_size=df.shape[0]).score_brute_force(
    2).reorder(7).build()

2024-05-20 16:08:50.799393: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 99
2024-05-20 16:08:50.801262: W scann/utils/gmm_utils.cc:921] Could not normalize centroid due to zero norm or empty or zero-weight partition.
2024-05-20 16:08:50.802372: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 2.902967ms.
2024-05-20 16:08:50.803066: I scann/brute_force/scalar_quantized_brute_force.cc:121] squared_l2_norms are not loaded, and they will be computed.


In [58]:
query = "Cho in human resources"
embeddings = emb_model.get_embeddings([TextEmbeddingInput(query, "SEMANTIC_SIMILARITY")])[0].values

In [59]:
neighbors, distances = searcher.search(embeddings,final_num_neighbors=10)

In [60]:
df.iloc[neighbors,:]

Unnamed: 0.1,Unnamed: 0,id,email,name,first_name,last_name,job_title,department,company,location,...,linkedin_profile_accomplishment_projects,linkedin_profile_similarly_named_profiles,linkedin_profile_accomplishment_test_scores,linkedin_profile_background_cover_image_url,linkedin_profile_accomplishment_publications,linkedin_profile_accomplishment_honors_awards,linkedin_profile_accomplishment_organisations,description,embedding,gemini_summ
90,90,90,jennifer.cho@codeandtheory.com,Jennifer Cho,Jennifer,Cho,Senior HR Manager,Human Resources,Code and Theory LLC,TX Remote,...,,{'link': 'https://www.linkedin.com/in/jen-cho-...,,,,,,Jennifer Cho is a Senior HR Manager at Code an...,"[-0.006811332423239946, 0.026161879301071167, ...","Jennifer Cho, BA in Sociology, is a Senior HR ..."
92,92,92,kat.grolle@harrispoll.com,Kat Grolle,Kat,Grolle,HR Generalist,Human Resources,Harris Insights and Analytics LLC,AZ Remote,...,,{'link': 'https://www.linkedin.com/in/kat-grol...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Kat Grolle is an HR Generalist at Harris Insig...,"[-0.006078773178160191, 0.008344571106135845, ...","Kat Grolle, Sociology graduate, is an HR Coord..."
85,85,85,jenny.lee@assemblyglobal.com,Jenny Lee,Jenny,Lee,Analyst,Media,"Forward3D Korea Co., Ltd.","Forward3D Korea Co., Ltd.",...,,{'link': 'https://cn.linkedin.com/in/xlfmetalc...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Jenny Lee is a Sr Data Strategist Lead at Fres...,"[-0.009506486356258392, -0.015855757519602776,...","Jenny Lee, a mathematics graduate, is a Sr Dat..."
4,4,4,idrakebrockman@sloanepr.com,India Drake-Brockman,India,Drake-Brockman,Associate 1,Public Relations,Sloane & Company LLC,Sloane & Company LLC - NY,...,,,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,India Drake-Brockman is an Associate 1 in the ...,"[-0.009678930044174194, -0.02679973654448986, ...",India Drake-Brockman is an Associate at Sloane...
69,69,69,stina.tam@assemblyglobal.com,Stina Tam,Stina,Tam,Sr. Analyst,Partnerships,"Forward3D Korea Co., Ltd.",,...,,,,,,,,Stina Tam is a Sr. Analyst in the Partnerships...,"[-0.01062752865254879, -0.0015147954691201448,...","Stina Tam, Sr. Analyst at Forward3D, specializ..."
28,28,28,sathish.ravichandran@assemblyglobal.com,Sathish R,Sathish,R,Manager,Business Driven Tech,The Search Agency India,The Search Agency India,...,,{'link': 'https://in.linkedin.com/in/sasi-kuma...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Sathish R is a Manager in the Business Driven ...,"[-0.03260263800621033, -0.0038870172575116158,...",Sathish R is a Digital Marketing Manager at Ta...
87,87,87,i.jurkowski@brandnewgalaxy.com,Iwo Jurkowski,Iwo,Jurkowski,Account Executive,BU OCEAN,Pathfinder 23 Sp. z o. o.,,...,,,,,,,,Iwo Jurkowski is an Account Executive at BRAND...,"[-0.036381348967552185, -0.0032202478032559156...","Iwo Jurkowski, Licencjat degree holder, is an ..."


## Vector Search

In [64]:
def preprocess(df, f_name):
    #df['id'] = df.index
    df.loc[:, 'id'] = df.index
    data = df.to_json(orient='records', lines=True)
    
    with open('data.json', 'w') as f:
        f.write(data)
        
    storage_client.bucket("vtxdemos-vsearch-datasets").blob(f_name).upload_from_filename("data.json")
    return data
data_1 = preprocess(df, "stgwell_3/data.json")

In [67]:
abnb_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name = f"vs-stgwell-index-v3",
    contents_delta_uri = "gs://vtxdemos-vsearch-datasets/stgwell_3",
    dimensions = len(df["embedding"].iloc[0]),
    approximate_neighbors_count = 15,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/254356041555/locations/us-central1/indexes/657160507735670784/operations/6257153857143439360
MatchingEngineIndex created. Resource name: projects/254356041555/locations/us-central1/indexes/657160507735670784
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/254356041555/locations/us-central1/indexes/657160507735670784')


In [68]:
abnb_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name = f"vs-stgwell-index-endpoint-v3",
    public_endpoint_enabled = True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/254356041555/locations/us-central1/indexEndpoints/4390081648872390656/operations/2068806203688878080
MatchingEngineIndexEndpoint created. Resource name: projects/254356041555/locations/us-central1/indexEndpoints/4390081648872390656
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/254356041555/locations/us-central1/indexEndpoints/4390081648872390656')


In [69]:
abnb_index_endpoint.deploy_index(
    index = abnb_index, deployed_index_id = "vs_stgwell_deployed_v3"
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/254356041555/locations/us-central1/indexEndpoints/4390081648872390656
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/254356041555/locations/us-central1/indexEndpoints/4390081648872390656/operations/9076407223877369856
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/254356041555/locations/us-central1/indexEndpoints/4390081648872390656


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7fdecfee30d0> 
resource name: projects/254356041555/locations/us-central1/indexEndpoints/4390081648872390656

In [72]:
query = "Cho in human resources"
request = emb_model.get_embeddings([TextEmbeddingInput(query, "SEMANTIC_SIMILARITY")])[0].values

response = abnb_index_endpoint.find_neighbors(
      deployed_index_id = "vs_stgwell_deployed_v3",
      queries = [request],
      num_neighbors = 10
    )

nn = [int(i.id) for i in response[0]]
df_2 = df.loc[df['id'].isin(nn)]

In [73]:
df_2

Unnamed: 0.1,Unnamed: 0,id,email,name,first_name,last_name,job_title,department,company,location,...,linkedin_profile_accomplishment_projects,linkedin_profile_similarly_named_profiles,linkedin_profile_accomplishment_test_scores,linkedin_profile_background_cover_image_url,linkedin_profile_accomplishment_publications,linkedin_profile_accomplishment_honors_awards,linkedin_profile_accomplishment_organisations,description,embedding,gemini_summ
4,4,4,idrakebrockman@sloanepr.com,India Drake-Brockman,India,Drake-Brockman,Associate 1,Public Relations,Sloane & Company LLC,Sloane & Company LLC - NY,...,,,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,India Drake-Brockman is an Associate 1 in the ...,"[-0.009678930044174194, -0.02679973654448986, ...",India Drake-Brockman is an Associate at Sloane...
5,5,5,michael.vo@72andsunny.com,Michael Vo,Michael,Vo,Junior Designer,Creative,72andSunny Partners LLC,72 NY,...,,{'link': 'https://www.linkedin.com/in/michaeld...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Michael Vo is a Junior Designer at 72andSunny ...,"[-0.017362339422106743, 0.0011053840862587094,...",Michael Vo is a Junior Designer with a Bachelo...
22,22,22,milagros.bello@stagwellglobal.com,Milagros Bello,Milagros,Bello,"Sr Specialist, Payroll",Payroll & Benefits,Core Stagwell LLC,Stagwell Global - Florida,...,,,,,,,,"Milagros Bello is a Sr Specialist, Payroll at ...","[-0.005905874073505402, 0.022260179743170738, ...",Milagros Bello is a Payroll Specialist and Adm...
24,24,24,peter.bohenek@rhythmagency.com,Peter Bohenek,Peter,Bohenek,Chief Executive Officer,Executive Team,Rhythm Interactive LLC,Rhythm Interactive LLC,...,"{'url': None, 'title': '2012 Integrated Market...",,,,,,,Peter Bohenek is the Chief Executive Officer o...,"[-0.053720623254776, 0.012587340548634529, -0....","Peter Bohenek, with a BS in Finance and Market..."
56,56,56,acohen@skdknick.com,Avery Cohen,Avery,Cohen,Vice President,Public Relations,SKDKnickerbocker LLC,SKDKnickerbocker - NY,...,,,,,,,,Avery Cohen is a Vice President in the Public ...,"[-0.015291843563318253, -0.010884576477110386,...","Avery Cohen, a Political Communication graduat..."
63,63,63,michelle.tan.lishi@allisonworldwide.com,Michelle Tan Li Shi,Michelle,Tan Li Shi,Senior Account Manager,ALL PR divisions,Allison+Partners Singapore PTE Ltd,A+P Singapore,...,,,,,,,,Michelle Tan Li Shi is a Senior Account Manage...,"[-0.002452988177537918, 0.02271372824907303, -...",Michelle Tan Li Shi is a BA holder working as ...
77,77,77,venus.brown@assemblyglobal.com,Venus Brown,Venus,Brown,Vice President,People,Targetcast LLC,Detroit Office,...,,,,,,,,Venus Brown is a Vice President in the People ...,"[-0.035161301493644714, 0.008720475248992443, ...","Venus Brown, a Master's in Education, is a VP ..."
85,85,85,jenny.lee@assemblyglobal.com,Jenny Lee,Jenny,Lee,Analyst,Media,"Forward3D Korea Co., Ltd.","Forward3D Korea Co., Ltd.",...,,{'link': 'https://cn.linkedin.com/in/xlfmetalc...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Jenny Lee is a Sr Data Strategist Lead at Fres...,"[-0.009506486356258392, -0.015855757519602776,...","Jenny Lee, a mathematics graduate, is a Sr Dat..."
90,90,90,jennifer.cho@codeandtheory.com,Jennifer Cho,Jennifer,Cho,Senior HR Manager,Human Resources,Code and Theory LLC,TX Remote,...,,{'link': 'https://www.linkedin.com/in/jen-cho-...,,,,,,Jennifer Cho is a Senior HR Manager at Code an...,"[-0.006811332423239946, 0.026161879301071167, ...","Jennifer Cho, BA in Sociology, is a Senior HR ..."
92,92,92,kat.grolle@harrispoll.com,Kat Grolle,Kat,Grolle,HR Generalist,Human Resources,Harris Insights and Analytics LLC,AZ Remote,...,,{'link': 'https://www.linkedin.com/in/kat-grol...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Kat Grolle is an HR Generalist at Harris Insig...,"[-0.006078773178160191, 0.008344571106135845, ...","Kat Grolle, Sociology graduate, is an HR Coord..."


In [86]:
new_df = df.loc[[int(i.id) for i in response[0]],:]

In [84]:
df_2 = df.loc[df['id'].isin(nn)]

In [87]:
new_df

Unnamed: 0.1,Unnamed: 0,id,email,name,first_name,last_name,job_title,department,company,location,...,linkedin_profile_accomplishment_projects,linkedin_profile_similarly_named_profiles,linkedin_profile_accomplishment_test_scores,linkedin_profile_background_cover_image_url,linkedin_profile_accomplishment_publications,linkedin_profile_accomplishment_honors_awards,linkedin_profile_accomplishment_organisations,description,embedding,gemini_summ
90,90,90,jennifer.cho@codeandtheory.com,Jennifer Cho,Jennifer,Cho,Senior HR Manager,Human Resources,Code and Theory LLC,TX Remote,...,,{'link': 'https://www.linkedin.com/in/jen-cho-...,,,,,,Jennifer Cho is a Senior HR Manager at Code an...,"[-0.006811332423239946, 0.026161879301071167, ...","Jennifer Cho, BA in Sociology, is a Senior HR ..."
92,92,92,kat.grolle@harrispoll.com,Kat Grolle,Kat,Grolle,HR Generalist,Human Resources,Harris Insights and Analytics LLC,AZ Remote,...,,{'link': 'https://www.linkedin.com/in/kat-grol...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Kat Grolle is an HR Generalist at Harris Insig...,"[-0.006078773178160191, 0.008344571106135845, ...","Kat Grolle, Sociology graduate, is an HR Coord..."
77,77,77,venus.brown@assemblyglobal.com,Venus Brown,Venus,Brown,Vice President,People,Targetcast LLC,Detroit Office,...,,,,,,,,Venus Brown is a Vice President in the People ...,"[-0.035161301493644714, 0.008720475248992443, ...","Venus Brown, a Master's in Education, is a VP ..."
5,5,5,michael.vo@72andsunny.com,Michael Vo,Michael,Vo,Junior Designer,Creative,72andSunny Partners LLC,72 NY,...,,{'link': 'https://www.linkedin.com/in/michaeld...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Michael Vo is a Junior Designer at 72andSunny ...,"[-0.017362339422106743, 0.0011053840862587094,...",Michael Vo is a Junior Designer with a Bachelo...
85,85,85,jenny.lee@assemblyglobal.com,Jenny Lee,Jenny,Lee,Analyst,Media,"Forward3D Korea Co., Ltd.","Forward3D Korea Co., Ltd.",...,,{'link': 'https://cn.linkedin.com/in/xlfmetalc...,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,Jenny Lee is a Sr Data Strategist Lead at Fres...,"[-0.009506486356258392, -0.015855757519602776,...","Jenny Lee, a mathematics graduate, is a Sr Dat..."
22,22,22,milagros.bello@stagwellglobal.com,Milagros Bello,Milagros,Bello,"Sr Specialist, Payroll",Payroll & Benefits,Core Stagwell LLC,Stagwell Global - Florida,...,,,,,,,,"Milagros Bello is a Sr Specialist, Payroll at ...","[-0.005905874073505402, 0.022260179743170738, ...",Milagros Bello is a Payroll Specialist and Adm...
63,63,63,michelle.tan.lishi@allisonworldwide.com,Michelle Tan Li Shi,Michelle,Tan Li Shi,Senior Account Manager,ALL PR divisions,Allison+Partners Singapore PTE Ltd,A+P Singapore,...,,,,,,,,Michelle Tan Li Shi is a Senior Account Manage...,"[-0.002452988177537918, 0.02271372824907303, -...",Michelle Tan Li Shi is a BA holder working as ...
4,4,4,idrakebrockman@sloanepr.com,India Drake-Brockman,India,Drake-Brockman,Associate 1,Public Relations,Sloane & Company LLC,Sloane & Company LLC - NY,...,,,,https://s3.us-west-000.backblazeb2.com/proxycu...,,,,India Drake-Brockman is an Associate 1 in the ...,"[-0.009678930044174194, -0.02679973654448986, ...",India Drake-Brockman is an Associate at Sloane...
56,56,56,acohen@skdknick.com,Avery Cohen,Avery,Cohen,Vice President,Public Relations,SKDKnickerbocker LLC,SKDKnickerbocker - NY,...,,,,,,,,Avery Cohen is a Vice President in the Public ...,"[-0.015291843563318253, -0.010884576477110386,...","Avery Cohen, a Political Communication graduat..."
24,24,24,peter.bohenek@rhythmagency.com,Peter Bohenek,Peter,Bohenek,Chief Executive Officer,Executive Team,Rhythm Interactive LLC,Rhythm Interactive LLC,...,"{'url': None, 'title': '2012 Integrated Market...",,,,,,,Peter Bohenek is the Chief Executive Officer o...,"[-0.053720623254776, 0.012587340548634529, -0....","Peter Bohenek, with a BS in Finance and Market..."
