In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## **Notes:**
*   May need to add some logic to keep game types the same, regardless of semantic match
*   Need to make sure we don't repeat users.  Maybe when we match, we remove those users from the user pool and index
*   Do we need to add a limit for distance (i.e. similarity score)?



# Semantic Search using Embeddings

Semantic search is a type of search that uses the meaning of words and phrases to find relevant results.

Demonstrate a gaming match engine, where users are matched based on semantic similarity using [Google ScaNN: Efficient Vector Similarity Search](https://ai.googleblog.com/2020/07/announcing-scann-efficient-vector.html).

## Pre-requisites:
- Gaming user data (generated)
- Vertex LLM SDK
- ScaNN [github](https://github.com/google-research/google-research/tree/master/scann)

In [1]:
PROJECT_ID = 'cloud-llm-preview1' # replace with project ID

In [2]:
try:
    import google.colab
    from google.colab import auth
    auth.authenticate_user()
    !gcloud config set project {PROJECT_ID}
except Exception:
    pass

Updated property [core/project].


## Install Packages

In [4]:
# Vertex AI GenAI Studio SDK
!gsutil cp gs://vertex_sdk_llm_private_releases/SDK/google_cloud_aiplatform-1.25.dev20230502+language.models-py2.py3-none-any.whl .
!pip install google_cloud_aiplatform-1.25.dev20230502+language.models-py2.py3-none-any.whl "shapely<2.0.0" -q

# for working with embeddings locally
!pip install scann -q


Copying gs://vertex_sdk_llm_private_releases/SDK/google_cloud_aiplatform-1.25.dev20230502+language.models-py2.py3-none-any.whl...
/ [0 files][    0.0 B/  2.4 MiB]                                                / [1 files][  2.4 MiB/  2.4 MiB]                                                
Operation completed over 1 objects/2.4 MiB.                                      
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.4/321.4 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m588.3/588.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [5]:
# Enable Vertex AI For This Project
!gcloud services enable aiplatform.googleapis.com

**Attention**: you would need to restart runtime so that the right package is installed.

## Imports

In [3]:
from google.cloud.aiplatform.private_preview import language_models
from google.cloud import storage

import os
import time

import numpy as np
import pandas as pd
import requests
import tempfile

import scann

## Setup LLM TextEmbeddingModel

In [5]:
# create links to model: embedding api and text generation
embedding_model = language_models.TextEmbeddingModel.from_pretrained('textembedding-gecko-001')
#textgen_model = language_models.TextGenerationModel.from_pretrained('text-bison-001')

## Setup

In [6]:
project = !gcloud config get-value project
PROJECT_ID = project[0]
PROJECT_ID
BUCKET = "mg_demos"
LOCATION = 'us'

cloud-llm-preview1


In [7]:
# local storage
INDEX_PATH = '/gamer_match_index'

In [8]:
if os.path.exists(INDEX_PATH):
  print(f'Already Exists: {INDEX_PATH}')
else:
  os.makedirs(INDEX_PATH)
  print(f'Created: {INDEX_PATH}')

Created: /gamer_match_index


## Loading Embedding Dataset.

In [9]:
# add GCS upload here

In [10]:
from google.colab import files

uploaded = files.upload()

Saving om_player_data.csv to om_player_data.csv


In [14]:
# Load gaming user dataset with embeddings.
df = pd.read_csv("om_player_data.csv")

#get user data, from GCS, BQ, etc.
#using dummy data to test gecko
#df_len = 100
#df = pd.DataFrame(np.random.randint(0,100,size=(df_len, 4)), columns=['UserID', 'Skill', 'Match Type', 'Region'])

df = df.astype(str) #convert numbers to strings to append to text needed to get embedding
df.head()


Unnamed: 0,player_id,elo,match_type,ping,region,level,playtime,age_group
0,pid_1682550803,1845,practice,83.19844338252759,australia-southeast1,advanced,morning,youth
1,pid_1682609504,837,battle royale,45.965771639661824,us-west1,intermediate,morning,adult
2,pid_1682558464,2091,practice,70.13295999116639,asia-east1,beginner,morning,youth
3,pid_1682604598,1932,practice,25.18105680819091,us-east1,intermediate,morning,youth
4,pid_1682615383,1284,battle royale,66.35314212794793,europe-west1,beginner,morning,youth


In [15]:
df['text'] = df.apply(' '.join, axis=1) #create text column combining all user values for embedding API call
#df['embedding'] = np.nan
df.reset_index()
df.head()

Unnamed: 0,player_id,elo,match_type,ping,region,level,playtime,age_group,text
0,pid_1682550803,1845,practice,83.19844338252759,australia-southeast1,advanced,morning,youth,pid_1682550803 1845 practice 83.19844338252759...
1,pid_1682609504,837,battle royale,45.965771639661824,us-west1,intermediate,morning,adult,pid_1682609504 837 battle royale 45.9657716396...
2,pid_1682558464,2091,practice,70.13295999116639,asia-east1,beginner,morning,youth,pid_1682558464 2091 practice 70.13295999116639...
3,pid_1682604598,1932,practice,25.18105680819091,us-east1,intermediate,morning,youth,pid_1682604598 1932 practice 25.18105680819091...
4,pid_1682615383,1284,battle royale,66.35314212794793,europe-west1,beginner,morning,youth,pid_1682615383 1284 battle royale 66.353142127...


In [16]:
len(df)
df_small = df.head(100)

## Get Embeddings

In [17]:
rate_limit_minute = 150

In [77]:
PRIOR_PARSE = False

if PRIOR_PARSE:
  print('Embeddings created on previous run.')
else:
  print(f"The expected run time for embeddings is {(len(df)/rate_limit_minute):.2f} minutes")
  start = time.time()
  for index, row in df.iterrows():
    if index % rate_limit_minute == 0:
      time.sleep(((time.time() - start) % 60) + 10)
      start = time.time()
    text = row.text
    if text:
      embed = embedding_model.get_embeddings([text])[0].values
    else:
      embed = []
    df['embedding'][index] = embed

The expected run time for embeddings is 6.67 minutes


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embedding'][index] = embed


ValueError: ignored

In [57]:
df.head()

Unnamed: 0,player_id,elo,match_type,ping,region,level,playtime,age_group,text
0,pid_1682550803,1845,practice,83.19844338252759,australia-southeast1,advanced,morning,youth,pid_1682550803 1845 practice 83.19844338252759...
1,pid_1682609504,837,battle royale,45.965771639661824,us-west1,intermediate,morning,adult,pid_1682609504 837 battle royale 45.9657716396...
2,pid_1682558464,2091,practice,70.13295999116639,asia-east1,beginner,morning,youth,pid_1682558464 2091 practice 70.13295999116639...
3,pid_1682604598,1932,practice,25.18105680819091,us-east1,intermediate,morning,youth,pid_1682604598 1932 practice 25.18105680819091...
4,pid_1682615383,1284,battle royale,66.35314212794793,europe-west1,beginner,morning,youth,pid_1682615383 1284 battle royale 66.353142127...


In [71]:
# Create embedding and append to DF
total_time_start = time.time()
df_small['embedding'] = df_small['text'].apply(lambda x: embedding_model.get_embeddings([x])[0].values) #embedding API takes string as input

total_time_end = time.time()
print("Total time (s):", (total_time_end - total_time_start))

Total time (s): 10.965145349502563


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['embedding'] = df_small['text'].apply(lambda x: embedding_model.get_embeddings([x])[0].values) #embedding API takes string as input


## Create Index

In [72]:
total_time_start = time.time()
# Embedding length is 768, this is creating a np array to feed into the semantic matching searcher.  dataset_len is number of embeddings to feed into matching engine
dataset_len = len(df_small)
dataset = np.empty((dataset_len, 768))
for i in range(dataset_len):
    dataset[i] = df.embedding[i]

normalized_dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]
# configure ScaNN as a tree - asymmetric hash hybrid with reordering
# anisotropic quantization as described in the paper; see README

# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
searcher = scann.scann_ops_pybind.builder(normalized_dataset, 10, "dot_product").tree(
    num_leaves=dataset_len, num_leaves_to_search=dataset_len, training_sample_size=dataset_len).score_ah(
    2, anisotropic_quantization_threshold=0.2).reorder(dataset_len).build()

total_time_end = time.time()
print("Total time (s):", (total_time_end - total_time_start))

RuntimeError: ignored

## Queries the Index

Search the index.

In [None]:
import time
def search_users(query, num_neighbors=3):
    start = time.time()
    print('User: ' + query)
    query = model.get_embeddings([query])[0].values
    neighbors, distances = searcher.search(query, final_num_neighbors=num_neighbors)
    end = time.time()

    for id, dist in zip(neighbors, distances):
        print(f'[userid:{id}] {df.text[int(id)][:125]} -- [{dist}]')
    print("Latency (ms):", 1000*(end - start))
    print('-------------------')

In [None]:
total_time_start = time.time()
for i in df_small.text:
  search_users(i)

total_time_end = time.time()
print("Total time (s):", (total_time_end - total_time_start))

User: pid_1682550803 1845 practice 83.19844338252759 australia-southeast1 advanced morning youth
[userid:0] pid_1682550803 1845 practice 83.19844338252759 australia-southeast1 advanced morning youth -- [0.9999991655349731]
[userid:77] pid_1682559909 1486 practice 61.3715768205865 australia-southeast1 advanced morning youth -- [0.9834303855895996]
[userid:78] pid_1682549124 1847 practice 37.728594246986646 australia-southeast1 intermediate morning youth -- [0.9693171977996826]
Latency (ms): 113.48390579223633
-------------------
User: pid_1682609504 837 battle royale 45.965771639661824 us-west1 intermediate morning adult
[userid:1] pid_1682609504 837 battle royale 45.965771639661824 us-west1 intermediate morning adult -- [0.9999987483024597]
[userid:94] pid_1682616649 1728 battle royale 45.38254420499133 us-west1 intermediate morning adult -- [0.9935961961746216]
[userid:5] pid_1682604714 1668 battle royale 39.55532556085238 us-west1 intermediate morning adult -- [0.9931918382644653]
La

In [None]:
# ad hoc search
search_users(df_small.text[0])

User: pid_1682550803 1845 practice 83.19844338252759 australia-southeast1 advanced morning youth
[userid:0] pid_1682550803 1845 practice 83.19844338252759 australia-southeast1 advanced morning youth -- [0.9999991655349731]
[userid:77] pid_1682559909 1486 practice 61.3715768205865 australia-southeast1 advanced morning youth -- [0.9834303855895996]
[userid:78] pid_1682549124 1847 practice 37.728594246986646 australia-southeast1 intermediate morning youth -- [0.9693171977996826]
Latency (ms): 88.2120132446289
-------------------
