# 2. Semantic Retriever - Gemini API

https://ai.google.dev/gemini-api/docs/semantic_retrieval?utm_source=chatgpt.com

## Setup

In [14]:
service_account_file_name = '../service_account_key.json'

from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file(service_account_file_name)

scoped_credentials = credentials.with_scopes(
    ['https://www.googleapis.com/auth/cloud-platform', 'https://www.googleapis.com/auth/generative-language.retriever'])

In [15]:
import google.ai.generativelanguage as glm
generative_service_client = glm.GenerativeServiceClient(credentials=scoped_credentials)
retriever_service_client = glm.RetrieverServiceClient(credentials=scoped_credentials)
permission_service_client = glm.PermissionServiceClient(credentials=scoped_credentials)

## Knowledge Base

### Corpus

**Corpus** in the context of Google's Gemini API (and semantic retrieval in general) refers to a structured collection of data that you want the model to search through.

#### Create Corpus

In [16]:
list_corpora_response = retriever_service_client.list_corpora()

existing_corpus = None
for corpus in list_corpora_response.corpora:
    if corpus.display_name == "Tiktoks from MLB":  # Change this name as needed
        existing_corpus = corpus
        break

if existing_corpus:
    print(f"Corpus already exists: {existing_corpus.name}")
    corpus_resource_name = existing_corpus.name
else:
    print("Corpus does not exist. Creating a new one...")
    corpus = glm.Corpus(display_name="Tiktoks from MLB")
    create_corpus_request = glm.CreateCorpusRequest(corpus=corpus)
    # Make the request
    create_corpus_response = retriever_service_client.create_corpus(create_corpus_request)
    # Set the `corpus_resource_name` for subsequent sections.
    corpus_resource_name = create_corpus_response.name
    print(create_corpus_response)


Corpus already exists: corpora/tiktoks-from-mlb-ecz04so6tdfy


### Document

A Corpus can contain up to 10,000 Documents. 

In [17]:
list_documents_request = glm.ListDocumentsRequest(parent=corpus_resource_name)
list_documents_response = retriever_service_client.list_documents(list_documents_request)

if len(list_documents_response.documents) == 0:
    print('Creating document')
    # Create a document with a custom display name.
    document = glm.Document(display_name="2016-mlb-homeruns")
    # Add metadata.
    document_metadata = [
        glm.CustomMetadata(key="file_name", string_value="2016-mlb-homeruns.csv")]
    document.custom_metadata.extend(document_metadata)
    # Make a request
    create_document_request = glm.CreateDocumentRequest(parent=corpus_resource_name, document=document)
    create_document_response = retriever_service_client.create_document(create_document_request)
    # Set the `document_resource_name` for subsequent sections.
    document_resource_name = create_document_response.name
    print(create_document_response)
elif len(list_documents_response.documents) == 1:
    for document in list_documents_response.documents:
        print(f"Document ID: {document.name}, Title: {document.display_name}")
        document_resource_name = document.name
else:
    assert len(list_documents_response.documents) <= 1, "We need a single document for our purposes"

Creating document
name: "corpora/tiktoks-from-mlb-ecz04so6tdfy/documents/2016mlbhomeruns-nua8bwthg1mp"
display_name: "2016-mlb-homeruns"
custom_metadata {
  key: "file_name"
  string_value: "2016-mlb-homeruns.csv"
}
update_time {
  seconds: 1740054886
  nanos: 829604000
}
create_time {
  seconds: 1740054886
  nanos: 829604000
}



### Ingest & Chunk the Document

In [18]:
import pandas as pd

df_homeruns = pd.read_csv('2016-mlb-homeruns.csv')
df_homeruns.head(1)

Unnamed: 0,play_id,title,ExitVelocity,HitDistance,LaunchAngle,video
0,5b254850-9e14-48d2-8baf-c3e12ecbe68d,John Jaso homers (6) on a fly ball to center f...,102.9,412.0,25.0,https://sporty-clips.mlb.com/YjlLTlpfWGw0TUFRP...


In [19]:
df_homeruns.isna().sum()

play_id          0
title            2
ExitVelocity    49
HitDistance      5
LaunchAngle     49
video            0
dtype: int64

In [20]:
df_homeruns = df_homeruns[~df_homeruns['title'].isna()]

In [21]:
df_homeruns.isna().sum()

play_id          0
title            0
ExitVelocity    48
HitDistance      5
LaunchAngle     48
video            0
dtype: int64

In [22]:
# Dictionary to get row id based on response from semantic api
result_dict = {row[1]: index for index, row in df_homeruns.iterrows() for col in ["title"]}
for key, value in result_dict.items():
    print(f"key: {key}, value: {value}")
    break

  result_dict = {row[1]: index for index, row in df_homeruns.iterrows() for col in ["title"]}


key: John Jaso homers (6) on a fly ball to center field., value: 0


In [23]:
chunks = df_homeruns["title"].tolist()

In [24]:
chunks[0]

'John Jaso homers (6) on a fly ball to center field.'

### Batch create chunks, create and upload each chunk

In [25]:
# Create and upload each chunk
for i, chunk_text in enumerate(chunks):
    try:
        # Create chunk objest from chunk of text
        chunk = glm.Chunk(data={'string_value': chunk_text})
        chunk.custom_metadata.append(glm.CustomMetadata(key="part", string_value=f"Part {i+1}"))
        # glm.CreateChunkRequest prepares the API request to upload the chunk to GCP
        create_chunk_request = glm.CreateChunkRequest(parent=document_resource_name, chunk=chunk)
        retriever_service_client.create_chunk(create_chunk_request)
        if i % 50 == 0:
            print(f'{i+1} out of {len(chunks)}')
            print(chunk_text)
    except:
        print(chunk_text)
        print(i)
        print(type(chunk_text))

1 out of 5497
John Jaso homers (6) on a fly ball to center field.
51 out of 5497
Scooter Gennett homers (13) on a fly ball to right center field.    Andrew Susac scores.
101 out of 5497
Brandon Belt homers (17) on a fly ball to center field.   Madison Bumgarner scores.    Denard Span scores.
151 out of 5497
Mark Trumbo homers (26) on a fly ball to left field.
201 out of 5497
Jackie Bradley Jr.  homers (16) on a line drive to right center field.
251 out of 5497
Nelson Cruz homers (27) on a fly ball to left field.
301 out of 5497
Justin Turner homers (16) on a fly ball to left center field.   Adrian Gonzalez scores.
351 out of 5497
Ender Inciarte homers (2) on a fly ball to right field.    Chase d'Arnaud scores.
401 out of 5497
Ryan Braun homers (20) on a fly ball to center field.
451 out of 5497
Justin Turner homers (24) on a fly ball to left center field.
501 out of 5497
Wilmer Flores homers (16) on a fly ball to left center field.   Curtis Granderson scores.
551 out of 5497
Robinson C

## Query the corpus

Perform semantic search

In [36]:
# Define the user query
user_query = "Show be the best hit"
content = glm.Content(parts=[glm.Part(text=user_query)])

# Configure the semantic retriever
retriever_config = glm.SemanticRetrieverConfig(source=corpus_resource_name, query=content)

# Create the request
generate_answer_request = glm.GenerateAnswerRequest(
    model="models/aqa",
    contents=[content],
    semantic_retriever=retriever_config,
    answer_style="ABSTRACTIVE"  # Options: ABSTRACTIVE, VERBOSE, EXTRACTIVE
)

# Make the request
aqa_response = generative_service_client.generate_answer(generate_answer_request)

# Output the response
# final answer based on the model's understanding and retrieved information.

print(aqa_response.answer.content.parts[0].text)
print('\n')
# This outputs the first piece of source material that contributed to the AI's answer.
print(aqa_response.answer.grounding_attributions[0].content.parts[0].text)
print('\n')
print(aqa_response)

The best hit is a grand slam, which is worth four runs. In this game, there were three grand slams hit: one by Josh Bell of the Pittsburgh Pirates, one by Khris Davis of the Oakland Athletics, and one by Adam Eaton of the Chicago White Sox.


Adam Eaton hits a grand slam (11) to right field.   JB Shuck scores.    Tim Anderson scores.    Jason Coats scores.


answer {
  content {
    parts {
      text: "The best hit is a grand slam, which is worth four runs. In this game, there were three grand slams hit: one by Josh Bell of the Pittsburgh Pirates, one by Khris Davis of the Oakland Athletics, and one by Adam Eaton of the Chicago White Sox."
    }
  }
  finish_reason: STOP
  safety_ratings {
    category: HARM_CATEGORY_SEXUALLY_EXPLICIT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HATE_SPEECH
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CATEGORY_HARASSMENT
    probability: NEGLIGIBLE
  }
  safety_ratings {
    category: HARM_CAT

In [49]:
results = aqa_response.answer.grounding_attributions

In [52]:
list_results = []
for i in range(len(results)):
    list_results.append(results[i].content.parts[0].text)

In [61]:
df_homeruns.iloc[[id for title, id in result_dict.items() if title in list_results]]['video'].tolist()

['https://sporty-clips.mlb.com/NE1vNUJfWGw0TUFRPT1fQTFKWkFsRURCd3NBRFZFRFZBQUFCRk5WQUZnQUJsSUFCVnhYQXdSUUNBTURCd3BU.mp4',
 'https://sporty-clips.mlb.com/cjhCUjFfWGw0TUFRPT1fVjFOUlhGMVdWVlFBWGxCUUFnQUFCRkplQUFOVVYxRUFVVjBNQ1ZVREJRb0dVd0FE.mp4',
 'https://sporty-clips.mlb.com/TTd4ZDNfWGw0TUFRPT1fRGdZREIxTlJYMWNBRFZGV1V3QUFCUWRRQUZrRFVGWUFBZ01HQVFVTkF3SlFBZ29B.mp4',
 'https://sporty-clips.mlb.com/eDlLREJfWGw0TUFRPT1fVWdnRlhGZFJBd29BV1ZjREF3QUFCQUFDQUFBR1VsVUFCVk5RQXdJRlZ3QUJDVlpm.mp4',
 'https://sporty-clips.mlb.com/cTZZNmdfWGw0TUFRPT1fVWdBRFZnSlNWQVlBRFZKVUF3QUFVZzhDQUZnRFdnQUFVRndOVmxBRUJsRlVCQUpU.mp4',
 'https://sporty-clips.mlb.com/NTJvM05fWGw0TUFRPT1fQkZSUkJRY0hWUVFBV1FZRUFBQUFWQVZTQUZrRkIxSUFVMUpVVXdCUkNGVUJCQXBS.mp4',
 'https://sporty-clips.mlb.com/NTJvS1hfWGw0TUFRPT1fQmxJRkJsME1VQUlBV2xRSFZBQUFWRlVGQUFNRVVBUUFBMUpSVTFVTUNBWUVWbFlB.mp4',
 'https://sporty-clips.mlb.com/V2R6YVlfWGw0TUFRPT1fQVFOWVYxUUVWbFFBWFZJSFZnQUFBRmNIQUZrR0FnQUFBd1pUQkFOWEJndFVCRkFE.mp4',
 'https://sporty-clips.m

## Move the first video to bucket

In [1]:
import requests
from google.cloud import storage

# Step 1: Download the video from the URL
video_url = "https://sporty-clips.mlb.com/NE1vNUJfWGw0TUFRPT1fQTFKWkFsRURCd3NBRFZFRFZBQUFCRk5WQUZnQUJsSUFCVnhYQXdSUUNBTURCd3BU.mp4"
local_filename = "video.mp4"

response = requests.get(video_url)
if response.status_code == 200:
    with open(local_filename, "wb") as f:
        f.write(response.content)
    print(f"Downloaded video to {local_filename}")
else:
    print("Failed to download the video.")

# Step 2: Upload the video to your GCP bucket into the "videos/" folder
bucket_name = "mlb_hackathon_bucket"  # Replace with your bucket name
destination_blob_name = "videos/video.mp4"  # This creates a folder-like prefix in your bucket

storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)

blob.upload_from_filename(local_filename)
print(f"File uploaded to gs://{bucket_name}/{destination_blob_name}")


Downloaded video to video.mp4
File uploaded to gs://mlb_hackathon_bucket/videos/video.mp4
