##### Copyright 2025 Google LLC.

In [1]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini API: Similarity Search using Qdrant

<a target="_blank" href="https://colab.research.google.com/github/google-gemini/cookbook/blob/main/examples/qdrant/Qdrant_similarity_search.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" height=30/></a>

## Overview

The [Gemini API](https://ai.google.dev/models/gemini) provides access to a family of generative AI models for generating content and solving problems. These models are designed and trained to handle both text and images as input.

[Qdrant](https://qdrant.tech/) is a vector similarity search engine that offers an easy-to-use API for managing, storing, and searching vectors, with an additional payload. It is a production-ready service.

In this notebook, you'll learn how to perform a similarity search on data from a website with the help of Gemini API and Qdrant.

## Setup

First, you must install the packages and set the necessary environment variables.

### Installation

Install google's python client SDK for the Gemini API, `google-genai`. Next, install Qdrant's Python client SDK, `qdrant-client`.

In [2]:
%pip install -q "google-genai>=1.0.0"
%pip install -q protobuf==4.25.1 qdrant-client[fastembed]

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.7/306.7 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Configure your API key

To run the following cell, your API key must be stored it in a Colab Secret named `GOOGLE_API_KEY`. If you don't already have an API key, or you're not sure how to create a Colab Secret, see [Authentication](https://github.com/google-gemini/cookbook/blob/main/quickstarts/Authentication.ipynb) for an example.

In [4]:
from google.colab import userdata
from google import genai

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai_client = genai.Client(api_key=GOOGLE_API_KEY)

### Importing and Cleaning Data

In [5]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

file_path = "TMDB_movie_dataset_v11.csv"

df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "asaniczka/tmdb-movies-dataset-2023-930k-movies",
  file_path,
)


  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/asaniczka/tmdb-movies-dataset-2023-930k-movies?dataset_version_number=538&file_name=TMDB_movie_dataset_v11.csv...


100%|██████████| 522M/522M [00:10<00:00, 51.5MB/s]


In [6]:
print("\nDataset Columns:")
print(df.columns)

print("\nMissing Values per Column:")
print(df.isnull().sum())

print(f"\nNumber of rows: {len(df)}")
print(f"Number of unique IDs: {df['id'].nunique()}")


Dataset Columns:
Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

Missing Values per Column:
id                            0
title                        13
vote_average                  0
vote_count                    0
status                        0
release_date             216944
revenue                       0
runtime                       0
adult                         0
backdrop_path            891119
budget                        0
homepage                1079187
imdb_id                  587175
original_language             0
original_title               13
overview                 253712
popularity                    0
poster_path 

In [7]:
import pandas as pd
import numpy as np
import ast
print(f"Original rows: {len(df)}")

columns_to_keep = ['id', 'title', 'overview', 'genres', 'keywords', 'tagline', 'release_date']

df_relevant = df[columns_to_keep].copy()

print(f"Rows before dropping missing title: {len(df_relevant)}")
df_relevant.dropna(subset=['title'], inplace=True)
df_relevant = df_relevant[~(df_relevant['genres'].isna() & df_relevant['overview'].isna())]
print(f"Rows after dropping missing title and dropping missing (genres and overview): {len(df_relevant)}")

text_cols_to_fill = ['overview', 'genres', 'keywords', 'tagline']
for col in text_cols_to_fill:
    df_relevant[col] = df_relevant[col].fillna('')


def get_year(date_str):
    if pd.isna(date_str) or not isinstance(date_str, str) or len(date_str) < 4:
        return None
    try:
        return int(date_str[:4])
    except (ValueError, TypeError):
        return None

df_relevant['release_year'] = df_relevant['release_date'].apply(get_year)

print("\nSample data after cleaning (keeping missing overviews):")
print(df_relevant[['id', 'title', 'overview', 'genres', 'keywords', 'tagline', 'release_year']].head())

Original rows: 1206010
Rows before dropping missing title: 1206010
Rows after dropping missing title and dropping missing (genres and overview): 1072918

Sample data after cleaning (keeping missing overviews):
       id            title                                           overview  \
0   27205        Inception  Cobb, a skilled thief who commits corporate es...   
1  157336     Interstellar  The adventures of a group of explorers who mak...   
2     155  The Dark Knight  Batman raises the stakes in his war on crime. ...   
3   19995           Avatar  In the 22nd century, a paraplegic Marine is di...   
4   24428     The Avengers  When an unexpected enemy emerges and threatens...   

                                        genres  \
0           Action, Science Fiction, Adventure   
1            Adventure, Drama, Science Fiction   
2               Drama, Action, Crime, Thriller   
3  Action, Adventure, Fantasy, Science Fiction   
4           Science Fiction, Action, Adventure   

  

In [8]:
def create_embedding_text(row):
    """Combines available movie metadata into a single string for embedding."""
    # We can use title directly as we know all entries have title
    title_str = f"Title: {row['title']}"
    overview_str = f"Overview: {row['overview']}" if row['overview'] else ""
    year_str = f"Release Year: {int(row['release_year'])}" if pd.notna(row['release_year']) else ""
    genre_str = f"Genres: {row['genres']}" if row['genres'] else ""
    keywords_str = f"Keywords: {row['keywords']}" if row['keywords'] else ""
    tagline_str = f"Tagline: {row['tagline']}" if row['tagline'] else ""

    parts = [
        title_str,
        overview_str,
        year_str,
        genre_str,
        keywords_str,
        tagline_str
    ]
    return "\n".join(part for part in parts if part)

df_relevant['text_for_embedding'] = df_relevant.apply(create_embedding_text, axis=1)

# We can observe how data is now structured in df_relevant
print(df_relevant[['id', 'title', 'text_for_embedding']].head())

       id            title                                 text_for_embedding
0   27205        Inception  Title: Inception\nOverview: Cobb, a skilled th...
1  157336     Interstellar  Title: Interstellar\nOverview: The adventures ...
2     155  The Dark Knight  Title: The Dark Knight\nOverview: Batman raise...
3   19995           Avatar  Title: Avatar\nOverview: In the 22nd century, ...
4   24428     The Avengers  Title: The Avengers\nOverview: When an unexpec...


### Sampling 5k movies out of the 1M collection

In [9]:
SAMPLE_SIZE = 5000

if len(df_relevant) > SAMPLE_SIZE:
    print(f"\nTaking a random sample of {SAMPLE_SIZE} movies for development.")
    df_sample = df_relevant.sample(n=SAMPLE_SIZE, random_state=42)
else:
    print(f"\nCleaned dataset size ({len(df_relevant)}) is smaller than or equal to SAMPLE_SIZE. Using the full cleaned dataset.")
    df_sample = df_relevant

print(f"Working with {len(df_sample)} movies for the next steps.")
print(df_sample[['id', 'title', 'release_year']].head())

columns_for_payload = ['title', 'overview', 'genres', 'keywords', 'tagline', 'release_year']
columns_final = ['id', 'text_for_embedding'] + columns_for_payload
df_sample = df_sample[columns_final]

print("\nFinal sample DataFrame structure for embedding/indexing:")
print(df_sample.info())



Taking a random sample of 5000 movies for development.
Working with 5000 movies for the next steps.
             id                              title  release_year
1063931  178009               Fathers of the Sport        2008.0
427990    76618  Dragon Tales: It's Cool to be Me!        2002.0
19431     38006             The Abominable Snowman        1957.0
123459   131907                      God Is on Air        2002.0
1140151  746134                  Run, Jackson, Run        1972.0

Final sample DataFrame structure for embedding/indexing:
<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 1063931 to 988347
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  5000 non-null   int64  
 1   text_for_embedding  5000 non-null   object 
 2   title               5000 non-null   object 
 3   overview            5000 non-null   object 
 4   genres              5000 non-null   object 

In [10]:
from qdrant_client import QdrantClient, models
import time
from tqdm.auto import tqdm

COLLECTION_NAME = "tmdb_movies_sample"

VECTOR_SIZE = 768
DISTANCE_METRIC = models.Distance.COSINE


client = QdrantClient(":memory:")

Extracing embeddings of movies in batches

In [11]:
import time
from google.api_core import exceptions, retry

MODEL_FOR_EMBEDDING = "models/embedding-001"

BATCH_SIZE = 100
QDRANT_BATCH_SIZE = 768


@retry.Retry(timeout=3000)
def get_embeddings_batch(texts: list[str], task_type="RETRIEVAL_DOCUMENT") -> list[list[float]] | None:
    """
    Generates embeddings for a batch of texts using Gemini API with retry.

    Args:
        texts: A list of strings to embed.
        task_type: The task type for the embedding model.

    Returns:
        A list of embedding vectors (list of floats), or None if a non-retryable error occurs.
    """
    if not texts:
        return []
    try:
        response = genai_client.models.embed_content(
          model=MODEL_FOR_EMBEDDING,
          contents=texts,
          config={
            "task_type":task_type,
          }
        )
        return response.embeddings
    except exceptions.RetryError as e:
        print(f"Embedding batch failed after retries: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during embedding: {e}")
        return None

print(f"Batch embedding function 'get_embeddings_batch' defined using model: {MODEL_FOR_EMBEDDING}")

Batch embedding function 'get_embeddings_batch' defined using model: models/embedding-001


### Creating collections for storing embeddings

In [12]:
# In case someone tries running the whole notebook again we want to create the collection again

try:
    client.delete_collection(collection_name=COLLECTION_NAME)
    print(f"Existing collection '{COLLECTION_NAME}' deleted.")
except Exception as e:
    print(f"Error deleting collection (it might not exist): {e}")

try:
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=VECTOR_SIZE,
            distance=DISTANCE_METRIC
        )
    )
    print(f"Collection '{COLLECTION_NAME}' created successfully.")
except Exception as e:
    print(f"Error creating collection: {e}")

Existing collection 'tmdb_movies_sample' deleted.
Collection 'tmdb_movies_sample' created successfully.


In [13]:
def create_payload(row, payload_columns):
    payload = {}
    for col in payload_columns:
        value = row[col]
        if pd.isna(value):
            payload[col] = None
        elif isinstance(value, (np.int64, np.int32)):
            payload[col] = int(value)
        elif isinstance(value, (np.float64, np.float32)):
             payload[col] = float(value)
        else:
            payload[col] = value
    return payload

payload_columns = [
    'title', 'overview', 'genres', 'keywords', 'tagline', 'release_year'
]

In [14]:
print(f"Starting batch embedding and indexing process for {len(df_sample)} movies...")
print(f"Using Gemini Batch Size: {BATCH_SIZE}, Qdrant Upsert Batch Size: {QDRANT_BATCH_SIZE}")

points_to_upsert_buffer = []
total_processed = 0
total_failed_embedding = 0
total_upserted = 0

num_batches = (len(df_sample) + BATCH_SIZE - 1) // BATCH_SIZE
for i in tqdm(range(0, len(df_sample), BATCH_SIZE), total=num_batches, desc="Processing Batches"):

    batch_df = df_sample.iloc[i : i + BATCH_SIZE]
    batch_texts = batch_df['text_for_embedding'].tolist()
    batch_ids = batch_df['id'].tolist()

    if not batch_texts:
        continue

    batch_embeddings = get_embeddings_batch(batch_texts, task_type="RETRIEVAL_DOCUMENT")

    if batch_embeddings and len(batch_embeddings) == len(batch_texts):
        for j in range(len(batch_ids)):
            item_id = batch_ids[j]
            item_embedding = batch_embeddings[j]
            row_data = batch_df.iloc[j]

            payload = create_payload(row_data, payload_columns)

            point = models.PointStruct(
                id=int(item_id),
                vector=item_embedding.values,
                payload=payload
            )
            points_to_upsert_buffer.append(point)

        total_processed += len(batch_ids)

    else:
        print(f"Failed to get embeddings for batch starting at index {i}. Skipping {len(batch_ids)} items.")
        total_failed_embedding += len(batch_ids)
        continue

    if len(points_to_upsert_buffer) >= QDRANT_BATCH_SIZE or (i + BATCH_SIZE >= len(df_sample)):
        if points_to_upsert_buffer:
            try:
                client.upsert(
                    collection_name=COLLECTION_NAME,
                    points=points_to_upsert_buffer,
                    wait=False
                )
                total_upserted += len(points_to_upsert_buffer)
                points_to_upsert_buffer = []
            except Exception as e:
                print(f"Error upserting chunk to Qdrant: {e}")
                points_to_upsert_buffer = []
                time.sleep(5)
                # Following best practices we should take pauses before accessing collection again after erros

if points_to_upsert_buffer:
    print(f"Upserting final remaining chunk of {len(points_to_upsert_buffer)} points.")
    try:
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points_to_upsert_buffer,
            wait=True
        )
        total_upserted += len(points_to_upsert_buffer)
        points_to_upsert_buffer = []
    except Exception as e:
        print(f"Error upserting final chunk: {e}")

print("\nBatch embedding and indexing finished.")
print(f"Total items processed (attempted embedding): {total_processed}")
print(f"Total items failed embedding: {total_failed_embedding}")
print(f"Total points successfully prepared for upsert: {total_upserted}")

Starting batch embedding and indexing process for 5000 movies...
Using Gemini Batch Size: 100, Qdrant Upsert Batch Size: 768


Processing Batches:   0%|          | 0/50 [00:00<?, ?it/s]


Batch embedding and indexing finished.
Total items processed (attempted embedding): 5000
Total items failed embedding: 0
Total points successfully prepared for upsert: 5000


In [15]:
# Waiting for collection to settle
time.sleep(5)

try:
    count = client.count(collection_name=COLLECTION_NAME, exact=True)
    print(f"\nVerification: Collection '{COLLECTION_NAME}' now contains {count.count} points.") # it should print 5000

except Exception as e:
    print(f"Error verifying collection count: {e}")


Verification: Collection 'tmdb_movies_sample' now contains 5000 points.


In [16]:
def recommend_movies(query_text, top_k=5):
    """
    Finds movies similar to the query_text using the Qdrant index.

    Args:
        query_text (str): The user's query (e.g., movie title, description, theme).
        top_k (int): The maximum number of recommendations to return.

    Returns:
        list: A list of dictionaries, where each dictionary contains the
              payload (movie details) and similarity score of a recommended movie.
              Returns an empty list if query embedding fails or no results found.
    """
    print(f"\nSearching for recommendations based on: '{query_text}'")

    query_embedding = get_embeddings_batch(query_text, task_type="RETRIEVAL_QUERY")[0].values

    if query_embedding is None:
        print("Error: Could not generate embedding for the query.")
        return []

    try:
        search_result = client.search(
            collection_name=COLLECTION_NAME,
            query_vector=query_embedding,
            limit=top_k,
            with_payload=True
        )

        recommendations = []
        if search_result:
            print(f"Found {len(search_result)} potential recommendations:")
            for hit in search_result:
                recommendation = {
                    "id": hit.id,
                    "score": hit.score,
                    "payload": hit.payload
                }
                recommendations.append(recommendation)
        else:
            print("No recommendations found matching the query.")

        return recommendations

    except Exception as e:
        print(f"Error during Qdrant search: {e}")
        return []

In [17]:
query = "spy and action based movies"
recommendations = recommend_movies(query, top_k=5)

if recommendations:
    print("\n--- Recommendations ---")
    for rec in recommendations:
        print(f"  - Score: {rec['score']:.4f}")
        print(f"    Title: {rec['payload'].get('title', 'N/A')}")
        print(f"    Genre: {rec['payload'].get('genres', 'N/A')}")
        print(f"    Year: {rec['payload'].get('release_year', 'N/A')}")
        print("-" * 10)


Searching for recommendations based on: 'spy and action based movies'
Found 5 potential recommendations:

--- Recommendations ---
  - Score: 0.6738
    Title: Goldsnake: Anonima Killers
    Genre: Action, Adventure
    Year: 1966.0
----------
  - Score: 0.6693
    Title: Assassination in Rome
    Genre: Comedy, Crime, Mystery, Romance, Thriller
    Year: 1965.0
----------
  - Score: 0.6620
    Title: Himmat
    Genre: Action
    Year: 1996.0
----------
  - Score: 0.6615
    Title: The Detonator
    Genre: Action, Thriller
    Year: 2006.0
----------
  - Score: 0.6581
    Title: Monarch
    Genre: Action
    Year: 2015.0
----------


  search_result = client.search(
