## Install Dependencies

In [1]:
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install redis
!{sys.executable} -m pip install themoviedb
!{sys.executable} -m pip install awscli
!{sys.executable} -m pip install boto3
!{sys.executable} -m pip install botocore
!{sys.executable} -m pip install themoviedb

Collecting redis
  Downloading redis-5.0.7-py3-none-any.whl.metadata (9.3 kB)
Downloading redis-5.0.7-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.1/252.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: redis
Successfully installed redis-5.0.7
Collecting themoviedb
  Downloading themoviedb-0.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting aiohttp==3.8.4 (from themoviedb)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting async-timeout==4.0.2 (from themoviedb)
  Downloading async_timeout-4.0.2-py3-none-any.whl.metadata (4.2 kB)
Collecting attrs==23.1.0 (from themoviedb)
  Downloading attrs-23.1.0-py3-none-any.whl.metadata (11 kB)
Collecting certifi==2023.5.7 (from themoviedb)
  Downloading certifi-2023.5.7-py3-none-any.whl.metadata (2.2 kB)
Collecting charset-normalizer==3.1.0 (from themoviedb)
  Downloading charset_normalizer-3.1.

## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import uuid
#from sentence_transformers import SentenceTransformer
import datetime
import os
import redis
import time
import json
import boto3
import botocore
import pickle
import re
import gzip
import shutil
from urllib import request
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField, NumericField
from redis.commands.search.field import TagField
from redis.commands.search.query import Query, NumericFilter
from redis.commands.search.result import Result
from redis.cluster import RedisCluster as MemoryDB
from redis.commands.search.field import VectorField, TextField, NumericField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from themoviedb import TMDb
from multiprocessing.pool import ThreadPool

## Test Connection to MemoryDB

In [3]:
%%time
memorydb_host = os.environ.get("MEMORYDB_HOST", "clustercfg.xxxx.xxxxxx.memorydb.xx-xxxx-x.amazonaws.com")
memorydb_port = os.environ.get("MEMORYDB_PORT", 6379)
print(f"MemoryDB Url = {memorydb_host}:{memorydb_port}")
rc = MemoryDB(host=memorydb_host, port=memorydb_port, ssl=True, decode_responses=False, ssl_cert_reqs="none")
rc.ping()
# rc.flushall()

MemoryDB Url = clustercfg.xxxx.xxxxxx.memorydb.xx-xxxx-x.amazonaws.com:6379
CPU times: user 73.4 ms, sys: 2.9 ms, total: 76.3 ms
Wall time: 84.3 ms


True

## Download IMDB Datasets

### Define Functions

In [4]:
dataset_path = "datasets"

In [5]:
def download_compressed_dataset(url):
    if not os.path.isdir(dataset_path):
        os.makedirs(dataset_path) 
    compressed_filename = dataset_path + "/" + re.split(pattern='/', string=url)[-1]
    request.urlretrieve(url=url, filename=compressed_filename)
    uncompressed_filename = re.split(pattern=r'\.gz', string=compressed_filename)[0]
    print(f"downloaded {compressed_filename}")
    with gzip.open(compressed_filename, 'rb') as f_in:
        with open(uncompressed_filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"extracted {uncompressed_filename}")
    return uncompressed_filename

In [6]:
datasets = [
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.principals.tsv.gz",
    "https://datasets.imdbws.com/name.basics.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
]

retrieved_datasets = ThreadPool(5).imap_unordered(download_compressed_dataset, datasets)
for retrieved_dataset in retrieved_datasets:
    print(f"loading {retrieved_dataset}")
    pickle.dump(pd.read_table(retrieved_dataset,sep="\t",low_memory=False, na_values=["\\N","nan"]), open(retrieved_dataset[:-4]+".sav","wb"))

downloaded datasets/title.ratings.tsv.gz
extracted datasets/title.ratings.tsv
loading datasets/title.ratings.tsv
downloaded datasets/name.basics.tsv.gz
downloaded datasets/title.basics.tsv.gz
downloaded datasets/title.principals.tsv.gz
extracted datasets/name.basics.tsv
loading datasets/name.basics.tsv
extracted datasets/title.basics.tsv
extracted datasets/title.principals.tsv
loading datasets/title.basics.tsv
loading datasets/title.principals.tsv


## Show Datasets 

In [7]:
df_title_basics = pickle.load(open(f"{dataset_path}/title.basics.sav","rb"))
df_title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short"


In [8]:
df_title_principals = pickle.load(open(f"{dataset_path}/title.principals.sav","rb"))
df_title_principals.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0005690,producer,producer,
3,tt0000001,4,nm0374658,cinematographer,director of photography,
4,tt0000002,1,nm0721526,director,,


In [9]:
df_name_basics = pickle.load(open(f"{dataset_path}/name.basics.sav","rb"))
df_name_basics.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467"


In [10]:
df_title_ratings = pickle.load(open(f"{dataset_path}/title.ratings.sav","rb"))
df_title_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2063
1,tt0000002,5.6,279
2,tt0000003,6.5,2030
3,tt0000004,5.4,180
4,tt0000005,6.2,2796


## Drop Specific Titles

### Drop Specific Title Types

In [11]:
# initial title count
df_title_basics.shape[0]

10913680

In [12]:
# identify title types
df_title_basics.titleType.unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [13]:
# only keep movie tvMovie tvSeries tvMiniSeries
df_title_basics = df_title_basics[(df_title_basics.titleType == "movie") | (df_title_basics.titleType == "tvMovie") | (df_title_basics.titleType == "tvSeries") | (df_title_basics.titleType == "tvMiniSeries")]
df_title_basics.shape[0]

1155452

### Drop Older Titles

In [14]:
# drop titles older than 1925
df_title_basics = df_title_basics[df_title_basics['startYear'] > 1935]
# df_title_basics = df_title_basics[df_title_basics['endYear'] > 1935]
df_title_basics.shape[0]

978305

### Drop Unused Columns

In [15]:
# drop unused columns
df_title_basics = df_title_basics.drop(['originalTitle', 'endYear'], axis=1)
df_title_basics.columns = ['tconst', 'titleType', 'title', 'isAdult', 'year', 'runtime', 'genres']
df_title_basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 978305 entries, 3816 to 10913630
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tconst     978305 non-null  object 
 1   titleType  978305 non-null  object 
 2   title      978303 non-null  object 
 3   isAdult    978305 non-null  float64
 4   year       978305 non-null  float64
 5   runtime    624379 non-null  object 
 6   genres     888957 non-null  object 
dtypes: float64(2), object(5)
memory usage: 59.7+ MB


## Merge Tables

### Prepare Directors Table

In [16]:
df_title_directors = df_title_principals[df_title_principals['category'] == 'director'][['tconst', 'nconst']]
df_title_directors = df_title_directors.merge(df_name_basics[['nconst', 'primaryName']], on='nconst', how='left')
df_title_directors['primaryName'] = df_title_directors['primaryName'].fillna('')
df_title_directors = df_title_directors.groupby('tconst')['primaryName'].apply(lambda x: ', '.join(x)).reset_index()
df_title_directors.columns = ['tconst', 'directors']

### Prepare Actors Table

In [17]:
df_title_actors = df_title_principals[df_title_principals['category'].isin(['actor', 'actress'])][['tconst', 'nconst']]
df_title_actors = df_title_actors.merge(df_name_basics[['nconst', 'primaryName']], on='nconst', how='left')
df_title_actors['primaryName'] = df_title_actors['primaryName'].fillna('')
df_title_actors = df_title_actors.groupby('tconst')['primaryName'].apply(lambda x: ', '.join(x)).reset_index()
df_title_actors.columns = ['tconst', 'actors']

### Merge Tables

In [18]:
df_title_ratings.columns = ['tconst', 'rating', 'numVotes']
df_merged = pd.merge(df_title_basics, df_title_ratings, on='tconst', how='left')
df_merged = pd.merge(df_merged, df_title_directors, on='tconst', how='left')
df_merged = pd.merge(df_merged, df_title_actors, on='tconst', how='left')
df_merged['isAdult'] = df_merged['isAdult'].astype('bool')
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 978305 entries, 0 to 978304
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tconst     978305 non-null  object 
 1   titleType  978305 non-null  object 
 2   title      978303 non-null  object 
 3   isAdult    978305 non-null  bool   
 4   year       978305 non-null  float64
 5   runtime    624379 non-null  object 
 6   genres     888957 non-null  object 
 7   rating     469310 non-null  float64
 8   numVotes   469310 non-null  float64
 9   directors  632265 non-null  object 
 10  actors     701124 non-null  object 
dtypes: bool(1), float64(3), object(7)
memory usage: 75.6+ MB


### Drop More Titles (without directors and without actors)

In [19]:
# drop rows with no directors or actors
df_merged.dropna(subset=['directors'], inplace=True)
df_merged.dropna(subset=['actors'], inplace=True)
df_merged.shape[0]

495542

### Drop Lower Rated Titles (and titles with low number of ratings)

In [20]:
df_merged = df_merged[df_merged['numVotes'] > 2000]
df_merged = df_merged[df_merged['rating'] > 4.0]
df_merged.shape[0]

27910

### Get Plot from TMDB

#### Define Function to Get Plot

In [21]:
from time import sleep
tmdb = TMDb(key="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", language="en-US", region="US")
def get_overview(imdb_id):
    title =  tmdb.find().by_imdb(imdb_id)
    sleep(0.025)
    if title.movie_results:
        return title.movie_results[0].overview
    if  title.tv_results:
        return title.tv_results[0].overview
    return

#### Get Plot for All Remaining Titles

In [22]:
df_subset = df_merged
df_subset['plot'] = df_subset['tconst'].map(get_overview)
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27910 entries, 106 to 978292
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tconst     27910 non-null  object 
 1   titleType  27910 non-null  object 
 2   title      27910 non-null  object 
 3   isAdult    27910 non-null  bool   
 4   year       27910 non-null  float64
 5   runtime    27894 non-null  object 
 6   genres     27907 non-null  object 
 7   rating     27910 non-null  float64
 8   numVotes   27910 non-null  float64
 9   directors  27910 non-null  object 
 10  actors     27910 non-null  object 
 11  plot       27780 non-null  object 
dtypes: bool(1), float64(3), object(8)
memory usage: 2.6+ MB


#### Drop Titles with No Plot

In [23]:
df_subset['plot'].replace('', np.nan, inplace=True)
df_subset.dropna(subset=['plot'], inplace=True)
df_subset.shape[0]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_subset['plot'].replace('', np.nan, inplace=True)


27711

### Generate Vector Embedding for Plot

#### Define Function to Query Bedrock for Vector Embedding

In [24]:
bedrock_runtime = boto3.client('bedrock-runtime')

def get_text_embedding(text, dimensions=512, normalize=True):
    if not isinstance(text, str):
        return ""
    try:
        body = json.dumps({"inputText": text, "dimensions" : dimensions, "normalize": normalize})
        modelId = "amazon.titan-embed-text-v2:0"
        accept = "application/json"
        contentType = "application/json"

        response = bedrock_runtime.invoke_model(
            body=body, modelId=modelId, accept=accept, contentType=contentType
        )
        response_body = json.loads(response.get("body").read())
        return(response_body.get("embedding"))
    except botocore.exceptions.ClientError as error:
        print(text)
        if error.response['Error']['Code'] == 'AccessDeniedException':
               print(f"\x1b[41m{error.response['Error']['Message']}\
                    \nTo troubeshoot this issue please refer to the following resources.\
                     \nhttps://docs.aws.amazon.com/IAM/latest/UserGuide/troubleshoot_access-denied.html\
                     \nhttps://docs.aws.amazon.com/bedrock/latest/userguide/security-iam.html\x1b[0m\n")

        else:
            print(text)
            raise error

#### Add Vector Embedding to Each Title

In [25]:
df_subset["v_plot"] = df_subset["plot"].map(get_text_embedding)
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27711 entries, 106 to 978292
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tconst     27711 non-null  object 
 1   titleType  27711 non-null  object 
 2   title      27711 non-null  object 
 3   isAdult    27711 non-null  bool   
 4   year       27711 non-null  float64
 5   runtime    27704 non-null  object 
 6   genres     27710 non-null  object 
 7   rating     27711 non-null  float64
 8   numVotes   27711 non-null  float64
 9   directors  27711 non-null  object 
 10  actors     27711 non-null  object 
 11  plot       27711 non-null  object 
 12  v_plot     27711 non-null  object 
dtypes: bool(1), float64(3), object(9)
memory usage: 2.8+ MB


In [26]:
df_subset.head()

Unnamed: 0,tconst,titleType,title,isAdult,year,runtime,genres,rating,numVotes,directors,actors,plot,v_plot
106,tt0027260,movie,After the Thin Man,False,1936.0,112,"Comedy,Crime,Mystery",7.6,12211.0,W.S. Van Dyke,"William Powell, Myrna Loy, James Stewart, Elis...",Nick and Nora Charles investigate when Nora's ...,"[-0.112098925, 0.0368897, -0.00024240141, -0.0..."
121,tt0027286,movie,The Amazing Adventure,False,1936.0,61,"Comedy,Drama,Romance",6.3,2193.0,Alfred Zeisler,"Cary Grant, Mary Brian, Peter Gawthorne, Henry...",A bored millionaire wagers his doctor that he ...,"[-0.11646503, -0.016336221, 0.022670673, -0.11..."
134,tt0027300,movie,Anthony Adverse,False,1936.0,141,"Adventure,Drama,Romance",6.3,2047.0,Mervyn LeRoy,"Fredric March, Olivia de Havilland, Donald Woo...","Based on the novel by Hervey Allen, this expan...","[-0.10065765, 0.01913084, -0.029284902, 0.0194..."
158,tt0027336,movie,The Lower Depths,False,1936.0,90,"Crime,Drama,Romance",7.5,3704.0,Jean Renoir,"Jean Gabin, Suzy Prim, Louis Jouvet, Jany Holt...",Inhabitants of a flophouse struggle to survive...,"[-0.049684554, 0.120132804, 0.0017727931, -0.0..."
180,tt0027367,movie,Black Legion,False,1937.0,83,"Crime,Drama,Film-Noir",6.9,3940.0,Archie Mayo,"Humphrey Bogart, Ann Sheridan, Dick Foran, Eri...",When a hard-working machinist loses a promotio...,"[-0.09975542, 0.11154916, 0.009889546, -0.0111..."


## MemoryDB Ingestion

### Define Functions for Index Creation

In [27]:
def generate_key(prefix = ""):
    return prefix + str(uuid.uuid4())

def create_hnsw_index(rc, index_name, vector_field_name, number_of_vectors, vector_dimensions=512, distance_metric='L2', M=16, EF=512, key_prefix=''):
    # Create the index
    try:
        rc.ft(index_name).create_index([
            VectorField(vector_field_name, "HNSW", {
                "TYPE": "FLOAT32",
                "DIM": vector_dimensions,
                "DISTANCE_METRIC": distance_metric,
                "INITIAL_CAP": number_of_vectors,
                "M": M,
                "EF_CONSTRUCTION": EF
            }),
            TagField("isAdult"),
            TagField("titleType"),
            TagField("directors"),
            NumericField("rating"),
            TagField("genres"),
            TextField("plot"),
            TextField("title"),
            NumericField("runtime"),
            TagField("actors"),
            NumericField("year")
        ], definition=IndexDefinition(prefix=[key_prefix]))
        print(f"Index {index_name} created successfully.")
    except ResponseError as e:
        print(f"Index {index_name} created previously: {str(e)}")

### Define Variables for MemoryDB Index

In [28]:
KEY_PREFIX = "movies:"
index_name = "imdb_hnsw_index"
vector_field_name = "v_plot"
number_of_vectors = len(df_subset)
print(f"Creating Index {index_name} on Field {vector_field_name} expecting {number_of_vectors:,} vectors")

Creating Index imdb_hnsw_index on Field v_plot expecting 27,711 vectors


### Create Index (and delete existing items)

In [29]:
%%time
# Create index in MemoryDB
rc.ft(index_name).dropindex()
for key in rc.scan_iter("movies:*"):
    rc.delete(key)
create_hnsw_index(rc, index_name, vector_field_name, number_of_vectors, 
                  vector_dimensions=512, distance_metric='Cosine', M=16, EF=512, key_prefix=KEY_PREFIX)

Index imdb_hnsw_index created successfully.
CPU times: user 10.4 ms, sys: 0 ns, total: 10.4 ms
Wall time: 306 ms


### Load Data into MemoryDB

In [30]:
%%time
# Load data into MemoryDB
BATCH_SIZE = 100
pipe = rc.pipeline()
for index, row in df_subset.iterrows():
    key = generate_key(prefix=KEY_PREFIX)
    v_plot = np.array(row['v_plot'], dtype=np.float32).tobytes()
    pipe.hset(key, mapping={
        'directors': row['directors'],
        'rating': row['rating'],
        'genres': row['genres'],
        'plot': row['plot'],
        'title': row['title'],
        'runtime': row['runtime'],
        'actors': row['actors'],
        'year': row['year'],
        'titleType': row['titleType'],
        'isAdult': str(row['isAdult']).lower(),
        'v_plot': v_plot
    })
    if index % BATCH_SIZE == 0 or index == number_of_vectors - 1:
        pipe.execute()
        pipe = rc.pipeline()
print("Data indexed successfully.")

Data indexed successfully.
CPU times: user 4.35 s, sys: 93.6 ms, total: 4.44 s
Wall time: 24.3 s


### Find Random Key and Fetch Value

In [31]:
%%time
# Add a python script to find a random key that stats with the prefix and fetch the value and show it
count = 0
while True:
    count += 1
    keyname = rc.randomkey()
    keyname = keyname.decode('utf-8')
    print(str(keyname))
    if keyname.startswith(KEY_PREFIX) == True:
        print(rc.hgetall(keyname))
        break
    elif count > 10:
        break

movies:0d762b6b-f119-45ad-be15-120aebce3703
{b'actors': b'Aly Michalka, Vanessa Hudgens, Gaelan Connell, Scott Porter, Ryan Donowho, Charlie Saxton, Lisa Kudrow, Tim Jo, Elvy, Lisa Chung', b'titleType': b'movie', b'title': b'Bandslam', b'rating': b'6.3', b'year': b'2009.0', b'plot': b'A high school social outcast and the popular girl bond through a shared love of music.', b'isAdult': b'false', b'runtime': b'111', b'genres': b'Comedy,Drama,Family', b'v_plot': b'~\x04\x18\xbc\x87\xc4p<j\xf3\xe0\xbc\x1e9f\xbd\xe7)\xd8:*\xbe\xaf\xbd\x1e\x9f\xb0<\xa4\xfbJ\xbd\xb1\x80\x94=\x9c\xd5\'\xbd/\xb7\x0b\xbd\xb5\x13&\xbcT\x14\t\xbdP\xb5\xe2<\xe7)X=T\xae>\xbc\xceQ\xa4\xbc\x1a\x0c\x9f\xbd\x8fP^=\xbd9I\xbd\xa4\xfbJ=\x87*;\xbd\r\xed\x9f\xbcYA\xd0<iY\xab\xbd\xd3~k\xbdP\x1b-=\xe3\x96\xc6\xbdH\xf5\x89=]:,\xbde`O=\xbd\x9f\x93\xbc\x00\xce ="\xcc\xf7\xbdP\xb5\xe2=\xa8\xf4\xa6=7wd=\xd7\x11}=\xa9\x8e\\=\x93I:=\xec"4<e`\xcf<D\xfc-\xban\x86r<vx\xaa\xbc\xa4a\x95<\xd3~\xeb<\xb9\x0c\x02=\x8f\xb6(=e`O<~\x04\x18>\r\xed

### Query MemoryDB

#### Define Functions

In [32]:
def similarity_search(redis_client, index_name, query_vector, filters="*",top_n=1):
    # Convert the query vector to bytes
    query_vector_bytes = np.array(query_vector, dtype=np.float32).tobytes()
    # Create the query
    query = Query(f"{filters}=>[KNN {top_n} @v_plot  $query_vec AS score ]") \
        .sort_by("score") \
        .return_fields(  "titleType", "title", "year", "plot", "rating", "genres", "runtime") \
        .paging(0, top_n) \
        .dialect(2)
    params = {
        "query_vec": query_vector_bytes
    }
    # Process the query
    result = redis_client.ft(index_name).search(query, query_params=params)
    return result
def format_results(results):
    for doc in results.docs:
        doc_id = doc.id
        titleType = doc.titleType
        title = doc.title
        year = doc.year
        rating = doc.rating
        genres = doc.genres
        runtime = doc.runtime
        print(f"ID: {doc_id}, Title Type: {titleType}, Title: {title}, Year: {year}, Rating: {rating}, Genres: {genres}, Runtime: {str(datetime.timedelta(seconds=float(runtime)))}")

#### Get Random Title from Dataset

In [33]:
randomdf = df_subset.sample()
randomdf.get("title")

49401    The Sword and the Sorcerer
Name: title, dtype: object

In [34]:
selected_row = randomdf.iloc[0]
print(selected_row)
query_vector = selected_row['v_plot']

tconst                                               tt0084749
titleType                                                movie
title                               The Sword and the Sorcerer
isAdult                                                  False
year                                                    1982.0
runtime                                                     99
genres                                Action,Adventure,Fantasy
rating                                                     5.5
numVotes                                                7657.0
directors                                          Albert Pyun
actors       Lee Horsley, Kathleen Beller, Simon MacCorkind...
plot         A mercenary with a three-bladed sword rediscov...
v_plot       [-0.024327628, -0.008590264, 0.012438703, -0.0...
Name: 49401, dtype: object


#### Do Simple Similarity Search

In [35]:
%%time
results = similarity_search(rc, index_name, query_vector, top_n=15)

CPU times: user 1.4 ms, sys: 97 µs, total: 1.5 ms
Wall time: 49.1 ms


In [36]:
format_results(results)

ID: movies:1be15283-502a-4b06-a2ca-e00dbb2cb36b, Title Type: movie, Title: The Sword and the Sorcerer, Year: 1982, Rating: 5.5, Genres: Action,Adventure,Fantasy, Runtime: 0:01:39
ID: movies:273bdbbc-d1ae-4ab4-9f08-b383ae30a300, Title Type: movie, Title: Delgo, Year: 2008, Rating: 4.3, Genres: Adventure,Animation,Comedy, Runtime: 0:01:34
ID: movies:5e0398e0-095e-44b9-bd69-824fe96b274d, Title Type: movie, Title: The Magic Sword, Year: 1962, Rating: 4.8, Genres: Adventure,Drama,Fantasy, Runtime: 0:01:20
ID: movies:690b480a-0ad1-477b-8ce4-93c4c26ee399, Title Type: movie, Title: Crouching Tiger, Hidden Dragon, Year: 2000, Rating: 7.9, Genres: Action,Adventure,Drama, Runtime: 0:02:00
ID: movies:aef0d9e6-9409-4fc9-a702-242c59efd3d8, Title Type: movie, Title: Cuba, Year: 1979, Rating: 5.6, Genres: Adventure,Romance,Thriller, Runtime: 0:02:02
ID: movies:b036d78d-d81d-4ca0-9775-5d4781c97e34, Title Type: movie, Title: Kingdom of Heaven, Year: 2005, Rating: 7.3, Genres: Action,Adventure,Drama, Run

#### Do Similarity Search with Filters

In [37]:
%%time
filters = "@isAdult:{false} @titleType:{movie} @year:[2000 2009] @rating:[6 (INF] (@genres:{Horror} | @genres:{Animation} | @genres:{Action}| @genres:{Drama}) "
results = similarity_search(rc, index_name, query_vector, filters=filters, top_n=15)

CPU times: user 2.43 ms, sys: 0 ns, total: 2.43 ms
Wall time: 12.7 ms


In [38]:
format_results(results)

ID: movies:690b480a-0ad1-477b-8ce4-93c4c26ee399, Title Type: movie, Title: Crouching Tiger, Hidden Dragon, Year: 2000, Rating: 7.9, Genres: Action,Adventure,Drama, Runtime: 0:02:00
ID: movies:b036d78d-d81d-4ca0-9775-5d4781c97e34, Title Type: movie, Title: Kingdom of Heaven, Year: 2005, Rating: 7.3, Genres: Action,Adventure,Drama, Runtime: 0:02:24
ID: movies:7f927361-b80f-43b4-a100-edb2ca3c3212, Title Type: movie, Title: Shadowless Sword, Year: 2005, Rating: 6.4, Genres: Action,Adventure,History, Runtime: 0:01:57
ID: movies:be855769-5e7d-48ef-baf9-a7d8bc102106, Title Type: movie, Title: Time and Tide, Year: 2000, Rating: 6.7, Genres: Action,Crime,Thriller, Runtime: 0:01:53
ID: movies:d6bd6498-838f-4db8-8312-e9a4edeed9ea, Title Type: movie, Title: Bichunmoo, Year: 2000, Rating: 6.1, Genres: Action,Drama,Fantasy, Runtime: 0:01:58
ID: movies:bd8413d9-5993-427f-8ca2-25180b8d9f73, Title Type: movie, Title: Azur & Asmar: The Princes' Quest, Year: 2006, Rating: 7.4, Genres: Adventure,Animation

## Save Dataset

In [39]:
pickle.dump(df_subset, open(f"{dataset_path}/title.subset.sav","wb"))