# Movie recomendation system with Semantic Search:


- Dataset: [25k movies](https://www.kaggle.com/datasets/utsh0dey/25k-movie-dataset/)
- Vector database: [Chroma](https://docs.trychroma.com/)
- Embedding: [sentence-transformer](https://www.sbert.net/)

## Installing and Importing libraries

In [1]:
!pip install -U sentence-transformers
!pip install pinecone-client



In [2]:
import pinecone
import pandas as pd
from sentence_transformers import SentenceTransformer
from ast import literal_eval
from getpass import getpass


  from tqdm.autonotebook import tqdm


## Pre-processing

In [3]:
df = pd.read_csv('25k-imdb-movie-dataset_6876f742-4304-4e96-b695-41e146a046c3.csv')

In [4]:
df.shape

(24402, 12)

In [5]:
df.sample(5)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path
23095,Spasmo,not-released,6.1,1.9K,"['Mystery', 'Thriller']",Christian (Robert Hoffman) and his girlfriend ...,"['female nudity', 'dead woman with eyes open',...",Umberto Lenzi,"['Massimo Franciosa', 'Luisa Montagnana', 'Rob...",Pino Boller,-1974,/title/tt0072196/
12828,Military Wives,"$4,890,505",6.5,4.9K,"['Comedy', 'Drama', 'Music']",With their partners away serving in Afghanista...,"['friendship', 'singing', 'bus', 'drinking', '...",Peter Cattaneo,"['Rosanne Flynn', 'Kristin Scott Thomas', 'Sha...",Rachel Tunnard,-2019,/title/tt8951692/
19004,The Book of Mormon,not-released,no-rating,0,"['Comedy', 'Musical']",Two young Latter-day Saints are sent on a miss...,"['village', 'based on play or musical', 'afric...",Casey Nicholaw,"['Robert Lopez', 'Trey Parker', 'Matt Stone', ...",Trey Parker,,/title/tt2058092/
14689,"Dance, Girl, Dance",not-released,6.8,2.7K,"['Comedy', 'Drama', 'Music']","When a troupe of danseuses becomes unemployed,...","['national film registry', 'burlesque house', ...",Dorothy Arzner,"['Tess Slesinger', 'Frank Davis', 'Vicki Baum'...",Roy Del Ruth,-1940,/title/tt0032376/
24347,Duke,not-released,4.1,264,"['Crime', 'Drama']",Duke is a modern day telling of a classic west...,['one word title'],Anthony Gaudioso,"['Anthony Gaudioso', 'Carmine Giovinazzo', 'Ha...",James Gaudioso,(II) (2019),/title/tt1727254/


In [6]:
def concat_list(lista):
  lista = literal_eval(lista)
  return ' '.join(lista)

def string_to_list(lista):
  lista = literal_eval(lista)
  return lista

def list_to_string(lista):
  return ' '.join(lista)


In [7]:
df = df.fillna(' ')
df['Keywords'] = df['Plot Kyeword'].apply(concat_list)
df['Stars'] = df['Top 5 Casts'].apply(concat_list)
df['Generes'] = df['Generes'].apply(string_to_list)
df['Generes'] = df['Generes'].apply(list_to_string)

df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').fillna(0).astype('float')

In [8]:
df.drop(['Plot Kyeword', 'Top 5 Casts'], axis = 1, inplace = True)

In [9]:
# Reemplazar valores nulos con cadenas vacías en las columnas necesarias
df['Overview'] = df['Overview'].fillna(' ')
df['Keywords'] = df['Keywords'].fillna(' ')
df['Stars'] = df['Stars'].fillna(' ')
df['Generes'] = df['Generes'].fillna(' ')

# Concatenar las columnas en la nueva columna 'text'
df['text'] = df.apply(lambda x: str(x['Overview']) + ' ' + x['Keywords'] + ' ' + str(x['Stars']), axis=1)

## Embeddings

In [10]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [11]:
embeddings = model.encode(df['text'], batch_size=64, show_progress_bar = True)

Batches:   0%|          | 0/382 [00:00<?, ?it/s]

In [12]:
df['embeddings'] = embeddings.tolist()
df['ids'] = df.index
df['ids'] = df['ids'].astype('str')

## VectorDB

In [14]:
pinecone_api = getpass("Enter pinecone value")

Enter pinecone value··········


In [20]:
pinecone.init(api_key=pinecone_api, environment ='gcp-starter')

In [21]:
dimension_embeddings = len(df['embeddings'][0])
index_name = "movies-embeddings"

all_index = pinecone.list_indexes()

if index_name in all_index:
  index = pinecone.Index(index_name)
else:
  pinecone.create_index(index_name, dimension = dimension_embeddings, metric = 'cosine')
  index = pinecone.Index(index_name)

In [22]:
index

<pinecone.index.Index at 0x7dd40454b370>

In [23]:
from tqdm.auto import tqdm

batch_size = 64

for i in tqdm(range(0, len(df), batch_size)):
  i_end = min(i+batch_size, len(df))
  batch = df[i:i_end]
  ids = batch['ids']
  emb = batch['embeddings']
  metadata = batch.drop(['ids','embeddings','text','path'], axis = 1).to_dict('records')

  to_upsert = list(zip(ids, emb, metadata))
  _ = index.upsert(vectors = to_upsert)


  0%|          | 0/382 [00:00<?, ?it/s]

In [24]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.23552,
 'namespaces': {'': {'vector_count': 23552}},
 'total_vector_count': 23552}

## Query

In [29]:
def search(query, genre, rating, top_k):
  query_vector = model.encode(query).tolist()

  if rating:
    filter_rating = rating
  else:
    filter_rating = 0

  if genre:
    conditions = {
        'Generes' : {
            '$in' : [genre]
        },
        'Rating' : {
            '$gte' : filter_rating
        }
    }
  else:
    conditions = {
        'Rating' : {
            '$gte' : filter_rating
        }
    }

  responses = index.query(
      vector = query_vector,
      top_k = top_k,
      include_metadata = True,
      filter = conditions
  )

  response_data = []
  for response in responses['matches']:
    response_data.append({
        'Title' : response['metadata']['movie title'],
        'Overview' : response['metadata']['Overview'],
        'Director' : response['metadata']['Director'],
        'Genre' : response['metadata']['Generes'],
        'year' : response['metadata']['year'],
        'Rating' : response['metadata']['Rating'],
        'Score' : response['score'],
    })

  df = pd.DataFrame(response_data)
  return df


In [30]:
search('scary movie about clowns',None,0,5)

Unnamed: 0,Title,Overview,Director,Genre,year,Rating,Score
0,It,"In the summer of 1989, a group of bullied kids...",Andy Muschietti,Horror,(I) (2017),7.3,0.643041
1,It Chapter Two,Twenty-seven years after their first encounter...,Andy Muschietti,Drama Fantasy Horror,-2019,6.5,0.612126
2,Clowntergeist,"Emma, a college student with a crippling fear ...",Aaron Mirtes,Crime Horror Thriller,-2017,3.0,0.584097
3,Clownhouse,"Just before Halloween, three young brothers al...",Victor Salva,Horror,-1989,5.5,0.581851
4,Behind the Sightings,Two filmmakers from North Carolina set out to ...,Tony Cadwell,Horror Thriller,-2021,3.4,0.575149
