In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/imdb_top_1000.csv')

# Displaying the content and top 5 rows!!
df.head()


Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
# Checking for the null values in the entire dataset for futher processing!!
null_values = df.isnull().sum()

# Displaying the column names containing null vales!!
print(null_values)


Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64


In [4]:
import pandas as pd



# Combining relevant columns into a single text column for embeddings
df['combined_text'] = df.apply(lambda row: f"{row['Series_Title']}. {row['Overview']}. "
                                               f"Genre: {row['Genre']}. "
                                               f"Directed by {row['Director']}. "
                                               f"Starring: {row['Star1']}, {row['Star2']}, {row['Star3']}, {row['Star4']}", axis=1)

# Previewing the new 'combined_text' column!!
print(df[['Series_Title', 'combined_text']].head())


               Series_Title                                      combined_text
0  The Shawshank Redemption  The Shawshank Redemption. Two imprisoned men b...
1             The Godfather  The Godfather. An organized crime dynasty's ag...
2           The Dark Knight  The Dark Knight. When the menace known as the ...
3    The Godfather: Part II  The Godfather: Part II. The early life and car...
4              12 Angry Men  12 Angry Men. A jury holdout attempts to preve...


In [5]:
import re

# Preprocessing the combined text
def preprocess_text(text):
    # Converting them into lowercase
    text = text.lower()
    # Removing extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['processed_text'] = df['combined_text'].apply(preprocess_text)

print(df[['combined_text', 'processed_text']].head())


                                       combined_text  \
0  The Shawshank Redemption. Two imprisoned men b...   
1  The Godfather. An organized crime dynasty's ag...   
2  The Dark Knight. When the menace known as the ...   
3  The Godfather: Part II. The early life and car...   
4  12 Angry Men. A jury holdout attempts to preve...   

                                      processed_text  
0  the shawshank redemption. two imprisoned men b...  
1  the godfather. an organized crime dynasty's ag...  
2  the dark knight. when the menace known as the ...  
3  the godfather: part ii. the early life and car...  
4  12 angry men. a jury holdout attempts to preve...  


In [6]:
!pip install sentence-transformers
#importing the BERT- sentence model for embeddings



In [7]:
from sentence_transformers import SentenceTransformer

# Loading the Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # 'all-mpnet-base-v2' for more accuracy for now lets keep this ;)

# Generating embeddings for each movie's processed text, hehe almost done
df['embeddings'] = model.encode(df['processed_text'].tolist(), show_progress_bar=True).tolist()

print(df[['processed_text', 'embeddings']].head())


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

                                      processed_text  \
0  the shawshank redemption. two imprisoned men b...   
1  the godfather. an organized crime dynasty's ag...   
2  the dark knight. when the menace known as the ...   
3  the godfather: part ii. the early life and car...   
4  12 angry men. a jury holdout attempts to preve...   

                                          embeddings  
0  [-0.08407538384199142, -0.06424596160650253, -...  
1  [-0.09604866057634354, -0.044859278947114944, ...  
2  [-0.006180617958307266, -0.030387721955776215,...  
3  [-0.04965287446975708, -0.018728990107774734, ...  
4  [-0.06084310635924339, 0.02292006090283394, -0...  


In [8]:
!pip install faiss-cpu
#for storing and retriving the indexes:)

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [9]:
import faiss
import numpy as np

# Converting embeddings to a numpy array (required by FAISS)
embedding_dim = len(df['embeddings'][0])  # Get the dimension of the embeddings
embeddings_matrix = np.array(df['embeddings'].tolist()).astype('float32')

# Creating FAISS index
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance is common for semantic search
index.add(embeddings_matrix)  # Add embeddings to the index

# Defining a function to perform a semantic search
def search_movie(query, top_k=5):
    # Generating embedding for the query
    query_embedding = model.encode([query])[0].astype('float32')
    # Searching for the top_k nearest neighbors
    distances, indices = index.search(np.array([query_embedding]), top_k)
    # tadaaaa Retrieving the resultssssss!!!
    results = df.iloc[indices[0]]
    return results[['Series_Title', 'Overview', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating']]




In [10]:


# Example usage:
query = "going in wormhole"  # Replace with any query
results = search_movie(query, top_k=5)
print(results)


        Series_Title                                           Overview  \
21      Interstellar  A team of explorers travel through a wormhole ...   
66            WALL·E  In the distant future, a small waste-collectin...   
329      The Martian  An astronaut becomes stranded on Mars after hi...   
307  Ace in the Hole  A frustrated former big-city journalist now st...   
106           Aliens  Fifty-seven years after surviving an apocalypt...   

                            Genre           Director                Star1  \
21       Adventure, Drama, Sci-Fi  Christopher Nolan  Matthew McConaughey   
66   Animation, Adventure, Family     Andrew Stanton            Ben Burtt   
329      Adventure, Drama, Sci-Fi       Ridley Scott           Matt Damon   
307              Drama, Film-Noir       Billy Wilder         Kirk Douglas   
106     Action, Adventure, Sci-Fi      James Cameron     Sigourney Weaver   

                Star2             Star3          Star4  IMDB_Rating  
21      Anne Hat

In [13]:
import optuna

# Function for hyperparameter tuning
def tune_hyperparameters(trial):
    # Suggest Sentence-BERT model
    model_name = trial.suggest_categorical('model_name', ['all-MiniLM-L6-v2', 'all-mpnet-base-v2'])
    # Suggest number of top_k results
    top_k = trial.suggest_int('top_k', 3, 10)
    # Suggest similarity metric: L2 or Inner Product
    similarity_metric = trial.suggest_categorical('similarity_metric', ['L2', 'IP'])

    # Load the model
    model = SentenceTransformer(model_name)

    # Recreate embeddings
    embeddings = model.encode(df['processed_text'].tolist(), show_progress_bar=True)
    embeddings_matrix = np.array(embeddings).astype('float32')

    # Create FAISS index
    if similarity_metric == 'L2':
        index = faiss.IndexFlatL2(len(embeddings[0]))
    elif similarity_metric == 'IP':
        index = faiss.IndexFlatIP(len(embeddings[0]))
    index.add(embeddings_matrix)

    # Evaluate performance using a set of validation queries
    validation_queries = [
        "science fiction in space",
        "romantic movie",
        "superhero film",
        "historical drama",
        "comedy about family",
    ]
    scores = []
    for query in validation_queries:
        query_embedding = model.encode([query])[0].astype('float32')
        distances, indices = index.search(np.array([query_embedding]), top_k)
        # Calculate a dummy score for tuning purposes (e.g., using IMDB_Rating mean)
        retrieved_movies = df.iloc[indices[0]]
        avg_rating = retrieved_movies['IMDB_Rating'].mean()
        scores.append(avg_rating)

    # Objective to maximize: average of retrieved movie ratings
    return np.mean(scores)

# Start hyperparameter tuning
study = optuna.create_study(direction='maximize')  # Maximize IMDB rating
study.optimize(tune_hyperparameters, n_trials=20)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)


[I 2024-12-09 12:16:03,104] A new study created in memory with name: no-name-7f335b83-a37b-4574-b377-e721d4c08515


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:05,184] Trial 0 finished with value: 7.9333333333333345 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 3, 'similarity_metric': 'IP'}. Best is trial 0 with value: 7.9333333333333345.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:07,213] Trial 1 finished with value: 7.969999999999999 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 4, 'similarity_metric': 'IP'}. Best is trial 1 with value: 7.969999999999999.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:18,500] Trial 2 finished with value: 7.885714285714286 and parameters: {'model_name': 'all-mpnet-base-v2', 'top_k': 7, 'similarity_metric': 'IP'}. Best is trial 1 with value: 7.969999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:21,024] Trial 3 finished with value: 7.944 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 5, 'similarity_metric': 'IP'}. Best is trial 1 with value: 7.969999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:28,514] Trial 4 finished with value: 7.93 and parameters: {'model_name': 'all-mpnet-base-v2', 'top_k': 10, 'similarity_metric': 'IP'}. Best is trial 1 with value: 7.969999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:31,311] Trial 5 finished with value: 7.969999999999999 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 4, 'similarity_metric': 'L2'}. Best is trial 1 with value: 7.969999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:33,764] Trial 6 finished with value: 7.965714285714286 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 7, 'similarity_metric': 'L2'}. Best is trial 1 with value: 7.969999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:35,919] Trial 7 finished with value: 7.973999999999999 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 10, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:37,867] Trial 8 finished with value: 7.965714285714286 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 7, 'similarity_metric': 'L2'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:39,955] Trial 9 finished with value: 7.967500000000001 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 8, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:46,572] Trial 10 finished with value: 7.93 and parameters: {'model_name': 'all-mpnet-base-v2', 'top_k': 10, 'similarity_metric': 'L2'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:48,599] Trial 11 finished with value: 7.944 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 5, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:50,589] Trial 12 finished with value: 7.953333333333333 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 9, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:52,635] Trial 13 finished with value: 7.944 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 5, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:16:55,051] Trial 14 finished with value: 7.9333333333333345 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 3, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:17:01,415] Trial 15 finished with value: 7.9 and parameters: {'model_name': 'all-mpnet-base-v2', 'top_k': 6, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:17:03,396] Trial 16 finished with value: 7.953333333333333 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 9, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:17:05,505] Trial 17 finished with value: 7.969999999999999 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 4, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:17:12,241] Trial 18 finished with value: 7.8775 and parameters: {'model_name': 'all-mpnet-base-v2', 'top_k': 8, 'similarity_metric': 'L2'}. Best is trial 7 with value: 7.973999999999999.


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[I 2024-12-09 12:17:14,276] Trial 19 finished with value: 7.956666666666666 and parameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 6, 'similarity_metric': 'IP'}. Best is trial 7 with value: 7.973999999999999.


Best hyperparameters: {'model_name': 'all-MiniLM-L6-v2', 'top_k': 10, 'similarity_metric': 'IP'}


In [25]:
def search_movie_optimized(query, top_k):  # Updated
    # Generate embedding for the query
    query_embedding = model.encode([query])[0].astype('float32')
    query_embedding /= np.linalg.norm(query_embedding)

    distances, indices = index.search(np.array([query_embedding]), top_k)
    # Retrieve results
    results = df.iloc[indices[0]]
    return results[['Series_Title', 'Overview', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4', 'IMDB_Rating']]


In [81]:
# Example usage after optimising!
query = "Miguel seeks legendary singer in Land of the Dead  # Replace with your own query dude"
results = search_movie_optimized(query, top_k=3)
print(results)


            Series_Title                                           Overview  \
61                  Coco  Aspiring musician Miguel, confronted with his ...   
68              Oldeuboi  After being kidnapped and imprisoned for fifte...   
674  Dip huet seung hung  A disillusioned assassin accepts one last hit ...   

                            Genre        Director          Star1  \
61   Animation, Adventure, Family     Lee Unkrich  Adrian Molina   
68         Action, Drama, Mystery  Chan-wook Park   Choi Min-sik   
674          Action, Crime, Drama        John Woo   Yun-Fat Chow   

                Star2               Star3           Star4  IMDB_Rating  
61   Anthony Gonzalez  Gael García Bernal  Benjamin Bratt          8.4  
68         Yoo Ji-Tae      Kang Hye-jeong   Kim Byeong-Ok          8.4  
674         Danny Lee           Sally Yeh        Kong Chu          7.8  


In [40]:
import numpy as np

# Precision@k
def precision_at_k(results, relevant_items, k):
    retrieved_items = results[:k]
    relevant_retrieved = [item for item in retrieved_items if item in relevant_items]
    return len(relevant_retrieved) / k

# Recall@k
def recall_at_k(results, relevant_items, k):
    retrieved_items = results[:k]
    relevant_retrieved = [item for item in retrieved_items if item in relevant_items]
    return len(relevant_retrieved) / len(relevant_items)

# Mean Reciprocal Rank (MRR)
def mean_reciprocal_rank(all_results, all_relevant_items):
    mrr = 0
    for results, relevant_items in zip(all_results, all_relevant_items):
        for rank, item in enumerate(results, start=1):
            if item in relevant_items:
                mrr += 1 / rank
                break
    return mrr / len(all_results)

# nDCG@k
def ndcg_at_k(results, relevant_items, k):
    def dcg(items, k):
        """Calculate the Discounted Cumulative Gain."""
        return sum((1 / np.log2(i + 2)) for i, item in enumerate(items[:k]) if item in relevant_items)

    # DCG for the results
    dcg_value = dcg(results, k)

    # Ideal DCG assumes all relevant items are ranked at the top
    ideal_ranking = relevant_items[:k]
    idcg_value = dcg(ideal_ranking, k)

    return dcg_value / idcg_value if idcg_value > 0 else 0






In [84]:
# Updated list with correct top 1 or 2 predictions for each query
test_queries = [
    "going in wormhole",
    "space exploration",
    "Wizardry for their third year of study",
    "Batman faces psychological and physical tests as he confronts the Joker's chaos in Gotham",
    "Roman General seeks revenge on emperor",
    "A young Viking befriends a dragon and discovers they are not as dangerous as believed",
    "Former neo-nazi tries to stop his brother from making the same mistakes",
    "Sole survivor recounts gun battle after criminals meet at a lineup",
    "Tramp falls in love with blind flower girl and gets help from a wealthy tippler to support her",
    "Miguel seeks legendary singer in Land of the Dead",
    "Eight years after the Joker's reign, Batman saves Gotham from Bane",
    "Two friends search for their lost companion and recall college memories",
    "Eight-year-old boy is misunderstood until an art teacher helps him",
    "Waste-collecting robot embarks on a journey to save humanity",
    "Agent surveils writer in East Berlin, becomes absorbed in their lives",
    "Man imprisoned for 15 years seeks revenge on captor",
    "Man with memory loss hunts his wife's murderer",
    "Ashitaka seeks cure and peace in a war between forest gods and humans",
    "Prohibition-era gangster returns to Manhattan to confront past regrets",
    "Indiana Jones seeks Ark of the Covenant before Nazis find it",
    "Family in isolated hotel faces violence and supernatural forces",
    "U.S. officer must assassinate a rogue Special Forces Colonel in Vietnam",
    "Space crew attacked by lifeform after receiving distress call",
    "Terminally ill man lives life to the fullest before death",
    "Shoe executive's son kidnapped for ransom",
    "Insane general threatens nuclear war while politicians scramble to stop him",
    "British barrister defends client in a shocking murder trial",
    "General accuses soldiers of cowardice; commanding officer defends them",
    "Photographer spies on neighbors and believes one committed murder",
    "Screenwriter develops dangerous relationship with faded film star",
    "Dictator expands empire while poor Jewish barber avoids persecution",
    "Soldiers race against time to stop deadly trap in WWI",
    "Goddess who created universe faces consequences of a temple built for her first-born",
    "Blind pianist's life changes as he uncovers a crime he shouldn't know about",
    "Man takes extreme measures to save family from punishment after accidental crime"
]

ground_truth_relevant = [
    ["Interstellar"],
    ["Star Trek"],
    ["Harry Potter and the Prisoner of Azkaban"],
    ["The Dark Knight"],
    ["Gladiator"],
    ["How to Train Your Dragon"],
    ["American History X"],
    ["The Usual Suspects"],
    ["City Lights"],
    ["Coco"],
    ["The Dark Knight Rises", "The Dark Knight"],
    ["Searching", "The Searchers"],
    ["Taare Zameen Par", "Wonder"],
    ["WALL·E", "The Terminator"],
    ["The Lives of Others"],
    ["Celda 211"],
    ["Memento", "Anatomy of a Murder"],
    ["Mononoke-hime", "Kaze no tani no Naushika"],
    ["Once Upon a Time in America", "The Untouchables"],
    ["Raiders of the Lost Ark"],
    ["The Shining", "Room"],
    ["Apocalypse Now"],
    ["Alien", "Serenity"],
    ["Anand"],
    ["The Usual Suspects", "Tengoku to jigoku"],
    ["Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb"],
    ["Witness for the Prosecution", "Anatomy of a Murder"],
    ["Paths of Glory", "Platoon"],
    ["Rear Window", "The Conversation"],
    ["Her", "Lost in Translation"],
    ["The Great Dictator", "Der Untergang"],
    ["1917", "Stalag 17"],
    ["Tumbbad", "The Theory of Everything"],
    ["Andhadhun", "The Pianist"],
    ["Drishyam"]
]



In [85]:
# Performing search for test queries
all_results = []
for query in test_queries:
    results = search_movie_optimized(query, top_k=10)  # Use the optimized search function
    all_results.append(results['Series_Title'].tolist())
    print(f"Query: {query}")
    print(f"Predicted Results: {results['Series_Title'].tolist()}")

# Evaluating each query for Top=1
ks = [1]
for k in ks:
    print(f"\nEvaluation Metrics for Top-{k}:")
    precision_scores = [precision_at_k(results, relevant, k) for results, relevant in zip(all_results, ground_truth_relevant)]
    recall_scores = [recall_at_k(results, relevant, k) for results, relevant in zip(all_results, ground_truth_relevant)]
    ndcg_scores = [ndcg_at_k(results, relevant, k) for results, relevant in zip(all_results, ground_truth_relevant)]

    # Calculating score  MRR
    mrr_score = mean_reciprocal_rank(all_results, ground_truth_relevant)

    #Printing  scores
    print(f"Precision@{k}: {np.mean(precision_scores):.4f}")
    print(f"Recall@{k}: {np.mean(recall_scores):.4f}")
    print(f"nDCG@{k}: {np.mean(ndcg_scores):.4f}")
    print(f"MRR: {mrr_score:.4f}")


Query: going in wormhole
Predicted Results: ['Interstellar', 'WALL·E', 'The Martian', 'Ace in the Hole', 'Aliens', 'Hidden Figures', '2001: A Space Odyssey', 'Arrival', 'Gravity', 'Back to the Future']
Query: space exploration
Predicted Results: ['Star Trek', 'Moon', 'Interstellar', 'The Right Stuff', 'Gravity', 'Arrival', 'Solaris', 'The Martian', 'WALL·E', 'Planet of the Apes']
Query: Wizardry for their third year of study
Predicted Results: ["Harry Potter and the Sorcerer's Stone", 'Harry Potter and the Prisoner of Azkaban', 'Harry Potter and the Goblet of Fire', 'Harry Potter and the Half-Blood Prince', 'Harry Potter and the Deathly Hallows: Part 1', 'The Wizard of Oz', 'Harry Potter and the Deathly Hallows: Part 2', 'The Secret of Kells', 'Les choristes', 'Dazed and Confused']
Query: Batman faces psychological and physical tests as he confronts the Joker's chaos in Gotham
Predicted Results: ['The Dark Knight', 'Joker', 'The Dark Knight Rises', 'Batman Begins', 'Batman: Mask of the