In [1]:
#Name: Devarsh Vora

import kaggle
import pandas as pd
import zipfile
import transformers
# Authenticate with Kaggle API
kaggle.api.authenticate()

# Download dataset
dataset = "harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows"
kaggle.api.dataset_download_files(dataset)

# Extract the downloaded zip file
with zipfile.ZipFile("imdb-dataset-of-top-1000-movies-and-tv-shows.zip", "r") as zip_ref:
    zip_ref.extractall(".")

# Read the dataset into a DataFrame
movies = pd.read_csv("imdb_top_1000.csv")

# Keep only desired columns
desired_columns = ['Series_Title', 'Genre', 'Overview', 'Director']
movies = movies[desired_columns]

# Display selected columns
print(movies.head(10))


                                    Series_Title                      Genre  \
0                       The Shawshank Redemption                      Drama   
1                                  The Godfather               Crime, Drama   
2                                The Dark Knight       Action, Crime, Drama   
3                         The Godfather: Part II               Crime, Drama   
4                                   12 Angry Men               Crime, Drama   
5  The Lord of the Rings: The Return of the King   Action, Adventure, Drama   
6                                   Pulp Fiction               Crime, Drama   
7                               Schindler's List  Biography, Drama, History   
8                                      Inception  Action, Adventure, Sci-Fi   
9                                     Fight Club                      Drama   

                                            Overview              Director  
0  Two imprisoned men bond over a number of years... 

In [2]:
#Converting pandas dataframe to Huggingface dataset
from datasets import Dataset
movie_dataset = Dataset.from_pandas(movies)
movie_dataset

Dataset({
    features: ['Series_Title', 'Genre', 'Overview', 'Director'],
    num_rows: 1000
})

In [3]:
#Concatenating all the text field 
def concatenate_text(data): 
    return {"text": data['Series_Title']+ '\n' + data['Genre']+ '\n' + data['Overview']+ '\n'+ data['Director']}

movie_dataset = movie_dataset.map(concatenate_text)
movie_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['Series_Title', 'Genre', 'Overview', 'Director', 'text'],
    num_rows: 1000
})

In [4]:
movie_dataset['text'][0]

'The Shawshank Redemption\nDrama\nTwo imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.\nFrank Darabont'

In [5]:
#Sentence Embedding Model
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

In [6]:
  # Embed the Text
  def embed_text(data):
    text = data['text']
    embeddings = model.encode(text)  # Get embeddings for the text
    return {"text": text, "embedding": embeddings.tolist()}  # Convert to a list

  movie_dataset = movie_dataset.map(embed_text, batched=True)  # Process in batches for efficiency

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
from elasticsearch import Elasticsearch

# Authentication credentials
username = "devarsh"
password = "devarsh"

# Connect to Elasticsearch with authentication (adjust details as needed)
es = Elasticsearch(["http://localhost:9200"], http_auth=(username, password))

# Define the index name
index_name = "1000_topmovies" 

# Iterate through the dataset and index each record
for data in movie_dataset:
    text = data["text"]
    embedding = data["embedding"]
    # Index the document
    es.index(index=index_name, body={"text": text, "embedding": embedding}, id=text)

print("Embeddings stored in Elasticsearch!")


  es = Elasticsearch(["http://localhost:9200"], http_auth=(username, password))


Embeddings stored in Elasticsearch!


In [8]:
from sentence_transformers import util

# Function to perform semantic search
def semantic_search(query, es, model):
    # Embed the query
    query_embedding = model.encode([query])[0]
    
    # Search similar embeddings
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_embedding.tolist()}
            }
        }
    }
    
    # Search with custom scoring
    results = es.search(index=index_name, body={"size": 5, "query": script_query})
    
    return results['hits']['hits']

# Function to print results
def print_results(results):
    print("Search Results:")
    for i, result in enumerate(results):
        print(f"Result {i+1}:")
        print(f"Text: {result['_source']['text']}")

        print("")

# Example usage
query = ""
results = semantic_search(query, es, model)

# Print the results
print_results(results)



Search Results:
Result 1:
Text: The Incredibles
Animation, Action, Adventure
A family of undercover superheroes, while trying to live the quiet suburban life, are forced into action to save the world.
Brad Bird

Result 2:
Text: OMG: Oh My God!
Comedy, Drama, Fantasy
A shopkeeper takes God to court when his shop is destroyed by an earthquake.
Umesh Shukla

Result 3:
Text: The Peanut Butter Falcon
Adventure, Comedy, Drama
Zak runs away from his care home to make his dream of becoming a wrestler come true.
Tyler Nilson

Result 4:
Text: Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni
Animation, Action, Drama
Concurrent theatrical ending of the TV series Shin seiki evangerion (1995).
Hideaki Anno

Result 5:
Text: Vampire Hunter D: Bloodlust
Animation, Action, Fantasy
When a girl is abducted by a vampire, a legendary bounty hunter is hired to bring her back.
Yoshiaki Kawajiri

