# Search Engine using BM25

In [1]:
# Install required packages
!pip install numpy pandas rank_bm25 nltk



You should consider upgrading via the 'C:\Users\dell\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')
from rank_bm25 import BM25Okapi

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dataset
  The dataset includes startup names, descriptions, locations, and pictures. The raw parsed data can be accessed via this https://huggingface.co/datasets/hugginglearners/netflix-shows/tree/main.

In [4]:
# Load the dataset from CSV
csv_path = '../netflix_titles.csv'
df = pd.read_csv(csv_path)

In [5]:
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


## Preprocess Data :

In this step, we will preprocess the data to ensure data quality, consistency, and suitability for further processing, improving analysis and modeling accuracy.

   -  Handling Missing Values:
       - For numeric columns, fill missing values with the median.
       - For categorical columns, fill missing values with the most frequent value.
    
   - Drop Duplicates:
       - Remove duplicate rows from the dataset if any.
    
   - Convert Text to Lowercase:
       - Convert all text data to lowercase for uniformity.

   - Tokenization and Stopword Removal:
       - Tokenize text data using NLTK's word_tokenize.
       - Remove English stopwords using NLTK's set of stopwords.
    
   - Stemming:
       - Apply Snowball Stemming for English language to reduce words to their root form.

In [6]:
def preprocess_data(df):
    # Drop duplicates, if any
    df = df.drop_duplicates()

    # Handling missing values
    # For numeric columns, fill missing values with median
    numeric_cols = df.select_dtypes(include='number').columns
    for col in numeric_cols:
        df[col].fillna(df[col].median(), inplace=True)

    # For categorical columns, fill missing values with most frequent value
    categorical_cols = df.select_dtypes(include='object').columns
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)

    # Convert all text data to lowercase
    df = df.applymap(lambda x: x.lower() if isinstance(x, str) else '')

    # Prepare NLTK for English Language Preprocessing
    eng_stopwords = set(stopwords.words('english'))  # Set of English stopwords
    stemmer = SnowballStemmer('english')  # Snowball Stemmer for English language

    # Preprocess text in categorical columns using NLTK
    for col in categorical_cols:
        if df[col].dtype == 'object':
            df[col] = df[col].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in eng_stopwords]))
            
    return df


df_cleaned = preprocess_data(df)

In [7]:
def preprocess_query(input_string):
    # Convert the input string to lowercase
    input_string = input_string.lower()

    # Prepare NLTK for English Language Preprocessing
    eng_stopwords = set(stopwords.words('english'))  # Set of English stopwords
    stemmer = SnowballStemmer('english')  # Snowball Stemmer for Englsih language

    # Tokenize the input string
    words = word_tokenize(input_string)

    # Remove English stopwords and apply stemming
    preprocessed_words = [stemmer.stem(word) for word in words if word.lower() not in eng_stopwords]

    # Combine the preprocessed words into a single string
    preprocessed_string = ' '.join(preprocessed_words)

    return preprocessed_string

In [8]:
df_cleaned.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movi,dick johnson dead,kirsten johnson,david attenborough,unit state,"septemb 25 , 2021",,pg-13,90 min,documentari,"father near end life , filmmak kirsten johnson..."


In [9]:
# Define the search function using BM25Okapi
def search(query, column_name, num_results=3):
    # Preprocess the search query
    search_tokens = preprocess_query(query).split(" ")

    # Create the tokenized corpus for the specified column
    tokenized_corpus = [doc.split(" ") for doc in df_cleaned[column_name]]

    # Create BM25Okapi object for the specified column
    bm25 = BM25Okapi(tokenized_corpus)

    # Calculate BM25 scores
    scores = bm25.get_scores(search_tokens)

    # Get top indexes based on BM25 scores
    top_indexes = np.argsort(scores)[::-1][:num_results]
    return top_indexes

In [10]:
query = 'fluffy animal'
column_name = 'description'

# Perform search using BM25Okapi
search_results_indexes = search(query,column_name)

In [11]:
search_results_indexes

array([5660, 4153, 8172], dtype=int64)

In [12]:
# Get the search results DataFrame
df_search = df.loc[search_results_indexes]

# Print the search results
print("Search Results:")
for i, row in enumerate(df_search.iterrows(), 1):
    print(f"\nResult {i}:")
    for col, value in row[1].items():
        print(f"\t{col}: {value}")

Search Results:

Result 1:
	show_id: s5661
	type: Movie
	title: Gabriel lglesias: I’m Sorry For What I Said When I Was Hungry
	director: Manny Rodriguez
	cast: Gabriel Iglesias
	country: United States
	date_added: December 20, 2016
	release_year: 2016
	rating: TV-14
	duration: 88 min
	listed_in: Stand-Up Comedy
	description: Hawaiian-shirt enthusiast Gabriel "Fluffy" Iglesias finds the laughs in racist gift baskets, Prius-driving cops and all-female taco trucks.

Result 2:
	show_id: s4154
	type: Movie
	title: Gabriel "Fluffy" Iglesias: One Show Fits All
	director: Manny Rodriguez
	cast: Gabriel Iglesias
	country: nan
	date_added: January 29, 2019
	release_year: 2019
	rating: TV-14
	duration: 91 min
	listed_in: Stand-Up Comedy
	description: Gabriel "Fluffy" Iglesias discusses his teenage son and encounters with Snoop Dogg, Chris Rock and Vicente Fernández in this stand-up special for 2019.

Result 3:
	show_id: s8173
	type: TV Show
	title: Th Eena Meena Deeka Chase Comedy Show
	director: