In [1]:
# Install required packages
!pip install transformers nltk



You should consider upgrading via the 'C:\Users\dell\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cdist
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the dataset from CSV
csv_path = '../netflix_titles.csv'
df = pd.read_csv(csv_path)

In [8]:
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


In [9]:
def preprocess_data(df):
    # Drop duplicates, if any
    df = df.drop_duplicates()

    # Handling missing values
    # For numeric columns, fill missing values with median
    numeric_cols = df.select_dtypes(include='number').columns
    for col in numeric_cols:
        df[col].fillna(df[col].median(), inplace=True)

    # For categorical columns, fill missing values with most frequent value
    categorical_cols = df.select_dtypes(include='object').columns
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)

    # Convert all text data to lowercase
    df = df.applymap(lambda x: x.lower() if isinstance(x, str) else '')

    # Prepare NLTK for English Language Preprocessing
    eng_stopwords = set(stopwords.words('english'))  # Set of English stopwords
    stemmer = SnowballStemmer('english')  # Snowball Stemmer for English language

    # Preprocess text in categorical columns using NLTK
    for col in categorical_cols:
        if df[col].dtype == 'object':
            df[col] = df[col].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in eng_stopwords]))
            
    return df


df_movies = preprocess_data(df)

In [10]:
df_movies.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movi,dick johnson dead,kirsten johnson,david attenborough,unit state,"septemb 25 , 2021",,pg-13,90 min,documentari,"father near end life , filmmak kirsten johnson..."


In [11]:
def preprocess_query(input_string):
    # Convert the input string to lowercase
    input_string = input_string.lower()

    # Prepare NLTK for English Language Preprocessing
    eng_stopwords = set(stopwords.words('english'))  # Set of English stopwords
    stemmer = SnowballStemmer('english')  # Snowball Stemmer for Englsih language

    # Tokenize the input string
    words = word_tokenize(input_string)

    # Remove English stopwords and apply stemming
    preprocessed_words = [stemmer.stem(word) for word in words if word.lower() not in eng_stopwords]

    # Combine the preprocessed words into a single string
    preprocessed_string = ' '.join(preprocessed_words)

    return preprocessed_string

## Different Models from Transformers library from Hugging Face
   1. BERT
       https://huggingface.co/docs/transformers/model_doc/bert
   2. ALBERT
       https://huggingface.co/docs/transformers/model_doc/albert
   3. RoBERTa
       https://huggingface.co/docs/transformers/model_doc/roberta
   4. XLNet
       https://huggingface.co/docs/transformers/model_doc/xlnet

In [12]:
# text encoding steps

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [16]:
# Encode the 'LongDesc' column using BERT
plots = list(df_movies['description'])
batch_size = 10
num_texts = len(plots)
encoded_texts = []
for i in tqdm(range(0, num_texts, batch_size)):
    batch_texts = plots[i:i + batch_size]
    # Replace NaN values with an empty string
    batch_texts = [text if isinstance(text, str) else "" for text in batch_texts]
    encoded_batch = [tokenizer.encode(text, max_length=125, padding='max_length', truncation=True, add_special_tokens=True) for text in batch_texts]
    encoded_texts.extend(encoded_batch)

100%|████████████████████████████████████████████████████████████████████████████████| 881/881 [00:10<00:00, 85.28it/s]


In [17]:
input_ids = torch.tensor(encoded_texts)
embeddings = []

In [18]:
# Generate embeddings for the query and move it to the device
model.eval()
with torch.no_grad():
    for i in tqdm(range(0, num_texts, batch_size)):
        input_batch = input_ids[i:i + batch_size]
        outputs = model(input_batch)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings.extend(batch_embeddings)

100%|██████████████████████████████████████████████████████████████████████████████| 881/881 [2:19:58<00:00,  9.53s/it]


In [19]:
# Add the encoded embeddings into the DataFrame
df_movies['encoded_text'] = embeddings

In [32]:
def perform_search(query_text, num_results=5):
    query = preprocess_query(query_text)
    encoded_input = tokenizer.encode(query, max_length=125, padding='max_length', truncation=True, return_tensors='pt')
    output = model(encoded_input)
    embedding_query = output.last_hidden_state.mean(dim=1).detach().numpy()

    # Convert text embeddings to a numpy array
    texts_encoded = list(df_movies['encoded_text'])
    text_embeddings = np.array(texts_encoded)

    # Convert the 1-dimensional array to a 2-dimensional array
    text_embeddings = np.stack(text_embeddings)

    similarities = 1 - cdist(text_embeddings, embedding_query, metric='cosine').flatten()
    df['similarities'] = similarities
    filtered_df = df.nlargest(num_results, 'similarities')

    suggestions_dict = {}
    for index, row in filtered_df.iterrows():
        suggestion_key = f"Index {index + 1}"
        suggestion_value = {}
        for col in df.columns:
            if col != 'encoded_text':  # Exclude 'encoded_text' from the suggestions
                suggestion_value[col] = row[col]
        suggestions_dict[suggestion_key] = suggestion_value

    return suggestions_dict

In [33]:
# Input your search query
query = 'fluffy animal'
k = 3  # Number of suggestions to retrieve

# Perform the search and get suggestions
suggestions = perform_search(query, k)

  text_embeddings = np.array(texts_encoded)
  text_embeddings = np.array(texts_encoded)


In [48]:
# Print the suggestions with specific parameters
for i, (suggestion_key, suggestion_value) in enumerate(suggestions.items(), 1):
    print(f"\nResult {i}:")
    print(f"\tTitle: {suggestion_value['title']}")
    print(f"\tType: {suggestion_value['type']}")
    print(f"\tRelease Year: {suggestion_value['release_year']}")
    print(f"\tRating: {suggestion_value['rating']}")
    print(f"\tDescription: {suggestion_value['description']}")
    print(f"\tScore: {suggestion_value['similarities']}")
    print("_"*120)


Result 1:
	Title: The Magic School Bus Rides Again Kids In Space
	Type: Movie
	Release Year: 2020
	Rating: TV-Y
	Description: The Magic School Bus kids blast into orbit — and onto the International Space Station — only to find themselves on the run from a giant tardigrade!
	Score: 0.990022973038176
________________________________________________________________________________________________________________________

Result 2:
	Title: Super Monsters Monster Pets
	Type: TV Show
	Release Year: 2019
	Rating: TV-Y
	Description: The adorably magical Monster Pets star in a series of short adventures that are big on fun – and full of surprises!
	Score: 0.9899718590349479
________________________________________________________________________________________________________________________

Result 3:
	Title: Mighty Express
	Type: TV Show
	Release Year: 2021
	Rating: TV-Y
	Description: Catch a ride with the Mighty Express — a team of trains and their kid friends who overcome trouble on the t