# Import dependencies, load the data, and take a first look at it

In [None]:
# packages for simplest dealing with data
import pandas as pd
import numpy as np

# packages for dealing with text data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.lang.en import stop_words
# Load spaCy's small English pipeline once; remove_stop_words and lemmatize below call it.
nlp = spacy.load('en_core_web_sm')
# Rebind `stop_words` from the imported spaCy module to its STOP_WORDS collection;
# remove_stop_words uses it as the default stop list.
stop_words = stop_words.STOP_WORDS

In [None]:
# necessary data
# Single place to change when the files live somewhere other than this Drive folder.
DATA_DIR = '/content/drive/MyDrive'
df_main = pd.read_csv(f'{DATA_DIR}/imdb_clean_titles_4_10.csv')
df_summary = pd.read_json(f'{DATA_DIR}/title_summary.json')

In [None]:
# Preview the first two rows of the main titles table.
df_main.head(2)

Unnamed: 0,original_title_name,title_name,poster,rating_view,release_year,country,language,title_duration_second,title_duration,imdb_rating,vote_count,title_url,title_trailer,genres,review_count,critic_review_count,cast,directors_creators,short_plot
0,Zeitgeist: Moving Forward,Zeitgeist: Moving Forward,https://m.media-amazon.com/images/M/MV5BNzc0OD...,Unrated,2011,United States,English,9660.0,2h 41m,8.1,18841,https://www.imdb.com/title/tt1781069/,https://www.imdb.com/video/vi2765593881/,Documentary,67,3,"Peter Joseph, Robert Sapolsky, Gabor Maté, Ric...",Peter Joseph,A feature length documentary work which presen...
1,Life in a Day,Life in a Day,https://m.media-amazon.com/images/M/MV5BMjE4MD...,PG-13,2011,"United Kingdom, United States","English, Italian, Japanese, German, Spanish, I...",5700.0,1h 35m,7.6,15664,https://www.imdb.com/title/tt1687247/,https://www.imdb.com/video/vi3867843609/,"Documentary, Drama",64,95,"Hiroaki Aikawa, Cindy Baer, Teagan Bentley, Es...","Tegan Bukowski, Loressa Clisby, Kevin Macdonald",A documentary shot by film-makers all over the...


In [None]:
# Preview the first rows of the title summaries table.
df_summary.head()

Unnamed: 0,title_url,title_summary
0,https://www.imdb.com/title/tt0971209,Two pairs of lovers on a Hawaiian vacation dis...
1,https://www.imdb.com/title/tt12731980,A farmer takes in an injured man with a satche...
2,https://www.imdb.com/title/tt0112722,A criminal psychologist who turned agoraphobic...
3,https://www.imdb.com/title/tt0032976,A self-conscious woman juggles adjusting to he...
4,https://www.imdb.com/title/tt4374208,American man Rob gets Irish woman Sharon pregn...


# Simple text preprocessing for the future recommendation system

In [None]:
# First, pick the longest '/'-separated piece of each title summary, then
# combine it with the necessary columns from the main dataframe.

def find_the_biggest_piece_of_text(text):
  """Return the longest '/'-separated piece of ``text``.

  Args:
    text: Summary string that may hold several pieces joined by '/'.
      Non-string values (e.g. ``None`` or a ``NaN`` from a missing
      summary) are treated as empty text.

  Returns:
    The longest piece, or '' when ``text`` is empty or not a string.
    When several pieces share the maximum length, the last one wins.
  """
  if not isinstance(text, str):
    return ''
  # max() keeps the first maximal item it meets, so scanning the pieces in
  # reverse settles ties on the last piece.
  return max(reversed(text.split('/')), key=len)


# Keep only the longest summary piece for every title.
df_summary['biggest_title_summary'] = df_summary['title_summary'].apply(find_the_biggest_piece_of_text)

# URLs in df_main end with '/', the ones in df_summary do not (see the previews above).
# Normalise to exactly one trailing slash: blindly appending '/' would produce '//' when
# this cell is re-run, and the inner merge below would then silently return no rows.
df_summary['title_url'] = df_summary['title_url'].str.rstrip('/') + '/'

# Attach title name, genres, cast and directors from the main table by URL.
df = df_summary.merge(
    df_main[['original_title_name', 'title_url', 'genres', 'cast', 'directors_creators']],
    how='inner',
    on='title_url',
)
df = df[['original_title_name', 'title_url', 'genres', 'cast', 'directors_creators', 'title_summary', 'biggest_title_summary']]

In [None]:
# Preview the merged table that the rest of the notebook works on.
df.head()

Unnamed: 0,original_title_name,title_url,genres,cast,directors_creators,title_summary,biggest_title_summary
0,A Perfect Getaway,https://www.imdb.com/title/tt0971209/,"Drama, Mystery, Thriller","Milla Jovovich, Steve Zahn, Timothy Olyphant, ...",David Twohy,Two pairs of lovers on a Hawaiian vacation dis...,"On Hawaiian honeymoon, Hollywood Screenwriter ..."
1,Old Henry,https://www.imdb.com/title/tt12731980/,"Action, Drama, Western","Tim Blake Nelson, Scott Haze, Gavin Lewis, Tra...",Potsy Ponciroli,A farmer takes in an injured man with a satche...,A long trail of blood and a worn-out leather s...
2,Copycat,https://www.imdb.com/title/tt0112722/,"Drama, Mystery, Thriller","Sigourney Weaver, Holly Hunter, Dermot Mulrone...",Jon Amiel,A criminal psychologist who turned agoraphobic...,"In San Francisco, the criminal psychologist He..."
3,Rebecca,https://www.imdb.com/title/tt0032976/,"Drama, Mystery, Romance, Thriller","Laurence Olivier, Joan Fontaine, George Sander...",Alfred Hitchcock,A self-conscious woman juggles adjusting to he...,"""It wouldn't make for sanity, would it? Living..."
4,Catastrophe,https://www.imdb.com/title/tt4374208/,Comedy,"Sharon Horgan, Rob Delaney, Mark Bonnar, Ashle...",No info,American man Rob gets Irish woman Sharon pregn...,American man Rob gets Irish woman Sharon pregn...


## Remove punctuation and stop words, then lemmatize the words

In [None]:
def remove_punctuation(string, punctuation_to_remove="""#$%"-!&()*+,-./:;<=>?@[\\]^_`{|}~'"""):
  """Return ``string`` without the characters listed in ``punctuation_to_remove``.

  Args:
    string: Text to clean.
    punctuation_to_remove: Characters to drop; every character of ``string``
      is tested against it with ``in``.

  Returns:
    The remaining characters, in their original order.
  """
  kept_chars = []
  for symbol in string:
    if symbol in punctuation_to_remove:
      continue
    kept_chars.append(symbol)
  return ''.join(kept_chars)


def remove_stop_words(string, stoplist=stop_words):
  """Drop stop words from ``string``.

  Args:
    string: Text to filter.
    stoplist: Container of token texts to drop; defaults to the module-level
      ``stop_words``. A token is dropped only when its exact text is in it.

  Returns:
    The texts of the surviving tokens joined by single spaces.
  """
  # Only the token texts are needed here, so run just the tokenizer instead of
  # the whole pipeline that nlp(string) would execute for every row.
  doc = nlp.make_doc(string)
  return ' '.join(token.text for token in doc if token.text not in stoplist)



# lemmatization words
def lemmatize(tokens):
  """Replace every token of ``tokens`` with its lemma.

  Args:
    tokens: Text to run through the module-level ``nlp`` pipeline.

  Returns:
    The ``lemma_`` of each token, joined by single spaces.
  """
  lemmas = []
  for word in nlp(tokens):
    lemmas.append(word.lemma_)
  return ' '.join(lemmas)


# Build the model input in one pass per step:
# lowercase -> strip punctuation -> drop stop words -> lemmatize.
df['text_for_recommendation'] = (
    df['biggest_title_summary']
    .apply(lambda summary: summary.lower())
    .apply(remove_punctuation)
    .apply(remove_stop_words)
    .apply(lemmatize)
)

In [None]:
# Compare the raw summary of the first title with its preprocessed version.
print(
    df['biggest_title_summary'].iloc[0],
    '\n',
    df['text_for_recommendation'].iloc[0],
    sep='\n',
)

On Hawaiian honeymoon, Hollywood Screenwriter Cliff Anderson and his bride Cydney go from Honolulu to pristine island Kaua'i for an adventurous hike in the nature reserve, for which Cliff forgets his permit in the supply store. Reluctance to give macho Kale and his girl Cleo a ride creates tension that remains when they meet on the trail, with a third couple taking the lead: studly special forces Irak veteran Nick, who even hunts wild goats, and his equally exhibitionist steady girl Gina. Receiving the news that a serial killer who murdered a couple on Oahu probably passed to the island, as they did, starts the couples mistrusting each-other. Furthermore they seem to be followed by mystery stalkers, as turns out of two kinds. After the battle of whit, a bloody struggle for survival is inevitable: the killer is among them and strikes, despite a police helicopter searching for him.


hawaiian honeymoon hollywood screenwriter cliff anderson bride cydney honolulu pristine island kauai adve

# Building a simple TF-IDF content-based movie recommendation system

## Title summary tf-idf recommendation system

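`TfidfVectorizer` computes a term frequency–inverse document frequency (TF-IDF) weight for each term of each document. TF-IDF increases the weight of terms that occur in few documents: $\text{tf-idf}(t, d) = \text{tf}(t, d) \cdot \text{idf}(t)$. With scikit-learn's default settings, $\text{idf}(t) = \ln\frac{1 + n}{1 + \text{df}(t)} + 1$, where $n$ is the number of documents and $\text{df}(t)$ is the number of documents that contain $t$; each document vector is then L2-normalized.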

In [None]:
# Vectorize the preprocessed summaries.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df["text_for_recommendation"])

# Pairwise cosine similarity between all summaries, used later to look up
# similar titles. With a single argument the matrix is compared with itself.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

In [None]:
# make simple function for searching top 10 similar titles

def get_recommendation(movie, cosine_similarity_matrix, top_n=10):
    """Return the titles most similar to ``movie``.

    Args:
        movie: Exact value to look up in ``df['original_title_name']``. If
            several rows carry that title, the first one is used.
        cosine_similarity_matrix: Similarity matrix (NumPy array or torch
            tensor) whose row ``i`` holds the scores of row ``i`` of ``df``
            against every row of ``df``.
        top_n: Number of recommendations to return (default 10).

    Returns:
        DataFrame with the ``original_title_name`` and ``title_url`` of the
        ``top_n`` most similar titles, best match first.

    Raises:
        IndexError: If ``movie`` is not present in ``df``.
    """
    # Work with row positions, not index labels: the matrix is positional and
    # the result is selected with .iloc.
    matches = np.flatnonzero((df["original_title_name"] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"title {movie!r} not found in df['original_title_name']")
    movie_position = int(matches[0])

    row = cosine_similarity_matrix[movie_position]
    if hasattr(row, "detach"):
        # torch tensor: bring it to host memory before converting to NumPy
        row = row.detach().cpu().numpy()
    scores = np.asarray(row, dtype=float).ravel()

    # Stable descending sort: equal scores keep their row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query itself explicitly instead of assuming it is ranked first:
    # a title with an identical vector ties with it and may sort ahead of it.
    ranked = ranked[ranked != movie_position][:top_n]

    return df[['original_title_name', 'title_url']].iloc[ranked]

### Testing our simplest summary content based tf-idf recommendation system

In [None]:
# Look up similar movies for Iron Man, The Dark Knight and Interstellar.
titles = ['Iron Man', 'The Dark Knight', 'Interstellar']

for title in titles:
    # Header: the title in bold (ANSI escape codes) framed by two rules.
    print("=" * 40, f"🎬 Recommendations for: \033[1m{title}\033[0m", "=" * 40, sep="\n")

    recommendations = get_recommendation(title, cosine_sim_matrix)

    name_url_pairs = zip(recommendations['original_title_name'], recommendations['title_url'])
    for position, (name, url) in enumerate(name_url_pairs, start=1):
        print(f"{position}. \033[1m{name}\033[0m ({url}).")

    print("\n" + "=" * 40 + "\n")

🎬 Recommendations for: [1mIron Man[0m
1. [1mIron Man 2[0m (https://www.imdb.com/title/tt1228705/).
2. [1mIron Man Three[0m (https://www.imdb.com/title/tt1300854/).
3. [1mZero Effect[0m (https://www.imdb.com/title/tt0120906/).
4. [1mScarface[0m (https://www.imdb.com/title/tt0023427/).
5. [1mMarried to the Mob[0m (https://www.imdb.com/title/tt0095593/).
6. [1mUrban Legends: Final Cut[0m (https://www.imdb.com/title/tt0192731/).
7. [1mHours[0m (https://www.imdb.com/title/tt2094018/).
8. [1mThe Avengers[0m (https://www.imdb.com/title/tt0848228/).
9. [1mThe Dark Half[0m (https://www.imdb.com/title/tt0106664/).
10. [1mSpider-Man: Homecoming[0m (https://www.imdb.com/title/tt2250912/).


🎬 Recommendations for: [1mThe Dark Knight[0m
1. [1mThe Dark Knight Rises[0m (https://www.imdb.com/title/tt1345836/).
2. [1mBatman Returns[0m (https://www.imdb.com/title/tt0103776/).
3. [1mBatman Begins[0m (https://www.imdb.com/title/tt0372784/).
4. [1mBatman: The Long Halloween, P

The results are decent, though not perfect. For example, we get good results for franchises like The Dark Knight (Batman), but for more complex movies like Interstellar the results are a bit confusing. This suggests that either the summary texts are not complete and detailed enough, or that we should use a more complex algorithm.

## Title summary and other title features tf-idf recommendation system

In [None]:
# Append genres, cast and directors to the cleaned summary text to see
# whether the extra features make the recommendations better.
combined_text = df['text_for_recommendation'].fillna('')
for column in ('genres', 'cast', 'directors_creators'):
    # Lowercase each metadata column and drop the commas between its entries.
    combined_text = combined_text + ' ' + df[column].fillna('').str.lower().str.replace(',', '')

df['greater_text_for_recommendation'] = combined_text

In [None]:
# A separate vectorizer (not strictly necessary) fitted on the enriched text.
tfidf_2 = TfidfVectorizer(stop_words='english')
tfidf_matrix_2 = tfidf_2.fit_transform(df["greater_text_for_recommendation"])

# Pairwise cosine similarity between all enriched texts. With a single
# argument the matrix is compared with itself.
cosine_sim_matrix_2 = cosine_similarity(tfidf_matrix_2)

In [None]:
# save better tf-idf vectors
# NOTE(review): tfidf_matrix_2 comes from TfidfVectorizer.fit_transform, which returns a
# SciPy sparse matrix. np.save stores such an object as a pickled 0-d object array rather
# than a numeric array, so reading it back presumably needs
# np.load(..., allow_pickle=True).item(). scipy.sparse.save_npz would avoid the pickle;
# confirm how this file is consumed before changing the format.
np.save('tf_idf_title_vectors.npy', tfidf_matrix_2)

### Testing recommendation system with additional features

In [None]:
# Same three titles, now ranked with the similarity matrix built from the enriched text.
titles = ['Iron Man', 'The Dark Knight', 'Interstellar']

for title in titles:
    # Header: the title in bold (ANSI escape codes) framed by two rules.
    print("=" * 40, f"🎬 Recommendations for: \033[1m{title}\033[0m", "=" * 40, sep="\n")

    recommendations = get_recommendation(title, cosine_sim_matrix_2)
    name_url_pairs = zip(recommendations['original_title_name'], recommendations['title_url'])
    for position, (name, url) in enumerate(name_url_pairs, start=1):
        print(f"{position}. \033[1m{name}\033[0m ({url}).")

    print("\n" + "=" * 40 + "\n")

🎬 Recommendations for: [1mIron Man[0m
1. [1mIron Man 2[0m (https://www.imdb.com/title/tt1228705/).
2. [1mIron Man Three[0m (https://www.imdb.com/title/tt1300854/).
3. [1mSpider-Man: Homecoming[0m (https://www.imdb.com/title/tt2250912/).
4. [1mThe Avengers[0m (https://www.imdb.com/title/tt0848228/).
5. [1mZero Effect[0m (https://www.imdb.com/title/tt0120906/).
6. [1mHours[0m (https://www.imdb.com/title/tt2094018/).
7. [1mThe Dark Half[0m (https://www.imdb.com/title/tt0106664/).
8. [1mCaptain America: Civil War[0m (https://www.imdb.com/title/tt3498820/).
9. [1mSpider-Man: Far from Home[0m (https://www.imdb.com/title/tt6320628/).
10. [1mMarried to the Mob[0m (https://www.imdb.com/title/tt0095593/).


🎬 Recommendations for: [1mThe Dark Knight[0m
1. [1mThe Dark Knight Rises[0m (https://www.imdb.com/title/tt1345836/).
2. [1mBatman Begins[0m (https://www.imdb.com/title/tt0372784/).
3. [1mBatman Returns[0m (https://www.imdb.com/title/tt0103776/).
4. [1mBatman: Ma

I get somewhat better results with this method than with the summary-only recommendation system. But the results still aren't perfect, either because there aren't enough good, detailed title summaries or because TF-IDF cannot capture the context of a text and the connections within it (of course it can't). That's why I will try a BERT model for more complex movie recommendations.

# Building a more complex recommendation system using SBERT

In [None]:
!pip install sentence_transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

In [None]:
import tensorflow as tf

from sentence_transformers import SentenceTransformer
from sentence_transformers import util

In [None]:
# This SBERT model can take the summaries as they are, without the preprocessing used for TF-IDF.
model = SentenceTransformer('all-MiniLM-L6-v2')

text_list = df['biggest_title_summary'].tolist()
title_summary_embeddings = model.encode(text_list, show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/341 [00:00<?, ?it/s]

In [None]:
# Save the SBERT embeddings to 'sbert_title_vectors.npy' for future usage,
# so they do not have to be re-encoded.
np.save('sbert_title_vectors.npy', title_summary_embeddings)

In [None]:
# Pairwise cosine similarity between all summary embeddings; passed to
# get_recommendation below in place of the TF-IDF similarity matrices.
cosine_sim_matrix_3 = util.cos_sim(title_summary_embeddings, title_summary_embeddings)

## Testing SBERT recommendation system

In [None]:
# Same procedure as before, but with a more powerful tool: the SBERT similarity matrix.
titles = ['Iron Man', 'The Dark Knight', 'Interstellar']

for title in titles:
    # Header: the title in bold (ANSI escape codes) framed by two rules.
    print("=" * 40, f"🎬 Recommendations for: \033[1m{title}\033[0m", "=" * 40, sep="\n")

    recommendations = get_recommendation(title, cosine_sim_matrix_3)
    name_url_pairs = zip(recommendations['original_title_name'], recommendations['title_url'])
    for position, (name, url) in enumerate(name_url_pairs, start=1):
        print(f"{position}. \033[1m{name}\033[0m ({url}).")

    print("\n" + "=" * 40 + "\n")

🎬 Recommendations for: [1mIron Man[0m
1. [1mIron Man 2[0m (https://www.imdb.com/title/tt1228705/).
2. [1mAfter Life[0m (https://www.imdb.com/title/tt8398600/).
3. [1mIron Man Three[0m (https://www.imdb.com/title/tt1300854/).
4. [1mCaptain America: Civil War[0m (https://www.imdb.com/title/tt3498820/).
5. [1mSpider-Man: Far from Home[0m (https://www.imdb.com/title/tt6320628/).
6. [1mThe Avengers[0m (https://www.imdb.com/title/tt0848228/).
7. [1mAvengers: Age of Ultron[0m (https://www.imdb.com/title/tt2395427/).
8. [1mI Dream of Jeannie[0m (https://www.imdb.com/title/tt0058815/).
9. [1mScarface[0m (https://www.imdb.com/title/tt0086250/).
10. [1mDu rififi chez les hommes[0m (https://www.imdb.com/title/tt0048021/).


🎬 Recommendations for: [1mThe Dark Knight[0m
1. [1mBatman[0m (https://www.imdb.com/title/tt0096895/).
2. [1mBatman Begins[0m (https://www.imdb.com/title/tt0372784/).
3. [1mThe Dark Knight Rises[0m (https://www.imdb.com/title/tt1345836/).
4. [1mBat

Overall, the results are more varied and more precise in some respects, yet still not ideal in others. While Interstellar yields titles with similar concepts, Iron Man's results include movies that are not superhero films, although they still resemble the title in some ways. Furthermore, this method is more accurate than traditional TF-IDF, albeit at a significantly higher computational cost.

# Searching for films by a user's specific request

In [None]:
# Free-text requests used to try out the request-based search below.
text_requests = [
    'An amazing superhero film featuring action, shooting, and an engaging plot.',
    'War movies that focus on the psychological impact of war.'
    ]

In [None]:
_SBERT_MODEL_CACHE = {}


def _get_sbert_model(model_name='all-MiniLM-L6-v2'):
  # Load each SentenceTransformer once and hand out the same instance afterwards.
  if model_name not in _SBERT_MODEL_CACHE:
    _SBERT_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
  return _SBERT_MODEL_CACHE[model_name]


def find_movie_by_request_sbert(text, movie_summary_embeddings, top_n=10, model=None):
  """Find the titles whose summaries are closest to a free-text request.

  Args:
    text: The user's request.
    movie_summary_embeddings: Embeddings of the title summaries, one per row
      of ``df`` and in the same order.
    top_n: Number of titles to return (default 10).
    model: Encoder with an ``encode`` method used to embed ``text``. Defaults
      to a cached 'all-MiniLM-L6-v2' SentenceTransformer, so the model is not
      loaded again on every call.

  Returns:
    DataFrame with the ``original_title_name`` and ``title_url`` of the
    ``top_n`` best matching titles, best match first.
  """
  if model is None:
    model = _get_sbert_model()

  request_embedding = model.encode(text, show_progress_bar=False)
  cosine_scores = util.cos_sim(request_embedding, movie_summary_embeddings)

  # .cpu() does nothing for tensors already in host memory and keeps .numpy()
  # working when the scores were computed on another device.
  top_matches = cosine_scores[0].argsort(descending=True)[:top_n].cpu().numpy()

  return df[['original_title_name', 'title_url']].iloc[top_matches]

In [None]:
# Run every free-text request through the SBERT search and list the matches.
for text in text_requests:
    # Header: the request in bold (ANSI escape codes) framed by two rules.
    print("=" * 40, f"🎬 Recommendations for: \033[1m{text}\033[0m", "=" * 40, sep="\n")

    recommendations = find_movie_by_request_sbert(text, title_summary_embeddings)
    name_url_pairs = zip(recommendations['original_title_name'], recommendations['title_url'])
    for position, (name, url) in enumerate(name_url_pairs, start=1):
        print(f"{position}. \033[1m{name}\033[0m ({url}).")

    print("\n" + "=" * 40 + "\n")

🎬 Recommendations for: [1mAn amazing superhero film featuring action, shooting, and an engaging plot.[0m
1. [1mSuperman: The Animated Series[0m (https://www.imdb.com/title/tt0115378/).
2. [1mWhat If...?[0m (https://www.imdb.com/title/tt10168312/).
3. [1mShooter[0m (https://www.imdb.com/title/tt4181172/).
4. [1mKingsman: The Secret Service[0m (https://www.imdb.com/title/tt2802144/).
5. [1mJustice League Unlimited[0m (https://www.imdb.com/title/tt6025022/).
6. [1mGrindhouse[0m (https://www.imdb.com/title/tt0462322/).
7. [1mSaaho[0m (https://www.imdb.com/title/tt6836936/).
8. [1mValerian and the City of a Thousand Planets[0m (https://www.imdb.com/title/tt2239822/).
9. [1mHancock[0m (https://www.imdb.com/title/tt0448157/).
10. [1mEl hoyo 2[0m (https://www.imdb.com/title/tt27729779/).


🎬 Recommendations for: [1mWar movies that focus on the psychological impact of war.[0m
1. [1mThe World at War[0m (https://www.imdb.com/title/tt0071075/).
2. [1mHitler: The Rise of 

In the end, the results are not bad at all!

DONE