In [2]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache
import ipywidgets as widgets
from IPython.display import display



In [3]:
movies = pd.read_csv(r'C:\Users\admin\movies.csv')
#https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies
#used dataset

In [4]:
print("Head of the dataset:")
print(movies.head())

Head of the dataset:
       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult                     backdrop_path  ...  \
0   825532764      148  False  /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg  ...   
1   701729206      169  False  /pbrkL804c8yAv3zBZR4QPEafpAR.jpg  ...   
2  1004558444      152  False  /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg  ...   
3  2923706026      162  False  /vL5LR6WdxWPjLPFRLe133jXWsh5.jpg  ...   
4  1518815515      143  False  /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg  ...   

   original_language   original_title  \
0                 en        In

In [5]:
print("\nSummary statistics:")
print(movies.describe())


Summary statistics:
                 id   vote_average     vote_count       revenue  \
count  9.787430e+05  978743.000000  978743.000000  9.787430e+05   
mean   6.486576e+05       2.145069      21.910045  7.552866e+05   
std    3.474492e+05       3.125145     343.145266  1.800109e+07   
min    2.000000e+00       0.000000       0.000000 -1.200000e+01   
25%    3.636025e+05       0.000000       0.000000  0.000000e+00   
50%    6.503540e+05       0.000000       0.000000  0.000000e+00   
75%    9.514595e+05       5.000000       1.000000  0.000000e+00   
max    1.230246e+06      10.000000   34495.000000  3.000000e+09   

             runtime        budget     popularity  
count  978743.000000  9.787430e+05  978743.000000  
mean       51.072760  2.970264e+05       1.377591  
std        62.354375  5.103793e+06       8.165239  
min         0.000000  0.000000e+00       0.000000  
25%         1.000000  0.000000e+00       0.600000  
50%        32.000000  0.000000e+00       0.600000  
75%        

In [6]:
print("\nInfo about the dataset:")
print(movies.info())



Info about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 978743 entries, 0 to 978742
Data columns (total 23 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    978743 non-null  int64  
 1   title                 978731 non-null  object 
 2   vote_average          978743 non-null  float64
 3   vote_count            978743 non-null  int64  
 4   status                978743 non-null  object 
 5   release_date          875577 non-null  object 
 6   revenue               978743 non-null  int64  
 7   runtime               978743 non-null  int64  
 8   adult                 978743 non-null  bool   
 9   backdrop_path         281262 non-null  object 
 10  budget                978743 non-null  int64  
 11  homepage              106352 non-null  object 
 12  imdb_id               565526 non-null  object 
 13  original_language     978743 non-null  object 
 14  original_title        97873

In [7]:
print("\nMissing values:")
print(movies.isnull().sum())


Missing values:
id                           0
title                       12
vote_average                 0
vote_count                   0
status                       0
release_date            103166
revenue                      0
runtime                      0
adult                        0
backdrop_path           697481
budget                       0
homepage                872391
imdb_id                 413217
original_language            0
original_title              12
overview                174639
popularity                   0
poster_path             262939
tagline                 837896
genres                  354498
production_companies    508212
production_countries    391409
spoken_languages        383107
dtype: int64


In [8]:

movies = movies[['id', 'title', 'overview', 'genres', 'vote_average', 'tagline', 'poster_path']]


In [9]:

print("\nModified dataset:")
print(movies)


Modified dataset:
             id                                              title  \
0         27205                                          Inception   
1        157336                                       Interstellar   
2           155                                    The Dark Knight   
3         19995                                             Avatar   
4         24428                                       The Avengers   
...         ...                                                ...   
978738   653113                      내 꿈은 컬러 꿈 #1 : the Green Moon   
978739   653114                         내 꿈은 컬러꿈 #2 : the Red Door   
978740   653115                                    Johannes Larsen   
978741   653116                     내 꿈은 컬러 꿈 #3 : the Purple Rain   
978742  1230175  THE GODFATHER, CODA: THE DEATH OF MICHAEL CORL...   

                                                 overview  \
0       Cobb, a skilled thief who commits corporate es...   
1       The advent

In [10]:

movies['tags'] = movies['overview'] + movies['genres']


In [11]:

print("\nDataset after adding 'tags' column:")
print(movies)


Dataset after adding 'tags' column:
             id                                              title  \
0         27205                                          Inception   
1        157336                                       Interstellar   
2           155                                    The Dark Knight   
3         19995                                             Avatar   
4         24428                                       The Avengers   
...         ...                                                ...   
978738   653113                      내 꿈은 컬러 꿈 #1 : the Green Moon   
978739   653114                         내 꿈은 컬러꿈 #2 : the Red Door   
978740   653115                                    Johannes Larsen   
978741   653116                     내 꿈은 컬러 꿈 #3 : the Purple Rain   
978742  1230175  THE GODFATHER, CODA: THE DEATH OF MICHAEL CORL...   

                                                 overview  \
0       Cobb, a skilled thief who commits corporate es...   


In [12]:

my_new_data = movies.drop(columns=['overview', 'genres'])

print("\nNew dataset without 'overview' and 'genres' columns:")
print(my_new_data)


New dataset without 'overview' and 'genres' columns:
             id                                              title  \
0         27205                                          Inception   
1        157336                                       Interstellar   
2           155                                    The Dark Knight   
3         19995                                             Avatar   
4         24428                                       The Avengers   
...         ...                                                ...   
978738   653113                      내 꿈은 컬러 꿈 #1 : the Green Moon   
978739   653114                         내 꿈은 컬러꿈 #2 : the Red Door   
978740   653115                                    Johannes Larsen   
978741   653116                     내 꿈은 컬러 꿈 #3 : the Purple Rain   
978742  1230175  THE GODFATHER, CODA: THE DEATH OF MICHAEL CORL...   

        vote_average                                            tagline  \
0              8.364          

In [13]:

def clean_title(title: str) -> str:
    """
    Clean a movie title by removing non-alphanumeric characters.
    """
    if isinstance(title, str):
        return re.sub("[^a-zA-Z0-9]", "", title)
    else:
        return ""

movies["clean_title"] = movies["title"].apply(clean_title)

movies["clean_title"].fillna("", inplace=True)



In [14]:

my_new_data['title'].fillna('', inplace=True)
my_new_data['tags'].fillna('', inplace=True)

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(my_new_data['title'] + my_new_data['tags'])


In [15]:
def search(title: str, num_results=5) -> pd.DataFrame:
    """
    Search for movies similar to a given title using item-based filtering.
    """
    title = clean_title(title)
    if not title:
        return pd.DataFrame()

    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argsort(similarity)[-num_results:][::-1]
    results1 = movies[['id', 'title', 'vote_average', 'genres']].iloc[indices].reset_index(drop=True)

    return results1


In [16]:

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [17]:

@lru_cache(maxsize=128)
def find_similars(movie_id: int, num_results=5) -> pd.DataFrame:
    """
    Find similar movies to a given movie using user-based collaborative filtering.
    """
    
    if movie_id not in movies['id'].values:
        print(f"Movie with ID {movie_id} not found.")
        return pd.DataFrame()


    movie_title = movies.loc[movies['id'] == movie_id, 'title'].values[0]


    similars = search(movie_title, num_results)
    return similars


In [21]:

user_input = widgets.BoundedIntText(
    value=1,
    min=1,
    max=10000,
    step=1,
    description='Movie ID:',
    disabled=False
)
user_rec_list = widgets.Output()

def on_user_input(data):
    with user_rec_list:
        user_rec_list.clear_output()
        movie_id = data["new"]
        display(find_similars(movie_id))

user_input.observe(on_user_input, names='value')

display(user_input, user_rec_list)


BoundedIntText(value=1, description='Movie ID:', max=10000, min=1)

Output()