### Importing the necessary Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from IPython import get_ipython
import warnings 
warnings.filterwarnings('ignore')

import re
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pickle

### Reading and inspecting the dataset 

In [51]:
### Reading the dataset 
movies_df = pd.read_csv('movies_df.csv')
movies_df.head()

Unnamed: 0,userid,movieid,title,genres,original_language,overview,popularity,production_companies,release_date,budget,...,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
0,1,615656,Meg 2: The Trench,Action-Science Fiction-Horror,en,An exploratory dive into the deepest depths of...,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,02/08/2023,129000000,...,116,Released,Back for seconds.,7.079,1365,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,based on novel or book-sequel-kaiju,/4m1Au3YkjqsxF8iwQy0fPYSxE0h.jpg,/qlxy8yo5bcgUw2KAmmojUKp4rHd.jpg,1006462-298618-569094-1061181-346698-1076487-6...
1,2,758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,05/04/2023,18000000,...,103,Released,Inspired by the actual files of Father Gabriel...,7.433,545,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...,spain-rome italy-vatican-pope-pig-possession-c...,/9JBEPLTPSm0d1mbEcLxULjJq9Eh.jpg,/hiHGRbyTcbZoLsYYkO4QiCLYe34.jpg,713704-296271-502356-1076605-1084225-1008005-9...
2,3,667538,Transformers: Rise of the Beasts,Action-Adventure-Science Fiction,en,When a new threat capable of destroying the en...,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,06/06/2023,200000000,...,127,Released,Unite or fall.,7.34,1007,Anthony Ramos-Dominique Fishback-Luna Lauren V...,peru-alien-end of the world-based on cartoon-b...,/gPbM0MK8CP8A174rmUwGsADNYKD.jpg,/woJbg7ZqidhpvqFGGMRhWQNoxwa.jpg,496450-569094-298618-385687-877100-598331-4628...
3,4,640146,Ant-Man and the Wasp: Quantumania,Action-Adventure-Science Fiction,en,Super-Hero partners Scott Lang and Hope van Dy...,4425.387,Marvel Studios-Kevin Feige Productions,15/02/2023,200000000,...,125,Released,Witness the beginning of a new dynasty.,6.507,2811,Paul Rudd-Evangeline Lilly-Jonathan Majors-Kat...,hero-ant-sequel-superhero-based on comic-famil...,/qnqGbB22YJ7dSs4o6M7exTpNxPz.jpg,/m8JTwHFwX7I7JY5fPe4SjqejWag.jpg,823999-676841-868759-734048-267805-965839-1033...
4,5,677179,Creed III,Drama-Action,en,After dominating the boxing world Adonis Creed...,3994.342,Metro-Goldwyn-Mayer-Proximity Media-Balboa Pro...,01/03/2023,75000000,...,116,Released,You can't run from your past.,7.262,1129,Michael B. Jordan-Tessa Thompson-Jonathan Majo...,philadelphia pennsylvania-husband wife relatio...,/cvsXj3I9Q2iyyIo95AecSd1tad7.jpg,/5i6SjyDbDWqyun8klUuCxrlFbyw.jpg,965839-267805-943822-842942-1035806-823999-107...


In [52]:
movies_df.shape

(16239, 21)

In [53]:
movies_df.describe()

Unnamed: 0,userid,movieid,popularity,budget,revenue,runtime,vote_average,vote_count
count,16239.0,16239.0,16239.0,16239.0,16239.0,16239.0,16239.0,16239.0
mean,8120.0,192358.455632,25.122935,14132290.0,40720830.0,102.81021,6.620422,1092.008498
std,4687.939846,237916.593424,130.684011,31843610.0,126150300.0,25.761857,0.735909,2351.625152
min,1.0,2.0,0.6,0.0,0.0,0.0,5.003,101.0
25%,4060.5,11842.5,8.8335,0.0,0.0,91.0,6.1,162.0
50%,8120.0,44450.0,12.357,0.0,131357.0,101.0,6.6,308.0
75%,12179.5,367270.5,19.981,14000000.0,23376890.0,114.0,7.195,881.5
max,16239.0,968051.0,8763.998,460000000.0,2920357000.0,583.0,9.2,33262.0


In [54]:
movies_df.nunique()

userid                  16239
movieid                 16057
title                   15369
genres                   2878
original_language          57
overview                16031
popularity              11837
production_companies    12939
release_date             8574
budget                   1087
revenue                  7983
runtime                   236
status                      1
tagline                 11535
vote_average             2169
vote_count               3401
credits                 15965
keywords                14015
poster_path             16056
backdrop_path           16011
recommendations         14854
dtype: int64

### Selecting key features in the dataset and creating metadata column  

In [55]:
#Selecting the necessary features in the dataset
movies = movies_df[['userid','movieid', 'title', 'genres', 'overview', 'tagline', 'vote_average','keywords']]

In [56]:
movies.isnull().sum()

userid             0
movieid            0
title              0
genres             6
overview          28
tagline         4563
vote_average       0
keywords        1605
dtype: int64

In [57]:
null_ = round(movies.isnull().sum().sort_values(ascending=False)/movies.shape[0]*100,2).reset_index().rename(columns={'index':'column_name', 0:'percent_missing'})
null_

Unnamed: 0,column_name,percent_missing
0,tagline,28.1
1,keywords,9.88
2,overview,0.17
3,genres,0.04
4,userid,0.0
5,movieid,0.0
6,title,0.0
7,vote_average,0.0


In [58]:
## Working on the genres column
movies.iloc[0].genres

'Action-Science Fiction-Horror'

In [59]:
#Making the genres columns more readable
movies['genres']= movies['genres'].str.replace('-',' ')
movies.head()

Unnamed: 0,userid,movieid,title,genres,overview,tagline,vote_average,keywords
0,1,615656,Meg 2: The Trench,Action Science Fiction Horror,An exploratory dive into the deepest depths of...,Back for seconds.,7.079,based on novel or book-sequel-kaiju
1,2,758323,The Pope's Exorcist,Horror Mystery Thriller,Father Gabriele Amorth Chief Exorcist of the V...,Inspired by the actual files of Father Gabriel...,7.433,spain-rome italy-vatican-pope-pig-possession-c...
2,3,667538,Transformers: Rise of the Beasts,Action Adventure Science Fiction,When a new threat capable of destroying the en...,Unite or fall.,7.34,peru-alien-end of the world-based on cartoon-b...
3,4,640146,Ant-Man and the Wasp: Quantumania,Action Adventure Science Fiction,Super-Hero partners Scott Lang and Hope van Dy...,Witness the beginning of a new dynasty.,6.507,hero-ant-sequel-superhero-based on comic-famil...
4,5,677179,Creed III,Drama Action,After dominating the boxing world Adonis Creed...,You can't run from your past.,7.262,philadelphia pennsylvania-husband wife relatio...


In [60]:
movies.iloc[0].genres

'Action Science Fiction Horror'

In [61]:
## Inspecting the overviews columns
movies.iloc[0].overview

'An exploratory dive into the deepest depths of the ocean of a daring research team spirals into chaos when a malevolent mining operation threatens their mission and forces them into a high-stakes battle for survival.'

In [62]:
## Inspecting the tagline column
movies.iloc[0].tagline

'Back for seconds.'

In [63]:
## Inspecting the keywords column
movies.iloc[0].keywords

'based on novel or book-sequel-kaiju'

In [64]:
# filling any missing values in movies DataFrame with empty strings
movies.fillna('', inplace=True)

#Creating 'metadata' column in movies dataframe, combining: genres, overview, tagline, keyword columns with a space separator
movies['metadata'] = movies[['genres', 'overview', 'tagline', 'keywords']].apply(lambda x: ' '.join(x), axis=1)
movies.head(2)

Unnamed: 0,userid,movieid,title,genres,overview,tagline,vote_average,keywords,metadata
0,1,615656,Meg 2: The Trench,Action Science Fiction Horror,An exploratory dive into the deepest depths of...,Back for seconds.,7.079,based on novel or book-sequel-kaiju,Action Science Fiction Horror An exploratory d...
1,2,758323,The Pope's Exorcist,Horror Mystery Thriller,Father Gabriele Amorth Chief Exorcist of the V...,Inspired by the actual files of Father Gabriel...,7.433,spain-rome italy-vatican-pope-pig-possession-c...,Horror Mystery Thriller Father Gabriele Amorth...


In [65]:
null_ = round(movies.isnull().sum().sort_values(ascending=False)/movies.shape[0]*100,2).reset_index().rename(columns={'index':'column_name', 0:'percent_missing'})
null_

Unnamed: 0,column_name,percent_missing
0,userid,0.0
1,movieid,0.0
2,title,0.0
3,genres,0.0
4,overview,0.0
5,tagline,0.0
6,vote_average,0.0
7,keywords,0.0
8,metadata,0.0


In [66]:
movies_dataset=movies.copy()

In [67]:
def metadata_cleaner(data):
    # Convert everything to lowercase
#     data = data.lower() 
    # Remove mentions   
    data = re.sub('@[\w]*',' ',data)  
    # Remove url's
    data = re.sub(r'https?:\/\/.*\/\w*',' ', data)
    # Remove hashtags
    data = re.sub(r'#\w*', ' ', data)    
    # Remove numbers
    data = re.sub(r'\d+', ' ', data)  
#     # Remove punctuation
#     data = re.sub(r"[,.;':@#?!\&/$]+\ *", ' ', data)
    # Remove that funny diamond
    data = re.sub(r"U+FFFD ", ' ',data)
    # Remove extra whitespace
    data = re.sub(r'\s\s+', ' ', data)
    # Remove space in front of tweet
    data = data.lstrip(' ') 
#     #tokenize the message column of the pandas Dataframe
#     tokeniser = TreebankWordTokenizer()
#     data = tokeniser.tokenize(data)
#     #remove words less than 2 letter word
#     data = [word for word in data if len(word) >3]
#     #get the unigue values in each row
#     data = set(data)
#     #convert tokenised word list to string
#     data =' '.join(data)
    return data


# Clean the tweets in the message column
movies_dataset['metadata'] = movies_dataset['metadata'].apply(metadata_cleaner)

In [68]:
movies_dataset['metadata'][0]

'Action Science Fiction Horror An exploratory dive into the deepest depths of the ocean of a daring research team spirals into chaos when a malevolent mining operation threatens their mission and forces them into a high-stakes battle for survival. Back for seconds. based on novel or book-sequel-kaiju'

In [69]:
movies_dataset['metadata'].tolist()[1]

"Horror Mystery Thriller Father Gabriele Amorth Chief Exorcist of the Vatican investigates a young boy's terrifying possession and ends up uncovering a centuries-old conspiracy the Vatican has desperately tried to keep hidden. Inspired by the actual files of Father Gabriele Amorth, Chief Exorcist of the Vatican. spain-rome italy-vatican-pope-pig-possession-conspiracy-devil-exorcist-skepticism-catholic priest- s-supernatural horror"

In [70]:
movies_dataset.head(2)

Unnamed: 0,userid,movieid,title,genres,overview,tagline,vote_average,keywords,metadata
0,1,615656,Meg 2: The Trench,Action Science Fiction Horror,An exploratory dive into the deepest depths of...,Back for seconds.,7.079,based on novel or book-sequel-kaiju,Action Science Fiction Horror An exploratory d...
1,2,758323,The Pope's Exorcist,Horror Mystery Thriller,Father Gabriele Amorth Chief Exorcist of the V...,Inspired by the actual files of Father Gabriel...,7.433,spain-rome italy-vatican-pope-pig-possession-c...,Horror Mystery Thriller Father Gabriele Amorth...


In [71]:
movies_dataset = movies_dataset[['movieid','title','metadata']]
movies_dataset.head()

Unnamed: 0,movieid,title,metadata
0,615656,Meg 2: The Trench,Action Science Fiction Horror An exploratory d...
1,758323,The Pope's Exorcist,Horror Mystery Thriller Father Gabriele Amorth...
2,667538,Transformers: Rise of the Beasts,Action Adventure Science Fiction When a new th...
3,640146,Ant-Man and the Wasp: Quantumania,Action Adventure Science Fiction Super-Hero pa...
4,677179,Creed III,Drama Action After dominating the boxing world...


In [72]:
movies_dataset.shape

(16239, 3)

In [73]:
movies_dataset = movies_dataset.iloc[:2000, :]

### Checking for duplicates in out final dataset

In [74]:
movies_dataset.duplicated().sum()

3

In [75]:
movies_dataset.columns

Index(['movieid', 'title', 'metadata'], dtype='object')

In [76]:
dup_rows = pd.DataFrame(movies_dataset, columns=['movieid', 'title', 'metadata'])

duplicate =dup_rows[dup_rows.duplicated()]
print("Duplicated Rows : ")
duplicate

Duplicated Rows : 


Unnamed: 0,movieid,title,metadata
146,831223,Gone Mom: The Disappearance of Jennifer Dulos,TV Movie Thriller Crime Jennifer Dulos the wea...
258,777350,Dory's Reef Cam,Family Animation Comedy Adventure Dive into th...
266,777350,Dory's Reef Cam,Family Animation Comedy Adventure Dive into th...


In [77]:
movies_dataset = movies_dataset.drop_duplicates()

In [78]:
movies_dataset.duplicated().sum()

0

<h3>Using CountVectorizer to convert text into vectors</h3>

CountVectorizer converts text into vectors on the basis of the frequency

(counts) of each word that occurs in the entire text.

converts a collection of text documents to a matrix of token counts

In [79]:
# from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000, stop_words="english")

In [80]:
# Learn the vocabulary dictionary and return document-term matrix
vectors = cv.fit_transform(movies_dataset['metadata']).toarray()

In [81]:
vectors.shape

(1997, 2000)

In [82]:
# Array mapping from feature integer indices to feature names 
cv.get_feature_names_out()

array(['abandoned', 'abducted', 'abduction', ..., 'zombies', 'zone',
       'zoo'], dtype=object)

In [83]:
len(cv.get_feature_names_out())

2000

### Applying Cosine Distance Concept 

In [84]:
# from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(vectors).shape

(1997, 1997)

In [85]:
similarity = cosine_similarity(vectors)
similarity[0]

array([1.        , 0.05802589, 0.22758963, ..., 0.        , 0.        ,
       0.13957263])

In [86]:
# Using the enumerate function to return both the indexes and values of lists 
sorted(list(enumerate(similarity[0])),reverse=True, key = lambda x:x[1])[1:10]

[(582, 0.38924947208076144),
 (810, 0.32907259085720875),
 (868, 0.3111879571320709),
 (1452, 0.3063358324269934),
 (743, 0.29346959282671103),
 (188, 0.28829998806257884),
 (19, 0.2823912473624525),
 (904, 0.279946255477927),
 (846, 0.27713265386271346)]

### Creating a recommendation system

In [87]:
def recommend(movie):
    movie_index = movies_dataset[movies_dataset['title'] == movie].index[0] ## fectching the movie index
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)),reverse=True, key = lambda x:x[1])[1:10]
    
    for i in movie_list:
        print(movies_dataset.iloc[i[0]].title)

In [88]:
recommend('Avatar')

Rebel Moon - Part One: A Child of Fire
Alien
Dune
Battle: Los Angeles
Lightyear
Aliens
Project Gemini
Rogue One: A Star Wars Story
Space Sweepers


In [89]:
recommend('Toy Story')

Dragon Ball Z: Lord Slug
Dragon Ball: Episode of Bardock
Dragon Ball Z: Cooler's Revenge
Dragon Ball Z: Resurrection 'F - Future Trunks Special Edition
Dragon Ball: Yo! Son Goku and His Friends Return!!
Dragon Ball Z: The Tree of Might
Dragon Ball Z: Battle of Gods
Mobile Suit Gundam Hathaway
Dream 9 Toriko & One Piece & Dragon Ball Z Super Collaboration Special!!


### Pickling the Model  

In [90]:
# import pickle
pickle.dump(movies_dataset, open('movies_dataset.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [91]:
# saving the the movie dataset as dictionary 
pickle.dump(movies_dataset.to_dict(), open('movies_dataset_dict.pkl', 'wb'))

In [92]:
movies_dataset['title'].values

array(['Meg 2: The Trench', "The Pope's Exorcist",
       'Transformers: Rise of the Beasts', ..., 'Lady Vengeance',
       'Liar Liar', 'My Policeman'], dtype=object)

In [93]:
movies_dataset.to_dict()

#### Opening the Picled file

In [94]:
pickle.load(open('movies_dataset.pkl', 'rb'))

Unnamed: 0,movieid,title,metadata
0,615656,Meg 2: The Trench,Action Science Fiction Horror An exploratory d...
1,758323,The Pope's Exorcist,Horror Mystery Thriller Father Gabriele Amorth...
2,667538,Transformers: Rise of the Beasts,Action Adventure Science Fiction When a new th...
3,640146,Ant-Man and the Wasp: Quantumania,Action Adventure Science Fiction Super-Hero pa...
4,677179,Creed III,Drama Action After dominating the boxing world...
...,...,...,...
1995,87502,Flight,Drama Commercial airline pilot Whip Whitaker h...
1996,13387,Transporter 3,Action Thriller Crime Frank Martin puts the dr...
1997,4550,Lady Vengeance,Drama Thriller Released after being wrongfully...
1998,1624,Liar Liar,Comedy Fletcher Reede is a fast-talking attorn...


### Opening the pickled dictionary dataset file

In [95]:
# saving the the movie dataset as dictionary 
pickle.load(open('movies_dataset_dict.pkl', 'rb'))

In [96]:
movie_open = pickle.load(open('movies_dataset_dict.pkl', 'rb'))
movie_pd = pd.DataFrame(movie_open)
movie_pd.head()

Unnamed: 0,movieid,title,metadata
0,615656,Meg 2: The Trench,Action Science Fiction Horror An exploratory d...
1,758323,The Pope's Exorcist,Horror Mystery Thriller Father Gabriele Amorth...
2,667538,Transformers: Rise of the Beasts,Action Adventure Science Fiction When a new th...
3,640146,Ant-Man and the Wasp: Quantumania,Action Adventure Science Fiction Super-Hero pa...
4,677179,Creed III,Drama Action After dominating the boxing world...


In [97]:
# ### Pickling the Model  

# # import pickle
# pickle.dump(new_df, open('movies_df.pkl', 'wb'))
# pickle.dump(similarity, open('similarity.pkl', 'wb'))

# new_df['title'].values

# new_df.to_dict()

# pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))

In [98]:
# main = df.index[~df.index.isin(df1.index)]
# main

In [99]:
# import pandas as pd

# # Read the CSV file into a DataFrame
# movie_db = pd.read_csv('movies.csv')

# # Filter movies based on conditions
# latest = movie_db[(movie_db['vote_average'] > 5) & (movie_db['vote_count'] > 100)]

# # Save the filtered DataFrame to a new CSV file
# latest.to_csv('movies_df.csv', index=False)

# # Read the saved CSV file into a new DataFrame
# movies = pd.read_csv('movies_df.csv')

# # Display the first few rows of the new DataFrame
# print(movies.shape)
# movies.head()
