In [3]:

import sklearn as sk
import numpy as np
import pandas as pd

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
data = pd.read_csv("/content/drive/MyDrive/data pro/movies.csv")


data.head(2)

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
0,315162,Puss in Boots: The Last Wish,Animation-Action-Adventure-Comedy-Family-Fantasy,en,Puss in Boots discovers that his passion for a...,10011.23,Universal Pictures-DreamWorks Animation,2022-12-07,90000000.0,297504470.0,103.0,Released,Say hola to his little friends.,8.611,2369.0,Antonio Banderas-Salma Hayek-Harvey Guillén-Wa...,fairy tale-talking dog-spin off-aftercreditsst...,/kuf6dutpsT0vSVehic3EZIqkOBt.jpg,/r9PkFnRUIthgBp2JZZzD380MWZy.jpg,830784-826173-417859-76600-877269-5334-46632-6...
1,536554,M3GAN,Science Fiction-Horror-Comedy,en,A brilliant toy company roboticist uses artifi...,7352.073,Universal Pictures-Blumhouse Productions-Atomi...,2022-12-28,12000000.0,101000000.0,102.0,Released,Friendship has evolved.,7.127,402.0,Allison Williams-Violet McGraw-Jenna Davis-Ami...,evil doll-aunt niece relationship-orphan-car a...,/7CNCv9uhqdwK7Fv4bR4nmDysnd9.jpg,/5kAGbi9MFAobQTVfK4kWPnIfnP0.jpg,615777-674324-661374-48209-593643-696157-67671...


# <p style="background-color:#C71A27;font-family:verdana;color:white;text-align:center;letter-spacing:0.5px;font-size:100%;padding: 10px"> Pre-processing data </p>

# <p style="background-color:#C71A27;font-family:verdana;color:white;text-align:center;letter-spacing:0.5px;font-size:100%;padding: 5px"> Analyzing data </p>

In [6]:
#check the shape of data
print(data.shape)

(728651, 20)


__Observation__ \
We have about __728K movies__ in the dataset and about __20 features__ of each movies.

In [7]:
# check columns of data and its data-types present in dataframe
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728651 entries, 0 to 728650
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    728651 non-null  int64  
 1   title                 728647 non-null  object 
 2   genres                514726 non-null  object 
 3   original_language     728651 non-null  object 
 4   overview              608581 non-null  object 
 5   popularity            728651 non-null  float64
 6   production_companies  338007 non-null  object 
 7   release_date          674787 non-null  object 
 8   budget                728651 non-null  float64
 9   revenue               728651 non-null  float64
 10  runtime               692976 non-null  float64
 11  status                728651 non-null  object 
 12  tagline               108684 non-null  object 
 13  vote_average          728651 non-null  float64
 14  vote_count            728651 non-null  float64
 15  

In [8]:
# check null value in dataset
data.isnull().sum()

id                           0
title                        4
genres                  213925
original_language            0
overview                120070
popularity                   0
production_companies    390644
release_date             53864
budget                       0
revenue                      0
runtime                  35675
status                       0
tagline                 619967
vote_average                 0
vote_count                   0
credits                 226825
keywords                517461
poster_path             189291
backdrop_path           505751
recommendations         693700
dtype: int64

In [9]:
# check for duplicate values
data.duplicated().sum()

86

In [10]:
# dropping unnecesary title
df = data.drop(["production_companies", "popularity", "budget", "revenue", "status", "recommendations", "runtime", "vote_average", "backdrop_path", "tagline"], axis=1)

In [11]:
# droping duplicate values
df.drop_duplicates(inplace=True)

In [12]:
# checking duplicates in title
df.title.duplicated().sum()

86912

In [13]:
# check if duplicates titles have same release date
df[["title", "release_date"]].duplicated().sum()

2339

In [14]:
# get rid of duplicates with same release datae
df.drop_duplicates(subset=["title","release_date"], inplace=True)

In [15]:
# get rid of vote_count lower than 350 and reseting index
df = df[df.vote_count >= 350].reset_index()

In [16]:
df.isnull().sum()

index                  0
id                     0
title                  0
genres                 0
original_language      0
overview               1
release_date           0
vote_count             0
credits                8
keywords             237
poster_path            0
dtype: int64

In [17]:
# replacing all the null value from genres adn overview with "nothing"
df.fillna("", inplace = True)

In [18]:
# delete movies with no genres and overview
index = df[(df.genres == "") & (df.overview == "")].index
df.drop(index, inplace=True)

In [19]:
# replacing genres, credits and keywords - with " "
df.genres = df.genres.apply(lambda x: " ".join(x.split("-")))
df.keywords = df.keywords.apply(lambda x: " ".join(x.split("-")))
df.credits = df.credits.apply(lambda x: " ".join(x.replace(" ", "").split("-")[:5]))

In [20]:
# making tags for prediction
df["tags"] =df.overview + " "+ df.genres + " "  +df.credits + " " +df.keywords + " " + df.original_language

In [21]:
# making new framework with important features
new_df = df[["id", "title", "tags", 'poster_path']]

In [22]:
# making all the content of tags in lower case letter for better processing
new_df.tags = new_df.tags.apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.tags = new_df.tags.apply(lambda x:x.lower())


In [23]:
new_df.tags[0]

'puss in boots discovers that his passion for adventure has taken its toll: he has burned through eight of his nine lives leaving him with only one life left. puss sets out on an epic journey to find the mythical last wish and restore his nine lives. animation action adventure comedy family fantasy antoniobanderas salmahayek harveyguillén wagnermoura florencepugh fairy tale talking dog spin off aftercreditsstinger talking cat fear of death en'

In [24]:
new_df.head(1)

Unnamed: 0,id,title,tags,poster_path
0,315162,Puss in Boots: The Last Wish,puss in boots discovers that his passion for a...,/kuf6dutpsT0vSVehic3EZIqkOBt.jpg


# <p style="background-color:#C71A27;font-family:verdana;color:white;text-align:center;letter-spacing:0.5px;font-size:100%;padding: 10px"> Vectorizer </p>

In [25]:
#import nltk library and porter module
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [26]:
# stem function to take text and give output
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [27]:
# applying stem function in tags
new_df["tags"] = new_df["tags"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tags"] = new_df["tags"].apply(stem)


In [28]:
# importing countvectorizer
from sklearn.feature_extraction.text import CountVectorizer
# setting for 5000 most repeated words, and exclude stop words
cv = CountVectorizer(stop_words="english",max_features=5000)

In [29]:
# fiting tags in count vector
vectors = cv.fit_transform(new_df["tags"]).toarray() #change it into array to use

In [33]:
vectors[:, 80:85]

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [34]:
# Similarity vector with cosine
from sklearn.metrics.pairwise import cosine_similarity
# calculating similarity of each movie with all movies
similarity = cosine_similarity(vectors)

In [35]:
# similarity of each movie with all the movies
similarity.shape

(7525, 7525)

***
# <p style="background-color:#C71A27;font-family:verdana;color:white;text-align:center;letter-spacing:0.5px;font-size:100%;padding: 10px"> Testing Model </p>

To test our model we will make a function which will take name of movie and recommend 5 movies, similar to that.
***

In [36]:
# making function to find movie and give similar movies as return
def recommend(movies):
    movie_index = new_df[new_df.title == movies].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key= lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)



In [37]:
# checking similar movies test 1
recommend("Batman")

The Dark Knight
Batman: Mask of the Phantasm
Batman & Robin
Batman Returns
Dick Tracy


In [38]:
# test 2
recommend("Black Adam")

Zack Snyder's Justice League
X-Men: Apocalypse
Justice League
The Wolverine
Justice League vs. Teen Titans


In [45]:
# import Pickle
import pickle
from google.colab import drive, files

In [46]:
# making portable movie pickle file to transport
pickle.dump(new_df, open("moviesNEW.pkl", "wb"))
files.download("moviesNEW.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [47]:
# making portable similarity file to transport
pickle.dump(similarity, open("similarity.pkl", "wb"))
files.download("similarity.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>