In [504]:
import pandas as pd
import numpy as np
import copy
import random
from sklearn.metrics.pairwise import cosine_similarity

In [505]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [506]:
#Load Dataset and Create a Backup (df2)
df = pd.read_csv('IMDb_Data_final.csv')
df2=copy.deepcopy(df)
df2.head(5) #Show Data

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130min,UA,2022
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139min,R,2022
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176min,UA,2022
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127min,UA,1993
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175min,A,1972


In [507]:
### Data Inspection and Preprocessing ###

In [508]:
#Title Inspection
#df2['Title'].value_counts(dropna=False) 
#Check Title Duplicates (Drishyam, Scarface 2 times)
#title_counts=df2['Title'].value_counts()>1 #Check for duplicates

In [509]:
#After inspection these are the original Malayalam version of 2013 and the Hindi remake of 2015 so both are valid
df2[df2['Title']=='Drishyam']

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
779,Drishyam,NishikantKamat,"AjayDevgn, ShriyaSaran, Tabu, RajatKapoor",8.2,"Crime,Drama,Mystery",163min,UA,2015
967,Drishyam,JeethuJoseph,"Mohanlal, Meena, AshaSharath, Ansiba",8.3,"Crime,Drama,Thriller",160min,U,2013


In [510]:
df2[df2['Title']=='Scarface']
#After inspection these are the original Scarface version of 1932 and the famous remake of 1983 so both are valid

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
96,Scarface,BrianDePalma,"AlPacino, MichellePfeiffer, StevenBauer, MaryE...",8.3,"Crime,Drama",170min,A,1983
837,Scarface,"HowardHawks,",", PaulMuni, AnnDvorak, KarenMorley, OsgoodPerkins",7.7,"Action,Crime,Drama",93min,,1932


In [511]:
#Creation of Processed Dataframe
Titles=df2['Title'] #Correct Titles (no changes needed)
Movie_ID = pd.Series(df2.index) #Creating a Movie_ID that corresponds to the row number
#Movie_ID.head(5)
Processed_Dataframe=pd.concat([Movie_ID, Titles],axis=1)
Processed_Dataframe.rename(columns={0:'Movie_ID'},inplace=True)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title
0,0,Top Gun: Maverick
1,1,Everything Everywhere All at Once
2,2,The Batman
3,3,Jurassic Park
4,4,The Godfather


In [512]:
#Director Inspection
#df2['Director'].value_counts(dropna=False) 
#Multiple Directors with more than one entry but entries are correct

In [514]:
#Stars Inspection
df2['Stars'].head(10) #Movie Stars are in lists and are numerous

0    TomCruise, JenniferConnelly, MilesTeller, ValK...
1    , MichelleYeoh, StephanieHsu, KeHuyQuan, James...
2    RobertPattinson, ZoëKravitz, JeffreyWright, Co...
3    SamNeill, LauraDern, JeffGoldblum, RichardAtte...
4       MarlonBrando, AlPacino, JamesCaan, DianeKeaton
5    TomHolland, Zendaya, BenedictCumberbatch, Jaco...
6      N.T.RamaRaoJr., RamCharan, AjayDevgn, AliaBhatt
7    JohnnyDepp, GeoffreyRush, OrlandoBloom, KeiraK...
8    TimothéeChalamet, RebeccaFerguson, Zendaya, Os...
9    RobertDeNiro, RayLiotta, JoePesci, LorraineBracco
Name: Stars, dtype: object

In [515]:
#People are probably interested in the leading actors so we keep the 2 protagonists and discard the rest
Lead_Actors=df2['Stars'].str.split(',', n=3).str[:3]
for Actor in range(len(Lead_Actors)): 
    Lead_Actors[Actor]=Lead_Actors[Actor][0:2] if Lead_Actors[Actor][0]!='' else Lead_Actors[Actor][1:3] #Some lists' first entry is blank so we skip it 

In [516]:
Lead_Actors.head(5)

0    [TomCruise,  JenniferConnelly]
1    [ MichelleYeoh,  StephanieHsu]
2    [RobertPattinson,  ZoëKravitz]
3            [SamNeill,  LauraDern]
4         [MarlonBrando,  AlPacino]
Name: Stars, dtype: object

In [517]:
Lead_Actors[0][0]

'TomCruise'

In [518]:
Processed_Dataframe=pd.concat([Processed_Dataframe,df2['Director'],Lead_Actors],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]"
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]"
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]"
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]"
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]"


In [519]:
#Check how many movies Tom Cruise has stared in...
for i in range(len(Lead_Actors)):
    if('TomCruise' in Lead_Actors[i]):
        print(Processed_Dataframe.iloc[i])

Movie_ID                                 0
Title                    Top Gun: Maverick
Director                    JosephKosinski
Stars       [TomCruise,  JenniferConnelly]
Name: 0, dtype: object
Movie_ID                               29
Title       Mission: Impossible - Fallout
Director             ChristopherMcQuarrie
Stars           [TomCruise,  HenryCavill]
Name: 29, dtype: object
Movie_ID                             93
Title                    A Few Good Men
Director                      RobReiner
Stars       [TomCruise,  JackNicholson]
Name: 93, dtype: object
Movie_ID                         111
Title               Edge of Tomorrow
Director                   DougLiman
Stars       [TomCruise,  EmilyBlunt]
Name: 111, dtype: object
Movie_ID                          174
Title                The Last Samurai
Director                  EdwardZwick
Stars       [TomCruise,  KenWatanabe]
Name: 174, dtype: object
Movie_ID                           201
Title                  Minority Report
D

In [520]:
#Category Inspection
df2['Category'].head(10) #List of genres

0                Action,Drama
1     Action,Adventure,Comedy
2          Action,Crime,Drama
3     Action,Adventure,Sci-Fi
4                 Crime,Drama
5    Action,Adventure,Fantasy
6                Action,Drama
7    Action,Adventure,Fantasy
8      Action,Adventure,Drama
9       Biography,Crime,Drama
Name: Category, dtype: object

In [None]:
df2['Category'].value_counts(dropna=False) #NaN value noted

In [521]:
#Number of distinct categories in movies
Categories=df2['Category'].str.split(',')
genres=[]
for movie in range(len(Categories)):
    if(isinstance(Categories[movie], float)):
        #print(Categories[movie])
        continue
    else:
        for genre in Categories[movie]:
            if(genre not in genres):
                genres.append(genre)
genres=sorted(genres)

In [522]:
#A nan value is included in categories so the movie is not categorised in terms of genre
Categories[639]

nan

In [523]:
genres
#len(genres)

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western']

In [524]:
#One hot-Encoding Movie Categories 
Categories_Encoded = pd.DataFrame(hard_coded, columns=categories)
Categories_Encoded=Categories_Encoded.astype(int)

In [525]:
Categories_Encoded.head(10)

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [526]:
#Locating where the NaN value is in the original dataset
idx=df2[df2['Category'].isna()].index #RowNumber of NaN value
Categories_Encoded.iloc[idx]=0 #Setting all genre values to 0 for that movie

In [527]:
Categories_Encoded.iloc[639] #Check Nan Values has been taken care of

Action       0
Adventure    0
Animation    0
Biography    0
Comedy       0
Crime        0
Drama        0
Family       0
Fantasy      0
Film-Noir    0
History      0
Horror       0
Music        0
Musical      0
Mystery      0
Romance      0
Sci-Fi       0
Sport        0
Thriller     0
War          0
Western      0
Name: 639, dtype: int32

In [528]:
Processed_Dataframe=pd.concat([Processed_Dataframe,Categories_Encoded],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [397]:
#Duration Inspection
df2['Duration'].head(10)
#df2['Duration'].value_counts(dropna=False) 
#1 NaN value and 1 Drama,Romance Value wrongly set 
#while the rest are durations in minutes followed by the ending mins which can be omitted

Index([785], dtype='int64')

In [None]:
#Censor Rating Inspection 
#df2['Censor-board-rating'].value_counts(dropna=False) 
#df2['Censor-board-rating'].unique()

#and Category inspection 
#df2['Category'].value_counts(dropna=False)

#through these inspections and some internet search regarding the entries in lines 639 and 785 it was found that the data
#were correct but were inserted in the wrong order in columns 'Category','Duration' and 'Censor-board-rating'

In [529]:
#Fixing data in rows 639 and 785 in the copy of the original dataframe
#The two invalid entries have information that are misalligned
#df2[639:640] #Initially
df2.loc[639,'Category']=df2.loc[639,'Duration']
df2.loc[639,'Duration']=df2.loc[639,'Censor-board-rating']
df2.loc[639,'Censor-board-rating']='Unrated'
#df2[639:640] #check results

#df2[785:786] #Initially
df2.loc[785,'Duration']=df2.loc[785,'Censor-board-rating']
df2.loc[785,'Censor-board-rating']='Unrated'
df2[785:786] #Check results

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
785,Ayla: The Daughter of War,"CanUlkay,",", ÇetinTekindor, IsmailHacioglu, Kyung-jinLee,...",8.3,"Biography,Drama,History",125min,Unrated,2017


In [530]:
# Remove 'min' from all entries in the 'duration' column
df2['Duration'] = df2['Duration'].str.replace('min', '')
df2['Duration'].head(5)

0    130
1    139
2    176
3    127
4    175
Name: Duration, dtype: object

In [531]:
#Now that the data have been correctly inserted
Duration=df2['Duration']
#Duration.head(5)

In [532]:
Processed_Dataframe=pd.concat([Processed_Dataframe,Duration],axis=1)
#Processed_Dataframe.head(5)

In [406]:
df2['Censor-board-rating'].head(10) #NEED ENCODING

0    UA
1     R
2    UA
3    UA
4     A
5    UA
6    UA
7    UA
8    UA
9     A
Name: Censor-board-rating, dtype: object

In [533]:
#df2['Censor-board-rating'].any() #Check if there are NaN values
#df2['Censor-board-rating'].unique() #Proof
#df2['Censor-board-rating'].value_counts(dropna=False) #How many NaN values (154)

In [534]:
#Set NaN values to Unrated because that is what not having a censorhip rating practically means
Censor_Rating=df2['Censor-board-rating']
Censor_Rating=Censor_Rating.fillna('Unrated')
Censor_Rating.value_counts(dropna=False) #Check Results

Censor-board-rating
U           270
A           215
UA          197
Unrated     158
R            76
PG-13        20
18           13
NotRated     11
PG           10
16            8
13            6
7             5
G             2
U/A           2
15+           2
UA13+         1
12+           1
All           1
(Banned)      1
M/PG          1
Name: count, dtype: int64

In [535]:
#Encoding Censor Ratings
Censor_rating_full=pd.get_dummies(Censor_Rating)
Censor_rating_full=Censor_rating_full.astype(int)
Censor_rating_full.head(5)

Unnamed: 0,(Banned),12+,13,15+,16,18,7,A,All,G,M/PG,NotRated,PG,PG-13,R,U,U/A,UA,UA13+,Unrated
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [536]:
#Check Results
#Censor_rating_full['Check']=censor_rating_full.sum(axis=1) #Sum needs to be 1 for each row because each movie has a single rating
#Censor_rating_full=censor_rating_full.drop(axis=1,columns=['Check'])
#Censor_rating_full.head(5)

In [537]:
#Grouping Ratings into 4 main categories
Censor_rating_full['Everyone']=Censor_rating_full.loc[:,['G','U','All','7']].max(axis=1)
Censor_rating_full['PG/13+']=Censor_rating_full.loc[:,['M/PG','PG','13','PG-13','12+','UA13+','UA']].max(axis=1)
Censor_rating_full['Over15']=Censor_rating_full.loc[:,['15+','16','18','A','M/PG','R','(Banned)']].max(axis=1)
Censor_rating_full['NO_RATING']=Censor_rating_full.loc[:,['Unrated','NotRated']].max(axis=1)
Censor_rating_full.head(5)

Unnamed: 0,(Banned),12+,13,15+,16,18,7,A,All,G,M/PG,NotRated,PG,PG-13,R,U,U/A,UA,UA13+,Unrated,Everyone,PG/13+,Over15,NO_RATING
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [538]:
Processed_Dataframe=pd.concat([Processed_Dataframe,Censor_rating_full['Everyone'],Censor_rating_full['PG/13+'],Censor_rating_full['Over15'],Censor_rating_full['NO_RATING']],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Duration,Everyone,PG/13+,Over15,NO_RATING
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130,0,1,0,0
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,139,0,0,1,0
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,176,0,1,0,0
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,127,0,1,0,0
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,175,0,0,1,0


In [539]:
#ReleaseYear Inspection
#df2['ReleaseYear'].value_counts(dropna=False).head(10)
#df2['ReleaseYear'].unique()
#The movies are going to be split into eras
#Each era is going to have a similar amount of movies (circa 200)

In [540]:
len(df2[df2['ReleaseYear']<=1950]) #Count movie released until 1950

81

In [541]:
len(df2[(df2['ReleaseYear']>2000) & (df2['ReleaseYear']<=2010)]) #Count movie released after 2000 until 2010

232

In [542]:
#Era 1:1960 or earlier
df2['Till60']=(df2['ReleaseYear']<=1960).astype(int) 
df2['Till60'].head(5)

0    0
1    0
2    0
3    0
4    0
Name: Till60, dtype: int32

In [543]:
#Era 2:After 1960 till 1980 
df2['Till80']=((df2['ReleaseYear']>1960) & (df2['ReleaseYear']<=1980)).astype(int)

In [544]:
#Era 3:After 1980 till 2000 
df2['Till00']=((df2['ReleaseYear']>1980) & (df2['ReleaseYear']<=2000)).astype(int)

In [545]:
#Era 4:After 2000 till 2010 
df2['Till10']=((df2['ReleaseYear']>2000) & (df2['ReleaseYear']<=2010)).astype(int)

In [546]:
#Era 5:After 2010 till today
df2['Till20']=(df2['ReleaseYear']>2010).astype(int)

In [547]:
ReleaseYears=df2.loc[:,'Till60':'Till20']
ReleaseYears.head(5)

Unnamed: 0,Till60,Till80,Till00,Till10,Till20
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,1,0,0
4,0,1,0,0,0


In [548]:
df2.head(5)

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear,Till60,Till80,Till00,Till10,Till20
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130,UA,2022,0,0,0,0,1
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139,R,2022,0,0,0,0,1
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176,UA,2022,0,0,0,0,1
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127,UA,1993,0,0,1,0,0
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175,A,1972,0,1,0,0,0


In [549]:
#Final Table with Ratings Included
Processed_Dataframe=pd.concat([Processed_Dataframe,ReleaseYears,df2['IMDb-Rating']],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Duration,Everyone,PG/13+,Over15,NO_RATING,Till60,Till80,Till00,Till10,Till20,IMDb-Rating
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130,0,1,0,0,0,0,0,0,1,8.6
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,139,0,0,1,0,0,0,0,0,1,8.3
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,176,0,1,0,0,0,0,0,0,1,7.9
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,127,0,1,0,0,0,0,1,0,0,8.2
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,175,0,0,1,0,0,1,0,0,0,9.2


In [557]:
#Non-encoded Data
TitleTable=dataframe.iloc[:,0:4]
TitleTable=pd.concat([TitleTable,df2['Category'],df2['Duration'],df2['Censor-board-rating'],df2['ReleaseYear'],df2['IMDb-Rating']],axis=1)
TitleTable.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Category,Duration,Censor-board-rating,ReleaseYear,IMDb-Rating
0,1,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]","Action,Drama",130,UA,2022,8.6
1,2,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]","Action,Adventure,Comedy",139,R,2022,8.3
2,3,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]","Action,Crime,Drama",176,UA,2022,7.9
3,4,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]","Action,Adventure,Sci-Fi",127,UA,1993,8.2
4,5,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]","Crime,Drama",175,A,1972,9.2


In [552]:
#Encoded Data that will be used to make recommendations to users 
#Containg 'Genre','Censor-board-rating' and 'ReleaseYear' 
Encoded_Data=dataframe.iloc[:,4:]
Encoded_Data=Encoded_Data.drop(['Duration'],axis=1)
Encoded_Data.head(5)

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Everyone,PG/13+,Over15,NO_RATING,Till60,Till80s,Till00s,Till10s,Till20s
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0


In [553]:
### END OF: Data Inspection and Preprocessing ###

In [554]:
### RECOMMENDATION SYSTEMS ###

In [555]:
## RECOMMENDATION METHOD 1: MOVIE SIMILARITY ##

In [561]:
#Use Cosine Similarity Metric to compare the one-hot encoded movie of interest with the rest of the movies of the dataset
#and find the top 5 most similar movies based on 'Genre','Censor-board-rating','ReleaseYear' and order them by their'IMDb-Rating'

#Choose a random movie based on its Movie_ID
MovieID=random.randint(0,999) #0 to 999 inclusive
#print(MovieID)

#Cosine Similarity as defined in sklearn.metrics.pairwise library
similarities=cosine_similarity(Encoded_Data)
Main_Movie=similarities[MovieID:MovieID+1,:] #Movie of interest

# Finding the indices of the 6 most similar movies
indices = np.argsort(Main_Movie, axis=1)
top6_indices = indices[0,-6:]
#print(top6_indices)

remove_list=[MovieID]    #Check if the Main Movie is in the list of most similar movies and remove it if that is the case
top5_Movies = [idx for idx in top6_indices if idx not in remove_list]    
top5_Movies=top5_Movies[-6:-1] if len(top5_Movies)>5 else top5_Movies #keep 5 movies
#print(top5_Movies)

## Top 5 most similar movies with highest rating
Results=TitleTable.iloc[top5_Movies,:]
Results.sort_values(by='IMDb-Rating',ascending=False)

Unnamed: 0,Movie_ID,Title,Director,Stars,Category,Duration,Censor-board-rating,ReleaseYear,IMDb-Rating
307,308,3 Idiots,RajkumarHirani,"[AamirKhan, Madhavan]","Comedy,Drama",170,UA,2009,8.4
901,902,Les triplettes de Belleville,SylvainChomet,"[MichèleCaucheteux, Jean-ClaudeDonda]","Animation,Comedy,Drama",80,PG-13,2003,7.7
637,638,Flipped,RobReiner,"[MadelineCarroll, CallanMcAuliffe]","Comedy,Drama,Romance",90,PG,2010,7.7
289,290,(500) Days of Summer,MarcWebb,"[ZooeyDeschanel, JosephGordon-Levitt]","Comedy,Drama,Romance",95,UA,2009,7.7
377,378,Lost in Translation,SofiaCoppola,"[BillMurray, ScarlettJohansson]","Comedy,Drama",102,UA,2003,7.7


In [484]:
#Movie of interest
TitleTable.iloc[MovieID,:]

Movie_ID                                        665
Title                            The Maltese Falcon
Director                                 JohnHuston
Stars                  [HumphreyBogart,  MaryAstor]
Category                    Crime,Film-Noir,Mystery
Duration                                        100
Censor-board-rating                             NaN
IMDb-Rating                                     8.0
Name: 664, dtype: object

In [498]:
#Find the most similar movie to the last watched based on a custom Function of similarity (point-based)
#3 points if the GENRE is exactly the same
#2 points if 2 GENRE are the same
#1 point if only a single GENRE is the same
#1 point if the ERA of movie is the same
#1 point if Censor-board-rating --> Full Rating is a 5 star rating

#Choose random movie based on MovieID
MovieID=random.randint(0,999)
#print(MovieID)

def most_similar_movie(MovieID,Encoded_Data):
    ed=Encoded_Data.to_numpy() #convert to NumPy
    Last_Watched=ed[MovieID:MovieID+1] #Movie of interest
    scoring_criteria=[] #Keep only positions of 1s from the encoded data (sparse matrix)
    for i in range(Last_Watched.shape[1]):
        if(Last_Watched[0,i]==1):
            scoring_criteria.append(i)    
    Last_Watched_Movie_categories=0 #Number of genres of the last watched movie 
    for criterion in scoring_criteria:
        if(criterion<21): #If criterion is a genre
            Last_Watched_Movie_categories=Last_Watched_Movie_categories+1

    best_score=0
    num_movies = ed.shape[0] #Total movies in database to compare
    best_rating=0
    for movie in range(num_movies):
        score=0
        common_genres=0 #Add one if the last watched movie and the one that is being inspected belong to a similar 
        count_extra=0 #Add one if movie has a similar censoring level or was filmed in the same era (as defined in the encoded matrix)
        if movie==MovieID:
            continue
        else:
            for criterion in scoring_criteria: 
                if(criterion<21):
                    if(Last_Watched[0][criterion]==ed[movie][criterion]):
                        common_genres=common_genres+1
                elif(criterion>=21 and Last_Watched[0][criterion]==ed[movie][criterion]):
                    count_extra=count_extra+1
        if(Last_Watched_Movie_categories==1 and common_genres==1):
            score=3+count_extra
        elif(Last_Watched_Movie_categories>1 and common_genres==Last_Watched_Movie_categories-1):
            score=2+count_extra
        elif(Last_Watched_Movie_categories>1 and common_genres==Last_Watched_Movie_categories):
            score=3+count_extra
        if score>best_score:
            best_score=score
            best_movie=movie
            best_rating=ed[movie][-1]
        elif score==best_score and ed[movie][-1]>best_rating:
            best_score=score
            best_movie=movie
            best_rating=ed[movie][-1]
    return best_movie,best_score

best_movie,best_score=most_similar_movie(MovieID,Encoded_Data)
print("The recommended movie based on your last movie is...",TitleTable.loc[best_movie,'Title'])
print("With a similarity score of "+ str(best_score) + "/5!") #How similar is this movie to the last watched

#Full Details of recommended movie
TitleTable.iloc[best_movie:best_movie+1,:]

132
The recommended movie based on your last movie is... Toy Story 2
With a similarity score of 5/5!


Unnamed: 0,Movie_ID,Title,Director,Stars,Category,Duration,Censor-board-rating,ReleaseYear,IMDb-Rating
288,289,Toy Story 2,"JohnLasseter,","[LeeUnkrich, ]","Animation,Adventure,Comedy",92,U,1999,7.9


In [499]:
#Full Details of last watched movie
TitleTable.iloc[MovieID:MovieID+1,:]

Unnamed: 0,Movie_ID,Title,Director,Stars,Category,Duration,Censor-board-rating,ReleaseYear,IMDb-Rating
132,133,Toy Story,JohnLasseter,"[TomHanks, TimAllen]","Animation,Adventure,Comedy",81,U,1995,8.3


In [None]:
## RECOMMENDATION METHOD 2: Collaborative Filtering ##

In [None]:
#This method is based on having access to user data.
#Movies are recommended to users based on what user with similar watch history have enjoyed or disliked
#Since there was no used data and ratings on movies I created 10000 users and assigned randomly to them a 4 movie 
#watchlist from the original dataset alongside a random rating for each of those 4 movies from 1 to 3 (1:Bad 2:Okay 3:Good)

#Total number of movies
total_movies = 1000

#Generate movie ids and ratings for the main user
main_user_movies = np.random.choice(range(0, total_movies), size=4, replace=False)
main_user_ratings = np.random.choice([1, 2, 3], size=4)

# Store main user data as a dataframe
main_user_data = pd.DataFrame({'movie_id': main_user_movies, 'rating': main_user_ratings})

# Generate random data for 10000 users
users = []
for user_id in range(10000):
    movies_watched = np.random.choice(range(0, total_movies), size=4, replace=False)
    ratings_given = np.random.choice([1, 2, 3], size=4)
    users.append(pd.DataFrame({
        'user_id': user_id,
        'movie_id': movies_watched,
        'rating': ratings_given
    }))

#Collect all the users in a dataframe
all_users_data = pd.concat(users)

In [503]:
#Function to calculate similarity between two users
def calculate_similarity(main_user_data, other_user_data):
    #Find common movies
    common_movies = pd.merge(main_user_data, other_user_data, on='movie_id')
    
    if common_movies.empty:
        return 0  # No common movies
    
    #Calculate cosine similarity between the ratings of common movies
    similarity = cosine_similarity([common_movies['rating_x']], [common_movies['rating_y']])
    return similarity[0][0]

#Variable to store the most similar user and the highest similarity score
most_similar_user_id = None
highest_similarity = -1

#Find the most similar user
for user_id in range(10000):
    #Get data for current user
    user_data = all_users_data[all_users_data['user_id'] == user_id]
    
    #Calculate similarity
    similarity = calculate_similarity(main_user_data, user_data)
    
    #Check if this user is more similar than the previous users
    if similarity > highest_similarity:
        highest_similarity = similarity
        most_similar_user_id = user_id

#Get the most similar user's data
most_similar_user_data = all_users_data[all_users_data['user_id'] == most_similar_user_id]

#Recommend a movie that the most similar user has seen but the main user hasn't
recommended_movie = set(most_similar_user_data['movie_id']) - set(main_user_data['movie_id'])

# Print results
print(f"Most similar user ID: {most_similar_user_id}")
print(f"Movies rated by the main user:\n{main_user_data}")
print(f"Movies rated by the most similar user:\n{most_similar_user_data}")
print(f"Recommended movie(s): {recommended_movie}")

Most similar user ID: 18
Movies rated by the main user:
   movie_id  rating
0       324       1
1       201       3
2       157       1
3       501       2
Movies rated by the most similar user:
   user_id  movie_id  rating
0       18       564       1
1       18       501       2
2       18       227       3
3       18       629       3
Recommended movie(s): {227, 564, 629}
