In [352]:
import pandas as pd
import numpy as np
import copy
import random
from sklearn.metrics.pairwise import cosine_similarity

In [394]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [353]:
#Load Dataset and Create a Backup (df2)
df = pd.read_csv('IMDb_Data_final.csv')
df2=copy.deepcopy(df)
df2.head(5) #Show Data

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130min,UA,2022
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139min,R,2022
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176min,UA,2022
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127min,UA,1993
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175min,A,1972


In [354]:
### Data Inspection and Preprocessing ###

In [357]:
#Title Inspection
#df2['Title'].value_counts(dropna=False) 
#Check Title Duplicates (Drishyam, Scarface 2 times)
#title_counts=df2['Title'].value_counts()>1 #Check for duplicates

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear


In [358]:
#After inspection these are the original Malayalam version of 2013 and the Hindi remake of 2015 so both are valid
df2[df2['Title']=='Drishyam']

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
779,Drishyam,NishikantKamat,"AjayDevgn, ShriyaSaran, Tabu, RajatKapoor",8.2,"Crime,Drama,Mystery",163min,UA,2015
967,Drishyam,JeethuJoseph,"Mohanlal, Meena, AshaSharath, Ansiba",8.3,"Crime,Drama,Thriller",160min,U,2013


In [359]:
df2[df2['Title']=='Scarface']
#After inspection these are the original Scarface version of 1932 and the famous remake of 1983 so both are valid

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
96,Scarface,BrianDePalma,"AlPacino, MichellePfeiffer, StevenBauer, MaryE...",8.3,"Crime,Drama",170min,A,1983
837,Scarface,"HowardHawks,",", PaulMuni, AnnDvorak, KarenMorley, OsgoodPerkins",7.7,"Action,Crime,Drama",93min,,1932


In [360]:
#Creation of Processed Dataframe
Titles=df2['Title'] #Correct Titles (no changes needed)
Movie_ID = pd.Series(df2.index) #Creating a Movie_ID that corresponds to the row number
#Movie_ID.head(5)
Processed_Dataframe=pd.concat([Movie_ID, Titles],axis=1)
Processed_Dataframe.rename(columns={0:'Movie_ID'},inplace=True)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title
0,0,Top Gun: Maverick
1,1,Everything Everywhere All at Once
2,2,The Batman
3,3,Jurassic Park
4,4,The Godfather


In [364]:
#Director Inspection
#df2['Director'].value_counts(dropna=False) 
#Multiple Directors with more than one entry but entries are correct

Director
StevenSpielberg    13
AlfredHitchcock    13
AkiraKurosawa      11
HayaoMiyazaki      10
MartinScorsese     10
Name: count, dtype: int64

In [365]:
#Stars Inspection
df2['Stars'].head(20) #Movie Stars are in lists and are numerous

0     TomCruise, JenniferConnelly, MilesTeller, ValK...
1     , MichelleYeoh, StephanieHsu, KeHuyQuan, James...
2     RobertPattinson, ZoëKravitz, JeffreyWright, Co...
3     SamNeill, LauraDern, JeffGoldblum, RichardAtte...
4        MarlonBrando, AlPacino, JamesCaan, DianeKeaton
5     TomHolland, Zendaya, BenedictCumberbatch, Jaco...
6       N.T.RamaRaoJr., RamCharan, AjayDevgn, AliaBhatt
7     JohnnyDepp, GeoffreyRush, OrlandoBloom, KeiraK...
8     TimothéeChalamet, RebeccaFerguson, Zendaya, Os...
9     RobertDeNiro, RayLiotta, JoePesci, LorraineBracco
10    TimRobbins, MorganFreeman, BobGunton, WilliamS...
11    MarkHamill, HarrisonFord, CarrieFisher, AlecGu...
12    LeonardoDiCaprio, KateWinslet, BillyZane, Kath...
13    DanielRadcliffe, RupertGrint, RichardHarris, M...
14    LeonardoDiCaprio, BradPitt, MargotRobbie, Emil...
15       SeanAstin, JoshBrolin, JeffCohen, CoreyFeldman
16    ChristianBale, JustinTheroux, JoshLucas, BillSage
17    , RobertDowneyJr., ChrisEvans, MarkRuffalo

In [366]:
#People are probably interested in the leading actors so we keep the 2 protagonists and discard the rest
Lead_Actors=df2['Stars'].str.split(',', n=3).str[:3]
for Actor in range(len(Lead_Actors)): 
    Lead_Actors[Actor]=Lead_Actors[Actor][0:2] if Lead_Actors[Actor][0]!='' else Lead_Actors[Actor][1:3] #Some lists' first entry is blank so we skip it 

In [367]:
Lead_Actors.head(5)

0    [TomCruise,  JenniferConnelly]
1    [ MichelleYeoh,  StephanieHsu]
2    [RobertPattinson,  ZoëKravitz]
3            [SamNeill,  LauraDern]
4         [MarlonBrando,  AlPacino]
Name: Stars, dtype: object

In [368]:
Lead_Actors[0][0]

'TomCruise'

In [369]:
Processed_Dataframe=pd.concat([Processed_Dataframe,df2['Director'],Lead_Actors],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]"
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]"
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]"
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]"
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]"


In [None]:
#Check how many movies Tom Cruise has stared in...
for i in range(len(Lead_Actors)):
    if('TomCruise' in Lead_Actors[i]):
        print(Processed_Dataframe.iloc[i])

In [40]:
#Category Inspection
df2['Category'].head(10) #List of genres

0                Action,Drama
1     Action,Adventure,Comedy
2          Action,Crime,Drama
3     Action,Adventure,Sci-Fi
4                 Crime,Drama
5    Action,Adventure,Fantasy
6                Action,Drama
7    Action,Adventure,Fantasy
8      Action,Adventure,Drama
9       Biography,Crime,Drama
Name: Category, dtype: object

In [None]:
df2['Category'].value_counts(dropna=False) #NaN value noted

In [376]:
#Number of distinct categories in movies
Categories=df2['Category'].str.split(',')
genres=[]
for movie in range(len(Categories)):
    if(isinstance(Categories[movie], float)):
        #print(Categories[movie])
        continue
    else:
        for genre in Categories[movie]:
            if(genre not in genres):
                genres.append(genre)
genres=sorted(genres)

In [388]:
#A nan value is included in categories so the movie is not categorised in terms of genre
Categories[639]

nan

In [None]:
genres
#len(genres)

In [391]:
#One hot-Encoding Movie Categories 
Categories_Encoded = pd.DataFrame(hard_coded, columns=categories)
Categories_Encoded=Categories_Encoded.astype(int)

In [380]:
Categories_Encoded.head(10)

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
8,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [400]:
#Locating where the NaN value is in the original dataset
idx=df2[df2['Category'].isna()].index #RowNumber of NaN value
Categories_Encoded.iloc[idx]=0 #Setting all genre values to 0 for that movie

Index([639], dtype='int64')

In [None]:
Categories_Encoded.iloc[639] #Check Nan Values has been taken care of

In [393]:
Processed_Dataframe=pd.concat([Processed_Dataframe,Categories_Encoded],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [397]:
#Duration Inspection
df2['Duration'].head(10)
#df2['Duration'].value_counts(dropna=False) 
#1 NaN value and 1 Drama,Romance Value wrongly set 
#while the rest are durations in minutes followed by the ending mins which can be omitted

Index([785], dtype='int64')

In [None]:
#Censor Rating Inspection 
#df2['Censor-board-rating'].value_counts(dropna=False) 
#df2['Censor-board-rating'].unique()

#and Category inspection 
#df2['Category'].value_counts(dropna=False)

#through these inspections and some internet search regarding the entries in lines 639 and 785 it was found that the data
#were correct but were inserted in the wrong order in columns 'Category','Duration' and 'Censor-board-rating'

In [401]:
#Fixing data in rows 639 and 785 in the copy of the original dataframe
#The two invalid entries have information that are misalligned
#df2[639:640] #Initially
df2.loc[639,'Category']=df2.loc[639,'Duration']
df2.loc[639,'Duration']=df2.loc[639,'Censor-board-rating']
df2.loc[639,'Censor-board-rating']='Unrated'
#df2[639:640] #check results

#df2[785:786] #Initially
df2.loc[785,'Duration']=df2.loc[785,'Censor-board-rating']
df2.loc[785,'Censor-board-rating']='Unrated'
df2[785:786] #Check results

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear
785,Ayla: The Daughter of War,"CanUlkay,",", ÇetinTekindor, IsmailHacioglu, Kyung-jinLee,...",8.3,"Biography,Drama,History",125min,Unrated,2017


In [403]:
# Remove 'min' from all entries in the 'duration' column
df2['Duration'] = df2['Duration'].str.replace('min', '')
df2['Duration'].head(5)

0    130
1    139
2    176
3    127
4    175
Name: Duration, dtype: object

In [404]:
#Now that the data have been correctly inserted
Duration=df2['Duration']
#Duration.head(5)

In [413]:
Processed_Dataframe=pd.concat([Processed_Dataframe,Duration],axis=1)
#Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Duration
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,139
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,176
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,127
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,175


In [406]:
df2['Censor-board-rating'].head(10) #NEED ENCODING

0    UA
1     R
2    UA
3    UA
4     A
5    UA
6    UA
7    UA
8    UA
9     A
Name: Censor-board-rating, dtype: object

In [418]:
#df2['Censor-board-rating'].any() #Check if there are NaN values
#df2['Censor-board-rating'].unique() #Proof
#df2['Censor-board-rating'].value_counts(dropna=False) #How many NaN values (154)

Censor-board-rating
U           270
A           215
UA          197
NaN         154
R            76
PG-13        20
18           13
NotRated     11
PG           10
16            8
13            6
7             5
Unrated       4
G             2
15+           2
U/A           2
UA13+         1
12+           1
All           1
(Banned)      1
M/PG          1
Name: count, dtype: int64

In [420]:
#Set NaN values to Unrated because that is what not having a censorhip rating practically means
Censor_Rating=df2['Censor-board-rating']
Censor_Rating=Censor_Rating.fillna('Unrated')
Censor_Rating.value_counts(dropna=False) #Check Results

Censor-board-rating
U           270
A           215
UA          197
Unrated     158
R            76
PG-13        20
18           13
NotRated     11
PG           10
16            8
13            6
7             5
G             2
U/A           2
15+           2
UA13+         1
12+           1
All           1
(Banned)      1
M/PG          1
Name: count, dtype: int64

In [422]:
#Encoding Censor Ratings
Censor_rating_full=pd.get_dummies(Censor_Rating)
Censor_rating_full=Censor_rating_full.astype(int)
Censor_rating_full.head(5)

Unnamed: 0,(Banned),12+,13,15+,16,18,7,A,All,G,M/PG,NotRated,PG,PG-13,R,U,U/A,UA,UA13+,Unrated
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [124]:
#Check Results
#Censor_rating_full['Check']=censor_rating_full.sum(axis=1) #Sum needs to be 1 for each row because each movie has a single rating
#Censor_rating_full=censor_rating_full.drop(axis=1,columns=['Check'])
#Censor_rating_full.head(5)

Unnamed: 0,(Banned),12+,13,15+,16,18,7,A,All,G,M/PG,NotRated,PG,PG-13,R,U,U/A,UA,UA13+,Unrated
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
#Grouping Ratings into 4 main categories
Censor_rating_full['Everyone']=Censor_rating_full.loc[:,['G','U','All','7']].max(axis=1)
Censor_rating_full['PG/13+']=Censor_rating_full.loc[:,['M/PG','PG','13','PG-13','12+','UA13+','UA']].max(axis=1)
Censor_rating_full['Over15']=Censor_rating_full.loc[:,['15+','16','18','A','M/PG','R','(Banned)']].max(axis=1)
Censor_rating_full['NO_RATING']=Censor_rating_full.loc[:,['Unrated','NotRated']].max(axis=1)
Censor_rating_full.head(5)

In [424]:
Processed_Dataframe=pd.concat([Processed_Dataframe,Censor_rating_full['Everyone'],Censor_rating_full['PG/13+'],Censor_rating_full['Over15'],Censor_rating_full['NO_RATING']],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Duration,Everyone,PG/13+,Over15,NO_RATING
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130,0,1,0,0
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,139,0,0,1,0
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,176,0,1,0,0
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,127,0,1,0,0
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,175,0,0,1,0


In [None]:
#ReleaseYear Inspection
#df2['ReleaseYear'].value_counts(dropna=False).head(10)
#df2['ReleaseYear'].unique()
#The movies are going to be split into eras
#Each era is going to have a similar amount of movies (circa 200)

In [425]:
len(df2[df2['ReleaseYear']<=1950]) #Count movie released until 1950

81

In [None]:
len(df[(df['ReleaseYear']>2000) & (df['ReleaseYear']<=2010)]) #Count movie released after 2000 until 2010

In [427]:
#Era 1:1960 or earlier
df2['Till60']=(df2['ReleaseYear']<=1960).astype(int) 
df2['Till60'].head(5)

0    0
1    0
2    0
3    0
4    0
Name: Till60s, dtype: int32

In [428]:
#Era 2:After 1960 till 1980 
df2['Till80']=((df2['ReleaseYear']>1960) & (df2['ReleaseYear']<=1980)).astype(int)

In [429]:
#Era 3:After 1980 till 2000 
df2['Till00']=((df2['ReleaseYear']>1980) & (df2['ReleaseYear']<=2000)).astype(int)

In [171]:
#Era 4:After 2000 till 2010 
df2['Till10']=((df2['ReleaseYear']>2000) & (df2['ReleaseYear']<=2010)).astype(int)

In [430]:
#Era 5:After 2010 till today
df2['Till20']=(df2['ReleaseYear']>2010).astype(int)

In [431]:
ReleaseYears=df2.loc[:,'Till60':'Till20']
ReleaseYears.head(5)

Unnamed: 0,Till60,Till60s,Till80,Till00,Till20
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,1,0,0


In [173]:
df2.head(5)

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear,Till60,Till80s,Till00s,Till10s,Till20s
0,Top Gun: Maverick,JosephKosinski,"TomCruise, JenniferConnelly, MilesTeller, ValK...",8.6,"Action,Drama",130,UA,2022,0,0,0,0,1
1,Everything Everywhere All at Once,"DanKwan,",", MichelleYeoh, StephanieHsu, KeHuyQuan, James...",8.3,"Action,Adventure,Comedy",139,R,2022,0,0,0,0,1
2,The Batman,MattReeves,"RobertPattinson, ZoëKravitz, JeffreyWright, Co...",7.9,"Action,Crime,Drama",176,UA,2022,0,0,0,0,1
3,Jurassic Park,StevenSpielberg,"SamNeill, LauraDern, JeffGoldblum, RichardAtte...",8.2,"Action,Adventure,Sci-Fi",127,UA,1993,0,0,1,0,0
4,The Godfather,FrancisFordCoppola,"MarlonBrando, AlPacino, JamesCaan, DianeKeaton",9.2,"Crime,Drama",175,A,1972,0,1,0,0,0


In [432]:
#Final Table with Ratings Included
Processed_Dataframe=pd.concat([Processed_Dataframe,ReleaseYears,df2['IMDb-Rating']],axis=1)
Processed_Dataframe.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Duration,Everyone,PG/13+,Over15,NO_RATING,Till60,Till60s,Till80,Till00,Till20
0,0,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,130,0,1,0,0,0,0,0,0,1
1,1,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,139,0,0,1,0,0,0,0,0,1
2,2,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,176,0,1,0,0,0,0,0,0,1
3,3,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,127,0,1,0,0,0,0,0,1,0
4,4,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,175,0,0,1,0,0,0,1,0,0


In [242]:
#Non-encoded Data (Duration and Rating not included)
TitleTable=dataframe.iloc[:,0:4]
TitleTable.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars
0,1,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]"
1,2,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]"
2,3,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]"
3,4,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]"
4,5,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]"


In [289]:
TitleTable=pd.concat([TitleTable,df2['IMDb-Rating']],axis=1) #Non-encoded Data (Duration not included)
TitleTable.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,IMDb-Rating
0,1,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",8.6
1,2,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",8.3
2,3,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",7.9
3,4,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",8.2
4,5,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",9.2


In [243]:
#Encoded Data that will be used to make recommendations to users
Encoded_Data=dataframe.iloc[:,4:]
Vectors=Vectors.drop(['Duration'],axis=1)
Vectors.head(5)

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,Everyone,PG/13+,Over15,NO_RATING,Till60,Till80s,Till00s,Till10s,Till20s
0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0


In [310]:
Vectors=pd.concat([Vectors,df2['IMDb-Rating']],axis=1)

In [None]:
### END OF: Data Inspection and Preprocessing ###

In [None]:
### RECOMMENDATION SYSTEMS ###

In [None]:
## RECOMMENDATION METHOD 1: MOVIE SIMILARITY ##

In [245]:
MovieID=random.randint(1,1000)

In [246]:
MovieID

61

In [247]:
from sklearn.metrics.pairwise import cosine_similarity

In [253]:
similarities=cosine_similarity(Vectors)
matrix=similarities[MovieID-1,:]

In [258]:
# Finding the index of the second highest value in each row
indices = np.argsort(matrix, axis=0)

# Getting the index of the second highest value
second_highest_indices = indices[-2] #check -1 to see that it is the same 

print("Indices of the second highest values in each row:")
print(second_highest_indices)

Indices of the second highest values in each row:
76


In [267]:
MostSimilarMovieID=second_highest_indices+1
MostSimilarMovieID

77

In [269]:
TitleTable.iloc[60]

Movie_ID                             61
Title       Zack Snyders Justice League
Director                     ZackSnyder
Stars        [HenryCavill,  BenAffleck]
Name: 60, dtype: object

In [270]:
TitleTable.iloc[MostSimilarMovieID-1]

Movie_ID                              77
Title       Kingsman: The Secret Service
Director                   MatthewVaughn
Stars        [ColinFirth,  TaronEgerton]
Name: 76, dtype: object

In [271]:
df2.iloc[60]

Title                                  Zack Snyders Justice League
Director                                                ZackSnyder
Stars                  HenryCavill, BenAffleck, GalGadot, AmyAdams
IMDb-Rating                                                    8.0
Category                                  Action,Adventure,Fantasy
Duration                                                       242
Censor-board-rating                                             18
ReleaseYear                                                   2021
Till60                                                           0
Till80s                                                          0
Till00s                                                          0
Till10s                                                          0
Till20s                                                          1
Name: 60, dtype: object

In [272]:
df2.iloc[76]

Title                                       Kingsman: The Secret Service
Director                                                   MatthewVaughn
Stars                  ColinFirth, TaronEgerton, SamuelL.Jackson, Mic...
IMDb-Rating                                                          7.7
Category                                         Action,Adventure,Comedy
Duration                                                             129
Censor-board-rating                                                    A
ReleaseYear                                                         2014
Till60                                                                 0
Till80s                                                                0
Till00s                                                                0
Till10s                                                                0
Till20s                                                                1
Name: 76, dtype: object

In [347]:
#Function of similarity
#3 points if genre is same, 1 if 1 element is same, 1 for era of movie same, 1 for Censor-board-rating --> Full is 5 star rating
MovieID4=random.randint(1,1000)
print(MovieID4)
#Vectors
def rate_movie(Vectors,MovieID4,TitleTable):
    vec=Vectors.to_numpy()
    past_movie=vec[MovieID4-1:MovieID4]
    positions=[]
    
    for i in range(past_movie.shape[1]):
        if(past_movie[0,i]==1):
            positions.append(i)
    #print(past_movie)    
    print(positions)
    categories=0
    for j in positions:
        if(j<21):
            categories=categories+1
    print(categories)
    best_score=0
    num_rows = vec.shape[0]
    #print(num_rows)
    best_rating=0
    for row in range(num_rows):
        score=0
        count1s=0
        count_extra=0
        if row==MovieID4-1:
            continue
        else:
            for index in positions: 
                if(index<21):
                    if(past_movie[0][index]==vec[row][index]):
                        count1s=count1s+1
                elif(index>=21 and past_movie[0][index]==vec[row][index]):
                    count_extra=count_extra+1
        if(categories==1 and count1s==1):
            score=3+count_extra
        elif(categories>1 and count1s==categories-1):
            score=2+count_extra
        elif(categories>1 and count1s==categories):
            score=3+count_extra
        if score>best_score:
            best_score=score
            best_movie=row
            best_rating=vec[row][-1]
            print("Good",best_score)
        elif score==best_score and vec[row][-1]>best_rating:
            best_score=score
            best_movie=row
            best_rating=vec[row][-1]
            print("Best",best_score)
    return best_movie,best_score

best_movie,best_score=rate_movie(Vectors,MovieID4,TitleTable)
print("Best Movie is...",best_movie)
print("With a cummulative score of "+ str(best_score) + "/5!")

df2.iloc[MovieID4-1:MovieID4,:]

763
[6, 8, 12, 23, 27]
3
Best 0
Best 0
Best 0
Good 3
Best 3
Good 4
Good 5
Best Movie is... 600
With a cummulative score of 5/5!


Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear,Till60,Till80s,Till00s,Till10s,Till20s
762,La double vie de Véronique,KrzysztofKieslowski,"IrèneJacob, WladyslawKowalski, HalinaGryglasze...",7.7,"Drama,Fantasy,Music",98,R,1991,0,0,1,0,0


In [348]:
df2.iloc[best_movie:best_movie+1,:]

Unnamed: 0,Title,Director,Stars,IMDb-Rating,Category,Duration,Censor-board-rating,ReleaseYear,Till60,Till80s,Till00s,Till10s,Till20s
600,Pink Floyd: The Wall,AlanParker,"BobGeldof, ChristineHargreaves, JamesLaurenson...",8.0,"Drama,Fantasy,Music",95,A,1982,0,0,1,0,0


In [346]:
# ONLY BASED ON GENRE
#MovieID2=1
MovieID2=random.randint(1,1000)
print(MovieID2)
similarities=cosine_similarity(Categories_Encoded)
matrix2=similarities[MovieID2-1,:]
#print(matrix2)
# Finding the index of the second highest value in each row
indices2 = np.argsort(matrix2, axis=0)
print(indices2)
# Getting the index of the second highest value
second_highest_indices2 = indices2[-12:-1]

remove_list=[MovieID2-1]    
filtered_list = [num for num in second_highest_indices2 if num not in remove_list]    
print(filtered_list)

print("Indices of the second highest values in each row:")
print(filtered_list)


698
[353 291 142 143 688 147 148 506 150 689 504 288 691 773 500 771 159 824
 889 163 693 165 166 401 511 695 393 137 781 111 777 300 114 668 298 922
 526 297 244 919 524 523 125 294 909 130 131 132 678 517 136 513 386 885
 354 835 858 857 262 212 260 723 456 853 219 258 417 419 223 448 444 428
 249 247 235 237 238 437 462 278 464 866 705 828 178 275 180 481 183 184
 272 187 188 874 709 710 409 195 196 197 268 414 265 715 264 717 108 121
 304 334 582 333 625 579 578 367  41 974 973  45 634 330  48 969 929 327
 326  54 325  56  57  58 335 586  30  29   1   3 356   5   7 359 361  11
 343  13 342 324  15  18 364 593 616 617 986 621 793  26 337  28 362  60
 433 372  77 556 318 650 376  83 314 952 312 380  76  91 543  95 309 308
  98 384 101 102 104 537 937  75 551 322 965  65  64  69  70  66  62 960
 562 763 389 421 783 420 762 396 744 360 742 358 741 398 385 430 355 432
 747 404 794 395 755 790 410 753 375 758 787 792 412 392 374 770 378 373
 785 369 391 776 784 415 759 751 788 303 435 56

In [286]:
df2.iloc[MovieID2-1]

Title                                                           Jai Bhim
Director                                                    T.J.Gnanavel
Stars                  Suriya, LijoMolJose, ManikandanK., RajishaVijayan
IMDb-Rating                                                          8.9
Category                                             Crime,Drama,Mystery
Duration                                                             164
Censor-board-rating                                                    A
ReleaseYear                                                         2021
Till60                                                                 0
Till80s                                                                0
Till00s                                                                0
Till10s                                                                0
Till20s                                                                1
Name: 563, dtype: object

In [288]:
df2.iloc[563]

Title                                                           Jai Bhim
Director                                                    T.J.Gnanavel
Stars                  Suriya, LijoMolJose, ManikandanK., RajishaVijayan
IMDb-Rating                                                          8.9
Category                                             Crime,Drama,Mystery
Duration                                                             164
Censor-board-rating                                                    A
ReleaseYear                                                         2021
Till60                                                                 0
Till80s                                                                0
Till00s                                                                0
Till10s                                                                0
Till20s                                                                1
Name: 563, dtype: object

In [298]:
TitleTable=pd.concat([TitleTable,df2['Category']],axis=1)
TitleTable.head(5)

Unnamed: 0,Movie_ID,Title,Director,Stars,IMDb-Rating,Category
0,1,Top Gun: Maverick,JosephKosinski,"[TomCruise, JenniferConnelly]",8.6,"Action,Drama"
1,2,Everything Everywhere All at Once,"DanKwan,","[ MichelleYeoh, StephanieHsu]",8.3,"Action,Adventure,Comedy"
2,3,The Batman,MattReeves,"[RobertPattinson, ZoëKravitz]",7.9,"Action,Crime,Drama"
3,4,Jurassic Park,StevenSpielberg,"[SamNeill, LauraDern]",8.2,"Action,Adventure,Sci-Fi"
4,5,The Godfather,FrancisFordCoppola,"[MarlonBrando, AlPacino]",9.2,"Crime,Drama"


In [300]:
## Movie with highest rating and weighted 
Results=TitleTable.iloc[filtered_list,:]
Results.sort_values(by='IMDb-Rating',ascending=False)


Unnamed: 0,Movie_ID,Title,Director,Stars,IMDb-Rating,Category
307,308,3 Idiots,RajkumarHirani,"[AamirKhan, Madhavan]",8.4,"Comedy,Drama"
120,121,Dead Poets Society,PeterWeir,"[RobinWilliams, RobertSeanLeonard]",8.1,"Comedy,Drama"
961,962,La règle du jeu,JeanRenoir,"[MarcelDalio, NoraGregor]",7.9,"Comedy,Drama"
718,719,Guess Whos Coming to Dinner,StanleyKramer,"[SpencerTracy, SidneyPoitier]",7.8,"Comedy,Drama"
79,80,The Breakfast Club,JohnHughes,"[EmilioEstevez, JuddNelson]",7.8,"Comedy,Drama"
371,372,Birdman or (The Unexpected Virtue of Ignorance),AlejandroG.Iñárritu,"[MichaelKeaton, ZachGalifianakis]",7.7,"Comedy,Drama"
377,378,Lost in Translation,SofiaCoppola,"[BillMurray, ScarlettJohansson]",7.7,"Comedy,Drama"
670,671,Short Cuts,RobertAltman,"[AndieMacDowell, JulianneMoore]",7.7,"Comedy,Drama"
434,435,Naked,MikeLeigh,"[DavidThewlis, LesleySharp]",7.7,"Comedy,Drama"
852,853,The Station Agent,TomMcCarthy,"[PeterDinklage, PatriciaClarkson]",7.6,"Comedy,Drama"


In [297]:
df2.iloc[MovieID2-1]

Title                                   Once Upon a Time in... Hollywood
Director                                                QuentinTarantino
Stars                  LeonardoDiCaprio, BradPitt, MargotRobbie, Emil...
IMDb-Rating                                                          7.6
Category                                                    Comedy,Drama
Duration                                                             161
Censor-board-rating                                                    A
ReleaseYear                                                         2019
Till60                                                                 0
Till80s                                                                0
Till00s                                                                0
Till10s                                                                0
Till20s                                                                1
Name: 14, dtype: object

In [None]:
##Create users and compare

In [302]:
# Step 2: Create the Users DataFrame
num_users = 10000
user_ids = np.arange(1, num_users + 1)  # User IDs from 1 to 10000
users_df = pd.DataFrame({'User_ID': user_ids})

# Step 3: Generate Watch History for Each User
# Function to randomly select 4 unique movie IDs for a user
def select_movies():
    return np.random.choice(TitleTable['Movie_ID'], size=4, replace=False).tolist()

# Create a new column 'Watch_History' in users_df
users_df['Watch_History'] = [select_movies() for _ in range(num_users)]

# Display the first few rows of the Users DataFrame
print(users_df.tail(6))

      User_ID         Watch_History
9994     9995  [630, 729, 761, 593]
9995     9996  [130, 562, 845, 139]
9996     9997   [925, 908, 967, 72]
9997     9998  [498, 122, 739, 726]
9998     9999     [702, 67, 6, 224]
9999    10000  [867, 699, 386, 358]


In [308]:
# Step 1: Convert the column to a NumPy array
np_array = users_df['Watch_History'].to_numpy()

# Step 2: Sort the lists in each row
sorted_array = np.array([sorted(row) for row in np_array])

test = pd.dataframe(sorted_array.tolist() # back to dataframe

<class 'NameError'>: name 'test' is not defined

In [None]:
## RECOMMENDATION METHOD 2: Collaborative Filtering ##

In [349]:
# Total number of movies
total_movies = 1000

# Randomly generate movie ids and ratings for the main user
main_user_movies = np.random.choice(range(1, total_movies + 1), size=4, replace=False)
main_user_ratings = np.random.choice([1, 2, 3], size=4)

# Store main user data
main_user_data = pd.DataFrame({'movie_id': main_user_movies, 'rating': main_user_ratings})

# Generate random data for 10000 users
users = []
for user_id in range(10000):
    movies_watched = np.random.choice(range(1, total_movies + 1), size=4, replace=False)
    ratings_given = np.random.choice([1, 2, 3], size=4)
    users.append(pd.DataFrame({
        'user_id': user_id,
        'movie_id': movies_watched,
        'rating': ratings_given
    }))

# Concatenate the list of user dataframes into one
all_users_data = pd.concat(users)

# Function to calculate similarity between two users
def calculate_similarity(main_user_data, other_user_data):
    # Find common movies
    common_movies = pd.merge(main_user_data, other_user_data, on='movie_id')
    
    if common_movies.empty:
        return 0  # No common movies
    
    # Calculate cosine similarity between the ratings of common movies
    similarity = cosine_similarity([common_movies['rating_x']], [common_movies['rating_y']])
    return similarity[0][0]

# Variable to store the most similar user and the highest similarity score
most_similar_user_id = None
highest_similarity = -1

# Loop through all users and find the most similar user
for user_id in range(10000):
    # Get data for current user
    user_data = all_users_data[all_users_data['user_id'] == user_id]
    
    # Calculate similarity
    similarity = calculate_similarity(main_user_data, user_data)
    
    # Check if this user is more similar than the previous users
    if similarity > highest_similarity:
        highest_similarity = similarity
        most_similar_user_id = user_id

# Get the most similar user's data
most_similar_user_data = all_users_data[all_users_data['user_id'] == most_similar_user_id]

# Recommend a movie that the most similar user has seen but the main user hasn't
recommended_movie = set(most_similar_user_data['movie_id']) - set(main_user_data['movie_id'])

# Print results
print(f"Most similar user ID: {most_similar_user_id}")
print(f"Movies rated by the main user:\n{main_user_data}")
print(f"Movies rated by the most similar user:\n{most_similar_user_data}")
print(f"Recommended movie(s): {recommended_movie}")

Most similar user ID: 23
Movies rated by the main user:
   movie_id  rating
0       330       1
1       107       3
2        16       3
3        50       1
Movies rated by the most similar user:
   user_id  movie_id  rating
0       23       821       3
1       23       131       3
2       23       609       2
3       23       330       1
Recommended movie(s): {609, 131, 821}


In [None]:
#Create Function of liking 