In [304]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import*

In [305]:
# Load the movie catalogue and the user ratings from CSV files in the working directory.
movies= pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv')

In [306]:
# Preview the first rows of the movies table (columns: movieId, title, genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [307]:
# Preview the first rows of the ratings table (columns: userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [308]:
# Column dtypes and non-null counts for the movies table.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [309]:
# (rows, columns) of the movies table.
movies.shape

(10329, 3)

In [310]:
# Column dtypes and non-null counts for the ratings table.
ratings.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [311]:
# (rows, columns) of the ratings table.
ratings.shape

(105339, 4)

In [312]:
#no null values are present in either table (every column's non-null count equals the row count in the .info() outputs above)

In [313]:
ratings.describe() # ratings range from 0.5 to 5.0; the mean is about 3.52

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [314]:
#different genres available: split the pipe-separated genre string into a list per movie
#only string values are split, so re-running this cell on the already-split lists is a no-op
#(a second `.str.split` over lists would turn every genre into NaN)
movies["genres"]=movies['genres'].map(lambda g:g.split('|') if isinstance(g,str) else g)

In [315]:
# One row per (movie, genre) pair; explode repeats the original row index for each genre.
movies2=movies.explode('genres')
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [316]:
#unique genres available (still includes the '(no genres listed)' placeholder at this point)
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'War', 'Musical', 'Documentary',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [317]:
# Number of distinct genre labels, placeholder included.
movies2["genres"].nunique()

20

In [318]:
#drop the rows whose genre is the '(no genres listed)' placeholder
has_genre=movies2['genres'].ne('(no genres listed)')
movies2=movies2.loc[has_genre]

In [319]:
# Genre labels remaining after the placeholder rows were removed.
movies2["genres"].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'War', 'Musical', 'Documentary',
       'Western', 'Film-Noir'], dtype=object)

In [320]:
# Number of distinct genres after the placeholder rows were removed.
movies2["genres"].nunique()

19

In [321]:
#attach title and genre to every rating (inner join on movieId);
#a rating appears once per genre of its movie because movies2 has one row per (movie, genre)
merged_data=ratings.merge(movies2,on='movieId',how="inner")
merged_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime
1,1,16,4.0,1217897793,Casino (1995),Drama
2,9,16,4.0,842686699,Casino (1995),Crime
3,9,16,4.0,842686699,Casino (1995),Drama
4,12,16,1.5,1144396284,Casino (1995),Crime
...,...,...,...,...,...,...
281892,668,140098,2.5,1450415424,Runoff (2015),Drama
281893,668,140816,2.5,1443288791,Tangerine (2015),Comedy
281894,668,140816,2.5,1443288791,Tangerine (2015),Drama
281895,668,142488,4.0,1451535844,Spotlight (2015),Thriller


In [322]:
# Mean rating and number of ratings for every (genre, title) pair.
popularity=(
    merged_data
    .groupby(['genres','title'])['rating']
    .agg(['mean','size'])
    .reset_index()
)
popularity.columns=["Genres","Title","Average_Ratings","Number_of_Ratings"]
popularity

Unnamed: 0,Genres,Title,Average_Ratings,Number_of_Ratings
0,Action,'71 (2014),3.500000,1
1,Action,'Hellboy': The Seeds of Creation (2004),3.000000,1
2,Action,10 to Midnight (1983),2.500000,1
3,Action,12 Rounds (2009),2.875000,4
4,Action,13 Assassins (Jûsan-nin no shikaku) (2010),3.500000,5
...,...,...,...,...
23093,Western,Wyatt Earp (1994),3.200000,30
23094,Western,Young Guns (1988),3.375000,36
23095,Western,Young Guns II (1990),3.083333,12
23096,Western,Young Ones (2014),2.000000,1


In [323]:
#popularity recommender system, worked example:
#Action titles with at least 50 ratings, best average first, top 7
is_action=popularity["Genres"]=='Action'
enough_ratings=popularity["Number_of_Ratings"]>=50
popularity[is_action&enough_ratings].sort_values(by="Average_Ratings",ascending=False).head(7)


Unnamed: 0,Genres,Title,Average_Ratings,Number_of_Ratings
1179,Action,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1076,Action,North by Northwest (1959),4.273973,73
975,Action,"Matrix, The (1999)",4.264368,261
1433,Action,Star Wars: Episode V - The Empire Strikes Back...,4.22807,228
1331,Action,Seven Samurai (Shichinin no samurai) (1954),4.217742,62
1199,Action,Raiders of the Lost Ark (Indiana Jones and the...,4.212054,224
747,Action,Inception (2010),4.18932,103


In [324]:
#popularity recommender system function
def TopNpopularMovies(genre,threshold,topN,data=None):
  """Return the best-rated movies of one genre that have enough ratings.

  Parameters
  ----------
  genre : str
      Genre label to filter on (matched exactly against the 'genres' column).
  threshold : int
      Minimum number of ratings a title needs in order to be considered.
  topN : int
      Maximum number of titles to return; values <= 0 yield an empty table.
  data : pandas.DataFrame, optional
      Frame with at least 'genres', 'title' and 'rating' columns. Defaults to
      the notebook-level `merged_data` so existing calls keep working.

  Returns
  -------
  pandas.DataFrame
      Columns 'Sno.', 'Movie Title', 'Average Movie Rating', 'Number of Review',
      ordered by average rating (descending) and indexed from 0.
  """
  output_columns=['Sno.','Movie Title','Average Movie Rating','Number of Review']
  if data is None:
    data=merged_data
  #filter to the requested genre first so only that genre's rows are aggregated
  genre_ratings=data[data['genres']==genre]
  if genre_ratings.empty:
    return pd.DataFrame(columns=output_columns)
  stats=genre_ratings.groupby('title')['rating'].agg(['mean','size']).reset_index()
  stats.columns=['Movie Title','Average Movie Rating','Number of Review']
  #ties on the average are broken by the number of ratings so the order is deterministic
  ranked=(
    stats[stats['Number of Review']>=threshold]
    .sort_values(by=['Average Movie Rating','Number of Review'],ascending=[False,False])
    .head(max(int(topN),0))
    .reset_index(drop=True)
  )
  #reset_index returned a fresh frame, so adding a column here does not write into a slice
  ranked.insert(0,'Sno.',range(1,len(ranked)+1))
  return ranked[output_columns]

In [325]:
#test case 1: top Action movies with at least 50 ratings
#NOTE(review): `thershold` is a misspelling of "threshold"; the name is left unchanged in case other cells reference it
genre='Action'
thershold=50
topN=18
TopNpopularMovies(genre=genre,threshold=thershold,topN=topN)

Unnamed: 0,Sno.,Movie Title,Average Movie Rating,Number of Review
0,1,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1,2,North by Northwest (1959),4.273973,73
2,3,"Matrix, The (1999)",4.264368,261
3,4,Star Wars: Episode V - The Empire Strikes Back...,4.22807,228
4,5,Seven Samurai (Shichinin no samurai) (1954),4.217742,62
5,6,Raiders of the Lost Ark (Indiana Jones and the...,4.212054,224
6,7,Inception (2010),4.18932,103
7,8,Star Wars: Episode IV - A New Hope (1977),4.188645,273
8,9,Fight Club (1999),4.188406,207
9,10,Blade Runner (1982),4.169872,156


In [326]:
#content-based recommendation system
#a TF-IDF vectorizer is applied to each movie's genres
#e.g. Toy Story >>> Adventure, Animation, Children, Comedy, Fantasy

In [327]:
# Reminder of the exploded layout: one row per (movie, genre).
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [328]:
# Collapse the exploded rows back to one row per title, with its genres joined by single spaces.
movies3=movies2.groupby('title')['genres'].agg(" ".join).reset_index()

In [329]:
# Each title now carries a single space-separated genre string.
movies3.head()

Unnamed: 0,title,genres
0,'71 (2014),Action Drama Thriller War
1,'Hellboy': The Seeds of Creation (2004),Action Adventure Comedy Documentary Fantasy
2,'Round Midnight (1986),Drama Musical
3,'Til There Was You (1997),Drama Romance
4,"'burbs, The (1989)",Comedy


In [330]:
#TfidfVectorizer instance used to turn each movie's genre string into a TF-IDF vector
#(word-level n-grams of length 1 to 3; `tf` and `tfidf` are two names bound to the same object)
#NOTE(review): the default token pattern presumably splits hyphenated genres such as "Sci-Fi" into separate tokens -- confirm this is intended
tf=tfidf=TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english')
tf

In [331]:
# Fit the vectorizer on the genre strings and transform them; row i corresponds to movies3 row i.
tf_matrix=tf.fit_transform(movies3['genres'])

In [332]:
# Pairwise cosine similarity between all movie genre vectors: entry [i, j] compares movies3 rows i and j.
# NOTE(review): this is a dense n x n array (n = number of titles), so memory grows quadratically with the catalogue -- confirm that is acceptable.
cosine_sim=cosine_similarity(tf_matrix,tf_matrix)
cosine_sim

array([[1.        , 0.02677945, 0.02931913, ..., 0.10229517, 0.        ,
        0.        ],
       [0.02677945, 1.        , 0.        , ..., 0.03626651, 0.02411583,
        0.02863994],
       [0.02931913, 0.        , 1.        , ..., 0.        , 0.        ,
        0.35526663],
       ...,
       [0.10229517, 0.03626651, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.02411583, 0.        , ..., 0.        , 1.        ,
        0.07090711],
       [0.        , 0.02863994, 0.35526663, ..., 0.        , 0.07090711,
        1.        ]])

In [363]:
 #final Function
def recommedation_genre(movie_df,similarity_matrix,movie_title,topN):
  """Return the `topN` movies whose genre vectors are most similar to `movie_title`.

  Parameters
  ----------
  movie_df : pandas.DataFrame
      Frame with a 'title' column; row position i must correspond to row and
      column i of `similarity_matrix`.
  similarity_matrix : array-like, shape (n_movies, n_movies)
      Pairwise similarity scores (higher means more similar).
  movie_title : str
      Exact title of the movie to find neighbours for.
  topN : int
      Maximum number of recommendations; values <= 0 yield an empty table.

  Returns
  -------
  pandas.DataFrame
      Columns 'Sno' (1-based rank) and 'Movie Title', indexed from 0.

  Raises
  ------
  KeyError
      If `movie_title` does not occur in `movie_df['title']`.
  """
  #row positions of the target title (positions, not index labels, address the matrix)
  hits=np.flatnonzero((movie_df['title']==movie_title).to_numpy())
  if hits.size==0:
    raise KeyError(f"Movie title not found: {movie_title!r}")
  scores=np.asarray(similarity_matrix[int(hits[0])],dtype=float)
  #stable sort keeps equally similar movies in their original row order
  ranked=np.argsort(-scores,kind='stable')
  #remove the target itself by position instead of dropping whatever ranks first:
  #other movies with identical genres tie at the top score and may sort ahead of it
  ranked=ranked[~np.isin(ranked,hits)][:max(int(topN),0)]
  matched_titles=movie_df['title'].iloc[ranked].to_numpy()
  #build a fresh frame so nothing is written into a slice of movie_df
  return pd.DataFrame({'Sno':range(1,len(matched_titles)+1),'Movie Title':matched_titles})

In [364]:
# Example: ten titles with the most similar genre profile to 'Shrek the Third (2007)'.
recommedation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title='Shrek the Third (2007)',topN=10)

Unnamed: 0,Sno,Movie Title
0,1,Antz (1998)
1,2,Asterix and the Vikings (Astérix et les Viking...
2,3,"Boxtrolls, The (2014)"
3,4,DuckTales: The Movie - Treasure of the Lost La...
4,5,"Emperor's New Groove, The (2000)"
5,6,"Monsters, Inc. (2001)"
6,7,"Tale of Despereaux, The (2008)"
7,8,Toy Story (1995)
8,9,Toy Story 2 (1999)
9,10,Turbo (2013)


In [365]:
#interactive widgets

In [366]:
#popularity based system
#inputs
#keeps long descriptions from being truncated
label_style={'description_width':'initial'}
#sorted so the dropdown order is the same on every run (set iteration order is not)
genres=Dropdown(options=sorted(movies2['genres'].dropna().unique()),description='Genres',style=label_style)
#non-zero defaults so clicking the button straight away returns something
num_review=IntText(value=50,description="Minimum Reviews",style=label_style)
num_recommendation_1=IntText(value=10,description="Number of Recommendations",style=label_style)


#tabs
b1=Button(description='RECOMMEND ME')
h1=HBox([num_review,num_recommendation_1])
popularity_tab1=VBox([genres,h1,b1])



#content based system
#inputs
#single-line input: a title never spans several lines, and a stray newline would break the exact-title lookup
title=Text(description="Movie Title",placeholder='e.g. Toy Story (1995)',style=label_style)
num_recommendation_2=IntText(value=10,description="Number of Recommendations",style=label_style)


#tabs
h2=HBox([title,num_recommendation_2])
b2=Button(description='RECOMMEND ME')

content_tab=VBox([h2,b2])


#creating final tabs
tabs=[popularity_tab1,content_tab]
wid=Tab(children=tabs)

#set titles to the tabs
names=['Popularity Based Recommendation','Content Based Recommendation']
for tab_index,tab_name in enumerate(names):
  wid.set_title(tab_index,tab_name)

display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Children', 'IMAX', 'Adventure', 'War', '…

In [372]:
#setting up events to respond when a button is clicked
#results are rendered in `results_area` (displayed at the end of this cell) and also kept in
#the module-level `output` so later cells can inspect the most recent recommendation
#NOTE(review): re-running this cell presumably registers the handlers a second time -- confirm, and restart the kernel if buttons fire twice
results_area=Output()
output=None  #defined up front so the `output` cell does not fail before the first click

def _show_result(result):
  """Store `result` in the global `output` and render it inside `results_area`."""
  global output
  output=result
  results_area.clear_output(wait=True)
  with results_area:
    display(result)

#popularity based recommendation
def b1_clicked(b):
  """Click handler: recommend the top-rated movies for the selected genre."""
  _show_result(TopNpopularMovies(genre=genres.value,threshold=num_review.value,topN=num_recommendation_1.value))
b1.on_click(b1_clicked)


#content based recommendation
def b2_clicked(b):
  """Click handler: recommend movies with genres similar to the entered title."""
  #surrounding whitespace/newlines would make the exact-title lookup fail
  movie_title=title.value.strip()
  try:
    result=recommedation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title=movie_title,topN=num_recommendation_2.value)
  except KeyError:
    #unknown title: tell the user instead of raising inside the widget callback
    results_area.clear_output(wait=True)
    with results_area:
      print(f"No movie titled {movie_title!r} was found. Enter the exact title, e.g. 'Toy Story (1995)'.")
    return
  _show_result(result)
b2.on_click(b2_clicked)

display(results_area)


In [386]:
# Show the tab widget again now that the click handlers are attached.
display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Children', 'IMAX', 'Adventure', 'War', '…

In [388]:
# Most recent recommendation table stored by the button click handlers.
output

Unnamed: 0,Sno,Movie Title
0,1,Tarzan (1999)
1,2,"Flight of Dragons, The (1982)"
2,3,Kiki's Delivery Service (Majo no takkyûbin) (1...
3,4,Little Nemo: Adventures in Slumberland (1992)
4,5,Watership Down (1978)
5,6,Bambi (1942)
6,7,"Fox and the Hound, The (1981)"
7,8,Ratatouille (2007)
8,9,Anastasia (1997)
9,10,101 Dalmatians (One Hundred and One Dalmatians...
