# Soal 2 - Film Bagus 🎥

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

__Load Dataset__

In [2]:
dfMov = pd.read_csv('movies.csv')
dfMov

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


## 1. __Content-Based Filtering__ (Recommendation for Joko)

__Use count vectorizer to count the words for each data in the column 'genres'__

In [21]:
cv = CountVectorizer()
cm = cv.fit_transform(dfMov['genres'])

In [22]:
cv.get_feature_names()

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'fi',
 'film',
 'genres',
 'horror',
 'imax',
 'listed',
 'musical',
 'mystery',
 'no',
 'noir',
 'romance',
 'sci',
 'thriller',
 'war',
 'western']

__Matrix freq each word in each data__

In [23]:
cm.toarray()

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

__Find cosine similarity of each data in matrix freq__

In [24]:
cosScore = cosine_similarity(cm.toarray())
cosScore

array([[1.        , 0.77459667, 0.31622777, ..., 0.4472136 , 0.        ,
        0.        ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.70710678, 0.        ,
        0.        ],
       ...,
       [0.4472136 , 0.        , 0.70710678, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

___"Joko sangat menyukai film bergenre animasi & action, terutama film Superman vs. The Elite (2012)."___

__Find the index for movie 'Superman vs. The Elite (2012)'__

In [5]:
dfMov[dfMov['title'] == 'Superman vs. The Elite (2012)']

Unnamed: 0,movieId,title,genres
9370,94974,Superman vs. The Elite (2012),Action|Animation


In [240]:
Joko = dfMov[dfMov['title'] == 'Superman vs. The Elite (2012)']['movieId']
Joko = Joko.index.to_list()[0]
Joko

9370

__Find similar movies using cosScore, then sort to make the similarity percentage in descending order__

In [245]:
similarMov = list(enumerate(cosScore[Joko]))
similarMov = sorted(similarMov, key=lambda x: x[1], reverse=True)
similarMov[:6] # 6 because it includes 'Superman vs. The Elite (2012)', will drop it later

[(6260, 0.9999999999999998),
 (8637, 0.9999999999999998),
 (9370, 0.9999999999999998),
 (9570, 0.9999999999999998),
 (10167, 0.9999999999999998),
 (10277, 0.9999999999999998)]

#### __Top 5 Movies Recommendation for Joko (all genres):__

In [243]:
dfMJoko = []
for i in similarMov[:6]:
    dfMJoko.append(dfMov.iloc[i[0]])
dfMJoko = pd.DataFrame(dfMJoko)
dfMJoko = dfMJoko.drop(Joko) # to remove 'Superman vs. The Elite (2012)' from the recommendation
dfMJoko

Unnamed: 0,movieId,title,genres
6260,26913,Street Fighter II: The Animated Movie (Sutorît...,Action|Animation
8637,79274,Batman: Under the Red Hood (2010),Action|Animation
9570,99813,"Batman: The Dark Knight Returns, Part 2 (2013)",Action|Animation
10167,124867,Justice League: Throne of Atlantis (2015),Action|Animation
10277,138104,Justice League: Gods and Monsters (2015),Action|Animation


## 2. __Collaborative Filtering__ (Recommendation for Widodo)

__Load Dataset__

In [14]:
dfRate = pd.read_csv('ratings.csv')
dfRate

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


__Find max & min rate (for reference)__

In [104]:
maxRate = dfRate['rating'].max()
minRate = dfRate['rating'].min()
maxRate, minRate

(5.0, 0.5)

__Use pivot_table to make a DataFrame where index = 'userId' and columns = 'movieID'__
<br>_In my opinion the timestamp doesn't really affect recommendation so I drop it_

In [78]:
dfRate2 = dfRate.pivot_table(
    index = 'userId',
    columns = 'movieId'
)
dfRate2 = dfRate2['rating']
dfRate2

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,5.0,,2.0,,3.0,,,,,,...,,,,,,,,,,
3,,,,,3.0,,3.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,,,,,,,,,,,...,,,,,,,,,,
665,,,,,,,,,,,...,,,,,,,,,,
666,,,,,,,,,,,...,,,,,,,,,,
667,,,,,,,,,,,...,,,,,,,,,,


__Replace NaN Value with 0__
<br>*I assume the NaN value is there because people haven't watch the movie yet or haven't give ratings so I replace it with 0*

In [64]:
dfRate2 = dfRate2.replace(np.NaN, 0)
dfRate2

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


__Calculate correlation (Pearson)__

In [66]:
dfRate2cor = dfRate2.corr()
dfRate2cor

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.229124,0.222062,0.079319,0.243736,0.211601,0.237869,0.066370,0.026712,0.156136,...,0.053224,-0.027328,0.054386,-0.027328,0.026201,-0.027328,-0.027328,-0.027328,0.026451,-0.027328
2,0.229124,1.000000,0.108133,0.085610,0.123400,0.182489,0.111970,0.035200,0.043333,0.314376,...,0.082551,0.082551,0.035785,0.082551,0.115655,-0.015020,-0.015020,0.082551,0.081235,0.082551
3,0.222062,0.108133,1.000000,0.145779,0.469828,0.224401,0.281310,0.161308,0.188836,0.039788,...,-0.011447,-0.011447,0.032102,-0.011447,-0.016037,-0.011447,-0.011447,-0.011447,0.029735,-0.011447
4,0.079319,0.085610,0.145779,1.000000,0.095993,0.136973,0.067767,0.184446,0.006738,0.127035,...,-0.004763,-0.004763,-0.006649,-0.004763,-0.006673,-0.004763,-0.004763,-0.004763,-0.008250,-0.004763
5,0.243736,0.123400,0.469828,0.095993,1.000000,0.123640,0.366064,0.177667,0.086260,0.052069,...,-0.011947,-0.011947,0.040914,-0.011947,-0.016738,-0.011947,-0.011947,-0.011947,0.038672,-0.011947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146684,-0.027328,-0.015020,-0.011447,-0.004763,-0.011947,-0.017275,-0.011248,-0.002579,-0.006932,-0.019007,...,-0.001499,-0.001499,-0.002093,-0.001499,-0.002100,1.000000,1.000000,-0.001499,-0.002597,-0.001499
146878,-0.027328,-0.015020,-0.011447,-0.004763,-0.011947,-0.017275,-0.011248,-0.002579,-0.006932,-0.019007,...,-0.001499,-0.001499,-0.002093,-0.001499,-0.002100,1.000000,1.000000,-0.001499,-0.002597,-0.001499
148238,-0.027328,0.082551,-0.011447,-0.004763,-0.011947,0.105883,0.130140,-0.002579,-0.006932,0.098554,...,-0.001499,1.000000,-0.002093,1.000000,0.599232,-0.001499,-0.001499,1.000000,-0.002597,1.000000
148626,0.026451,0.081235,0.029735,-0.008250,0.038672,0.109557,0.139296,-0.004468,0.151573,0.134155,...,-0.002597,-0.002597,0.780272,-0.002597,-0.003638,-0.002597,-0.002597,-0.002597,1.000000,-0.002597


__Check if there's any NaN in dfRate2cor__

In [76]:
# Check if there's any NaN 

dfRate2cor.isnull().sum().sum()

0

___"Widodo sangat menyukai film drama komedi, salah satunya bertajuk Being Flynn (2012)"___

__Find *'movieId'* for movie 'Being Flynn (2012)'__

In [203]:
dfMov[dfMov['title'] == 'Being Flynn (2012)']

Unnamed: 0,movieId,title,genres
9411,95816,Being Flynn (2012),Comedy|Drama


In [221]:
midWid =  dfMov[dfMov['title'] == 'Being Flynn (2012)']['movieId']
midWid = midWid.to_list()[0]

__Then put the *'movieId'* inside 'Widodo' list, here I assume Widodo rate the movie as 5 *(max rating)* because of the statement:__ _"Widodo sangat menyukai film drama komedi, salah satunya bertajuk Being Flynn (2012)"_

In [223]:
Widodo = [midWid, 5]

In [224]:
Widodo[0]

95816

__Finding similarity score based on correlation matrix, sort that into descending order__

In [225]:
score = dfRate2cor[Widodo[0]] * Widodo[1] 
score = score.sort_values(ascending=False)
score

movieId
7086     5.000000
4801     5.000000
6898     5.000000
87660    5.000000
53038    5.000000
           ...   
1080    -0.079633
1073    -0.094189
231     -0.094907
367     -0.096955
344     -0.103537
Name: 95816, Length: 10325, dtype: float64

In [249]:
type(score)

pandas.core.series.Series

__Get the index from 'score' series, make that into a list__

In [250]:
RecWidodo = score.index.to_list()

__Make a new DataFrame to contain movie recommendations for Widodo, use looping to put each movie recommendation into the DataFrame__
<br>_Here I don't limit the number of movie recommendation because I want to show all-genres recommendations & only 'drama comedy' recommendations below_

In [227]:
dfMWidodo = pd.DataFrame()
for i in RecWidodo:
    rec = dfMov[dfMov['movieId'] == i]
    dfMWidodo = pd.concat([dfMWidodo, rec], axis='rows')
dfMWidodo

Unnamed: 0,movieId,title,genres
5162,7086,Pygmalion (1938),Comedy|Drama
3755,4801,"Little Foxes, The (1941)",Drama
5020,6898,Sweet Sixteen (2002),Drama
9033,87660,Too Big to Fail (2011),Drama
7381,53038,Red Dust (1932),Drama
...,...,...,...
877,1080,Monty Python's Life of Brian (1979),Comedy
872,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
202,231,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
326,367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy


#### __Top 5 Movies Recommendation for Widodo (all genres):__

In [228]:
dfMWidodo[:5]

Unnamed: 0,movieId,title,genres
5162,7086,Pygmalion (1938),Comedy|Drama
3755,4801,"Little Foxes, The (1941)",Drama
5020,6898,Sweet Sixteen (2002),Drama
9033,87660,Too Big to Fail (2011),Drama
7381,53038,Red Dust (1932),Drama


#### __Top 5 Movies Recommendation for Widodo (only Drama Comedy genre):__

In [229]:
dfMWidodo[dfMWidodo['genres'] == 'Comedy|Drama'][:5]

Unnamed: 0,movieId,title,genres
5162,7086,Pygmalion (1938),Comedy|Drama
6054,26113,"Best Man, The (1964)",Comedy|Drama
10009,113064,"Trip to Italy, The (2014)",Comedy|Drama
7192,49115,"Boynton Beach Bereavement Club, The (2005)",Comedy|Drama
7113,47619,State of the Union (1948),Comedy|Drama
