## Content Based Filtering

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv('movies.csv')

In [3]:
data.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
cv = CountVectorizer()
cm = cv.fit_transform(data['genres'])

In [5]:
cv.get_feature_names()

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'fi',
 'film',
 'genres',
 'horror',
 'imax',
 'listed',
 'musical',
 'mystery',
 'no',
 'noir',
 'romance',
 'sci',
 'thriller',
 'war',
 'western']

In [6]:
cm.toarray()

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
cm.toarray()[0]

array([0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [8]:
cosScore = cosine_similarity(cm.toarray())
cosScore

array([[1.        , 0.77459667, 0.31622777, ..., 0.4472136 , 0.        ,
        0.        ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.70710678, 0.        ,
        0.        ],
       ...,
       [0.4472136 , 0.        , 0.70710678, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [9]:
## jika user sangat menyukai film Waiting to Exhale (1995)

data[data['title']=='Waiting to Exhale (1995)']

Unnamed: 0,movieId,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance


In [10]:
## ambil index nya

user_suka = data[data['title']=='Waiting to Exhale (1995)'].index[0]
user_suka

3

In [11]:
similarFilm = list(enumerate(cosScore[user_suka]))
similarFilm

[(0, 0.25819888974716115),
 (1, 0.0),
 (2, 0.816496580927726),
 (3, 1.0000000000000002),
 (4, 0.5773502691896258),
 (5, 0.0),
 (6, 0.816496580927726),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 1.0000000000000002),
 (11, 0.408248290463863),
 (12, 0.0),
 (13, 0.5773502691896258),
 (14, 0.3333333333333334),
 (15, 0.408248290463863),
 (16, 0.816496580927726),
 (17, 0.5773502691896258),
 (18, 0.5773502691896258),
 (19, 0.5163977794943223),
 (20, 0.3333333333333334),
 (21, 0.25819888974716115),
 (22, 0.0),
 (23, 0.3333333333333334),
 (24, 0.816496580927726),
 (25, 0.5773502691896258),
 (26, 0.408248290463863),
 (27, 0.816496580927726),
 (28, 0.2357022603955159),
 (29, 0.408248290463863),
 (30, 0.5773502691896258),
 (31, 0.0),
 (32, 0.3333333333333334),
 (33, 0.408248290463863),
 (34, 0.816496580927726),
 (35, 0.408248290463863),
 (36, 0.816496580927726),
 (37, 0.5773502691896258),
 (38, 0.408248290463863),
 (39, 0.3333333333333334),
 (40, 0.5773502691896258),
 (41, 0.0),
 (42, 0.666666666666666

In [12]:
similarFilm = sorted(similarFilm, key=lambda x: x[1], reverse=True)
similarFilm

[(3, 1.0000000000000002),
 (10, 1.0000000000000002),
 (48, 1.0000000000000002),
 (53, 1.0000000000000002),
 (86, 1.0000000000000002),
 (168, 1.0000000000000002),
 (195, 1.0000000000000002),
 (203, 1.0000000000000002),
 (250, 1.0000000000000002),
 (311, 1.0000000000000002),
 (320, 1.0000000000000002),
 (331, 1.0000000000000002),
 (482, 1.0000000000000002),
 (485, 1.0000000000000002),
 (631, 1.0000000000000002),
 (659, 1.0000000000000002),
 (682, 1.0000000000000002),
 (693, 1.0000000000000002),
 (715, 1.0000000000000002),
 (719, 1.0000000000000002),
 (730, 1.0000000000000002),
 (737, 1.0000000000000002),
 (940, 1.0000000000000002),
 (995, 1.0000000000000002),
 (996, 1.0000000000000002),
 (1003, 1.0000000000000002),
 (1006, 1.0000000000000002),
 (1036, 1.0000000000000002),
 (1099, 1.0000000000000002),
 (1180, 1.0000000000000002),
 (1238, 1.0000000000000002),
 (1241, 1.0000000000000002),
 (1309, 1.0000000000000002),
 (1401, 1.0000000000000002),
 (1461, 1.0000000000000002),
 (1463, 1.000000

In [13]:
## ambil index dati hasil yang sudah disort untuk mengambil film nya 

a = []
for i in similarFilm:
    a.append(data.iloc[i[0]])

In [14]:
a

[movieId                           4
 title      Waiting to Exhale (1995)
 genres         Comedy|Drama|Romance
 Name: 3, dtype: object,
 movieId                                11
 title      American President, The (1995)
 genres               Comedy|Drama|Romance
 Name: 10, dtype: object,
 movieId                         52
 title      Mighty Aphrodite (1995)
 genres        Comedy|Drama|Romance
 Name: 48, dtype: object,
 movieId                                   58
 title      Postman, The (Postino, Il) (1994)
 genres                  Comedy|Drama|Romance
 Name: 53, dtype: object,
 movieId                        94
 title      Beautiful Girls (1996)
 genres       Comedy|Drama|Romance
 Name: 86, dtype: object,
 movieId                               195
 title      Something to Talk About (1995)
 genres               Comedy|Drama|Romance
 Name: 168, dtype: object,
 movieId                        224
 title      Don Juan DeMarco (1995)
 genres        Comedy|Drama|Romance
 Name: 195, dtyp

In [15]:
## masukan ke dataframe

hasil = pd.DataFrame(a)
hasil

Unnamed: 0,movieId,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
10,11,"American President, The (1995)",Comedy|Drama|Romance
48,52,Mighty Aphrodite (1995),Comedy|Drama|Romance
53,58,"Postman, The (Postino, Il) (1994)",Comedy|Drama|Romance
86,94,Beautiful Girls (1996),Comedy|Drama|Romance
...,...,...,...
10312,142566,Sarusuberi: Miss Hokusai (2015),Animation
10313,142973,Ice and the Sky (2015),Documentary
10317,143709,The Take (2009),(no genres listed)
10321,144976,Bone Tomahawk (2015),Horror|Western


In [16]:
## ambil 10 film rekomendasi terbaik

hasil.head(10)

Unnamed: 0,movieId,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
10,11,"American President, The (1995)",Comedy|Drama|Romance
48,52,Mighty Aphrodite (1995),Comedy|Drama|Romance
53,58,"Postman, The (Postino, Il) (1994)",Comedy|Drama|Romance
86,94,Beautiful Girls (1996),Comedy|Drama|Romance
168,195,Something to Talk About (1995),Comedy|Drama|Romance
195,224,Don Juan DeMarco (1995),Comedy|Drama|Romance
203,232,Eat Drink Man Woman (Yin shi nan nu) (1994),Comedy|Drama|Romance
250,281,Nobody's Fool (1994),Comedy|Drama|Romance
311,351,"Corrina, Corrina (1994)",Comedy|Drama|Romance
