In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy
10325,146878,Le Grand Restaurant (1966),Comedy
10326,148238,A Very Murray Christmas (2015),Comedy
10327,148626,The Big Short (2015),Drama


In [4]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


In [5]:
import re
for i, title in enumerate(movies['title']):
    year = re.findall('(\d{4})', title)
    if len(year) == 0:
        print(i)

10172
10322


In [6]:
movies.iloc[10322]

movieId                         146344
title      Elämältä kaiken sain (    )
genres                    Comedy|Drama
Name: 10322, dtype: object

## 1. content-based filtering berdasarkan genre

In [7]:
list_genres = []
for genres in movies['genres']:
    for genre in genres.split('|'):
        if genre not in list_genres:
            list_genres.append(genre)

In [8]:
list_genres

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Sci-Fi',
 'IMAX',
 'War',
 'Musical',
 'Documentary',
 'Western',
 'Film-Noir',
 '(no genres listed)']

- Avoid/drop (no genres listed) since it is not useful in building the recommendation and to recommend further (based on genre)

In [9]:
for genre in list_genres[:-1]:
    movies[genre] = movies['genres'].map(lambda x: 1 if genre in x else 0)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
cos_sim = cosine_similarity(np.array(movies[list_genres[:-1]]))
cos_sim.shape

(10329, 10329)

#### Rekomendasi 5 judul film

User : Joko sangat menyukai film bergenre animasi & action, terutama film Superman vs. The Elite (2012).

In [12]:
title = 'Superman vs. The Elite (2012)'
movies[movies['title'] == title]

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir
9370,94974,Superman vs. The Elite (2012),Action|Animation,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
i = int(movies[movies['title'] == title].index.values)
movies_sim = list(enumerate(cos_sim[i,:]))  # access specific movie cosine similarities by index
movies_rec = sorted(movies_sim, key=lambda x: x[1], reverse=True)  # sort by similarities value
                  
movies_rec[:6]

[(6260, 0.9999999999999998),
 (8637, 0.9999999999999998),
 (9370, 0.9999999999999998),
 (9570, 0.9999999999999998),
 (10167, 0.9999999999999998),
 (10277, 0.9999999999999998)]

In [14]:
count = 0
for j, _ in movies_rec:
    if j != i:
        print(movies.iloc[j]['title'])
        count += 1
        if count == 5:
            break

Street Fighter II: The Animated Movie (Sutorîto Faitâ II gekijô-ban) (1994)
Batman: Under the Red Hood (2010)
Batman: The Dark Knight Returns, Part 2 (2013)
Justice League: Throne of Atlantis (2015)
Justice League: Gods and Monsters (2015)


## 2. collaborative filtering

In [15]:
ratings = ratings.merge(movies, how='left', on='movieId', suffixes=('_ratings', '_movies'))

for genre in list_genres[:-1]:
    ratings[genre] = ratings['rating'] * ratings[genre]
    ratings[genre].replace({0.0:np.nan}, inplace=True)

In [16]:
columns = ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama',
             'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'IMAX', 'War',
             'Musical', 'Documentary', 'Western', 'Film-Noir', 'userId']

avg_ratings_by_user = ratings[columns].groupby('userId').mean()
genre_corr = avg_ratings_by_user.corr()

#### Rekomendasi 5 judul film

User : Widodo sangat menyukai film drama komedi, salah satunya bertajuk Being Flynn (2012).

In [18]:
title = 'Being Flynn (2012)'
rating = 5.0  # assumption, as he likes it much
movies[movies['title'] == title]

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir
9411,95816,Being Flynn (2012),Comedy|Drama,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [20]:
i = int(movies[movies['title'] == title].index.values)
genres = movies.loc[i,'genres'].split('|')

In [21]:
genres_rec = pd.DataFrame()
for genre in genres:
    score = genre_corr[genre] * rating
    genres_rec = genres_rec.append(score)
    
genres_score = genres_rec.sum().sort_values(ascending=False).to_dict()
genres_score

{'Drama': 8.666499533283869,
 'Comedy': 8.666499533283869,
 'Romance': 7.2566627149892255,
 'Thriller': 6.941716582855982,
 'Crime': 6.868234266082712,
 'Action': 6.594509627225763,
 'Sci-Fi': 6.233505403322585,
 'Adventure': 6.038085108438931,
 'Fantasy': 5.741797039475454,
 'War': 5.253735786668521,
 'Mystery': 5.218078253231532,
 'Children': 4.932782410935815,
 'Musical': 4.896878555305988,
 'Horror': 4.842240797625466,
 'IMAX': 4.8294180233561566,
 'Film-Noir': 4.475783088347134,
 'Animation': 4.450717147343225,
 'Documentary': 4.180189633442454,
 'Western': 4.167316311915473}

In [22]:
for genre in list_genres[:-1]:
    movies[genre] = movies[genre].map({1:genres_score[genre]})

In [23]:
movies['overall_score'] = movies[list_genres[:-1]].mean(axis=1)
movies.sort_values(by=['overall_score'], ascending=False)[:5][['title', 'genres']]

Unnamed: 0,title,genres
10327,The Big Short (2015),Drama
5761,"Super, The (1991)",Comedy
5771,Mamma Roma (1962),Drama
2193,Crimes of the Heart (1986),Comedy|Drama
2194,"Color Purple, The (1985)",Drama
