In [10]:
import pandas as pd
from pandas import read_csv, isnull, notnull
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def get_dataframe_movies_csv(text):
    """
    đọc file csv của movilens, lưu thành dataframe với 3 cột user id, title, genres
    """
    movie_cols = ['movie_id', 'title', 'genres']
    movies = pandas.read_csv(text, sep=',', names=movie_cols, encoding='latin-1')
    return movies

In [3]:
df = pd.read_csv('ml-latest-small/movies.csv')

In [4]:
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
dftag = pd.read_csv('ml-latest-small/tags.csv')
dftag

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [8]:
movie_cols = ['movie_id', 'title', 'genres']
movies = pd.read_csv('ml-latest-small/movies.csv', sep=',', names=movie_cols, encoding='latin-1')

In [9]:
movies

Unnamed: 0,movie_id,title,genres
0,movieId,title,genres
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,2,Jumanji (1995),Adventure|Children|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama|Romance
...,...,...,...
9738,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9739,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9740,193585,Flint (2017),Drama
9741,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [12]:
"""
        Dùng hàm "TfidfVectorizer" để chuẩn hóa "genres" với:
        + analyzer='word': chọn đơn vị trích xuất là word
        + ngram_range=(1, 1): mỗi lần trích xuất 1 word
        + min_df=0: tỉ lệ word không đọc được là 0
        Lúc này ma trận trả về với số dòng tương ứng với số lượng film và số cột tương ứng với số từ được tách ra từ "genres"
    """
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0)
new_tfidf_matrix = tf.fit_transform(movies['genres'])

In [13]:
new_tfidf_matrix

<9743x24 sparse matrix of type '<class 'numpy.float64'>'
	with 23220 stored elements in Compressed Sparse Row format>

In [14]:
movies['genres'] = movies['genres'].str.split('|')
movies['genres'] = movies['genres'].fillna("").astype('str')

In [15]:
movies

Unnamed: 0,movie_id,title,genres
0,movieId,title,['genres']
1,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
2,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']"
3,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
4,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']"
...,...,...,...
9738,193581,Black Butler: Book of the Atlantic (2017),"['Action', 'Animation', 'Comedy', 'Fantasy']"
9739,193583,No Game No Life: Zero (2017),"['Animation', 'Comedy', 'Fantasy']"
9740,193585,Flint (2017),['Drama']
9741,193587,Bungo Stray Dogs: Dead Apple (2018),"['Action', 'Animation']"


In [16]:
tfidf_matrix = tf.fit_transform(movies['genres'])

In [17]:
tfidf_matrix

<9743x24 sparse matrix of type '<class 'numpy.float64'>'
	with 23220 stored elements in Compressed Sparse Row format>

In [18]:
cosine_sim  =  linear_kernel(tfidf_matrix, tfidf_matrix)

In [19]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.81357698, ..., 0.        , 0.42103424,
        0.2675923 ],
       [0.        , 0.81357698, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.42103424, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.2675923 , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [50]:
# Input title 

titles = movies['title']
genres = movies['genres']
indices = pd.Series(movies.index, index=movies['title'])


In [22]:
indices

title
title                                           0
Toy Story (1995)                                1
Jumanji (1995)                                  2
Grumpier Old Men (1995)                         3
Waiting to Exhale (1995)                        4
                                             ... 
Black Butler: Book of the Atlantic (2017)    9738
No Game No Life: Zero (2017)                 9739
Flint (2017)                                 9740
Bungo Stray Dogs: Dead Apple (2018)          9741
Andrew Dice Clay: Dice Rules (1991)          9742
Length: 9743, dtype: int64

In [47]:
idx = indices['Tom and Huck (1995)']
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:23 + 1]
movie_indices = [i[0] for i in sim_scores]

In [51]:
sim_scores, titles.iloc[movie_indices].values, genres.iloc[movie_indices].values

([(120, 1.0),
  (132, 1.0),
  (205, 1.0),
  (422, 1.0),
  (522, 1.0),
  (581, 1.0),
  (636, 1.0),
  (643, 1.0),
  (752, 1.0),
  (776, 1.0),
  (1186, 1.0),
  (1344, 1.0),
  (1525, 1.0),
  (1526, 1.0),
  (1543, 1.0),
  (2504, 1.0),
  (2735, 1.0),
  (2736, 1.0),
  (2966, 1.0),
  (3918, 1.0),
  (4236, 1.0),
  (4374, 1.0),
  (142, 0.9355643656853467)],
 array(['Amazing Panda Adventure, The (1995)', 'Casper (1995)',
        'Far From Home: The Adventures of Yellow Dog (1995)',
        'Lassie (1994)', 'Homeward Bound II: Lost in San Francisco (1996)',
        'Flipper (1996)', 'Alaska (1996)',
        'Adventures of Pinocchio, The (1996)', 'Fly Away Home (1996)',
        'Swiss Family Robinson (1960)', 'Wild America (1997)',
        "Barney's Great Adventure (1998)",
        'In Search of the Castaways (1962)',
        'Incredible Journey, The (1963)',
        'Journey of Natty Gann, The (1985)',
        'Charlie, the Lonesome Cougar (1967)', 'Benji (1974)',
        'Benji the Hunted (1987)'

In [33]:
movies[['movie_id', 'title', 'genres']]

Unnamed: 0,movie_id,title,genres
0,movieId,title,['genres']
1,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
2,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']"
3,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
4,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']"
...,...,...,...
9738,193581,Black Butler: Book of the Atlantic (2017),"['Action', 'Animation', 'Comedy', 'Fantasy']"
9739,193583,No Game No Life: Zero (2017),"['Animation', 'Comedy', 'Fantasy']"
9740,193585,Flint (2017),['Drama']
9741,193587,Bungo Stray Dogs: Dead Apple (2018),"['Action', 'Animation']"


In [45]:
movies.loc[0:143]

Unnamed: 0,movie_id,title,genres
0,movieId,title,['genres']
1,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
2,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']"
3,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
4,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']"
...,...,...,...
139,165,Die Hard: With a Vengeance (1995),"['Action', 'Crime', 'Thriller']"
140,166,"Doom Generation, The (1995)","['Comedy', 'Crime', 'Drama']"
141,168,First Knight (1995),"['Action', 'Drama', 'Romance']"
142,169,Free Willy 2: The Adventure Home (1995),"['Adventure', 'Children', 'Drama']"
