In [1]:
import os
import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [2]:
path = './data/ml-latest-small/'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Representation using tf-idf

## Genre Representation

In [6]:
unique_genres = list(set([gr for grs in list(map(lambda x: x.split('|'), movies_df['genres'])) for gr in grs]))
total_genres = len(unique_genres)
total_movies = len(movies_df)

print(f'num of moives: {total_movies}\nnum of genres: {total_genres}')

num of moives: 9742
num of genres: 20


In [7]:
genre_count = dict.fromkeys(unique_genres)

for genre_list in movies_df['genres']:
    for genre in genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] += 1
            
print(genre_count)

{'Action': 1828, 'Film-Noir': 87, 'Adventure': 1263, 'Romance': 1596, 'Musical': 334, 'Western': 167, 'IMAX': 158, 'Fantasy': 779, 'Crime': 1199, 'Documentary': 440, 'Drama': 4361, 'Sci-Fi': 980, 'Thriller': 1894, 'Animation': 611, 'Horror': 978, '(no genres listed)': 34, 'Mystery': 573, 'Comedy': 3756, 'Children': 664, 'War': 382}


In [8]:
'''
Only idf is used because the genre appears once for each movie.
'''
for genre in genre_count:
    genre_count[genre] = np.log10(total_movies/genre_count[genre])
    
print(genre_count)

{'Action': 0.7266719338379385, 'Film-Noir': 2.0491288726171324, 'Adventure': 0.8872447746804204, 'Romance': 0.7856152382210405, 'Musical': 1.4649016584241867, 'Western': 1.7659316540881678, 'IMAX': 1.7899910382813284, 'Fantasy': 1.0971106675631865, 'Crime': 0.9098289421369025, 'Documentary': 1.3451954487495636, 'Drama': 0.3490620385623247, 'Sci-Fi': 0.9974220495432563, 'Thriller': 0.7112681505684965, 'Animation': 1.2026069149931968, 'Horror': 0.9983092704481497, '(no genres listed)': 2.457169208193496, 'Mystery': 1.2304935032683613, 'Comedy': 0.4139225416416778, 'Children': 1.1664800458677336, 'War': 1.4065847623240424}


In [9]:
%%time
'''
Create Genre Representation
It takes about 40 seconds
'''
genre_representation = pd.DataFrame(columns=sorted(unique_genres), index=movies_df.index)

for index, row in movies_df.iterrows():
    dict_temp = {g: genre_count[g] for g in row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation.head()

Wall time: 38.2 s


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,


## Tag Representation
- TfidfVectorizer (wrong case)
- Module not used (only use python)

In [10]:
tag_list = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([t for tags in tag_list for t in tags])))))
total_tags = len(unique_tags)

print(f'num of tags:{len(tag_list)} \nnum of unique tags: {total_tags}')

num of tags:3683 
num of unique tags: 1589


In [11]:
start_id = 60756
corpus = ['']
cnt = 0
for _, rows in tags_df[['movieId','tag']].iterrows():
    m_id, tag = rows
    if start_id == m_id:
        corpus[cnt] = corpus[cnt] + ' ' + tag
    else:
        corpus.append('')
        start_id = m_id
        cnt+=1
        corpus[cnt] = corpus[cnt] + ' ' + tag
        
#print(corpus)

In [12]:
'''
using TfidfVectorizer
'''

from sklearn.feature_extraction.text import TfidfVectorizer

tag_representation = TfidfVectorizer().fit_transform(corpus)

In [13]:
tag_representation

<1771x1744 sparse matrix of type '<class 'numpy.float64'>'
	with 5434 stored elements in Compressed Sparse Row format>

<b>tags are not in word units<b> <br>
TfidfVectorizer splits tags made up of sentences and uses them.<br> 
Therefore, it does not match the existing number of unique tags.<br>
    
- origin num of unique tags : 1589
- TfidfVectorizer tags : 1744

In [14]:
'''
Compute Idf for tag (Module not used)
'''
movie_tag_count = len(set(tags_df['movieId']))
tag_count_dict = dict.fromkeys(unique_tags)

for movie_tag_list in tags_df['tag']:
    for tag in movie_tag_list.split(','):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1
            
tag_idf = dict()
for tag in tag_count_dict:
    tag_idf[tag] = np.log10(movie_tag_count / tag_count_dict[tag])

#print(tag_idf)

In [15]:
%%time
'''
Create Tag Representation
It takes about 4 minutes
'''
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tags_df.groupby(by='movieId'):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for tag_list in temp_list for tag in tag_list])))))
    
    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)
    
tag_representation = tag_representation.sort_index(0)

Wall time: 3min 33s


In [16]:
tag_representation

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


## Concat the Representation

In [17]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)

In [18]:
movie_representation

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
%%time
def custom_cosine_similarity(x, mode='similarity'):
    if mode == 'similarity':
        return np.dot(x, x.T) / (np.linalg.norm(x, axis=1)[:, np.newaxis] * np.linalg.norm(x, axis=1)[np.newaxis, :])
    if mode == 'distance':
        return 1 - np.dot(x, x.T) / (np.linalg.norm(x, axis=1)[:, np.newaxis] * np.linalg.norm(x, axis=1)[np.newaxis, :])
    
result  = custom_cosine_similarity(movie_representation)
result_df = pd.DataFrame(data=result, index=[movie_representation.index])

Wall time: 1.95 s


In [20]:
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.000000,0.124438,0.008403,0.040571,0.011755,0.000000,0.016339,0.331122,0.000000,0.131794,...,0.064466,0.260941,0.071492,0.271710,0.0,0.348295,0.379492,0.000000,0.232553,0.093519
2,0.124438,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.240843,0.000000,0.095861,...,0.000000,0.000000,0.000000,0.000000,0.0,0.108082,0.117763,0.000000,0.000000,0.000000
3,0.008403,0.000000,1.000000,0.179391,0.011294,0.000000,0.072246,0.000000,0.000000,0.000000,...,0.006560,0.000000,0.068686,0.000000,0.0,0.020322,0.022142,0.000000,0.000000,0.089849
4,0.040571,0.000000,0.179391,1.000000,0.054530,0.000000,0.348828,0.000000,0.000000,0.000000,...,0.031674,0.101979,0.567487,0.000000,0.0,0.098119,0.106908,0.365843,0.000000,0.433821
5,0.011755,0.000000,0.011294,0.054530,1.000000,0.000000,0.640342,0.000000,0.000000,0.000000,...,0.009177,0.000000,0.096091,0.000000,0.0,0.028429,0.030976,0.000000,0.000000,0.125697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.348295,0.108082,0.020322,0.098119,0.028429,0.211466,0.039515,0.000000,0.397065,0.213809,...,0.206804,0.631077,0.172901,0.657123,0.0,1.000000,0.917791,0.000000,0.767770,0.226174
193583,0.379492,0.117763,0.022142,0.106908,0.030976,0.000000,0.043055,0.000000,0.000000,0.000000,...,0.169874,0.687605,0.188388,0.715984,0.0,0.917791,1.000000,0.000000,0.612800,0.246433
193585,0.000000,0.000000,0.000000,0.365843,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278750,0.644671,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.232553,0.000000,0.000000,0.000000,0.000000,0.275428,0.000000,0.000000,0.517166,0.278480,...,0.247849,0.821961,0.000000,0.855885,0.0,0.767770,0.612800,0.000000,1.000000,0.000000


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
'''
Same result as custom function
'''


def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    cs_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return cs_df


cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.000000,0.124438,0.008403,0.040571,0.011755,0.000000,0.016339,0.331122,0.000000,0.131794,...,0.064466,0.260941,0.071492,0.271710,0.0,0.348295,0.379492,0.000000,0.232553,0.093519
2,0.124438,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.240843,0.000000,0.095861,...,0.000000,0.000000,0.000000,0.000000,0.0,0.108082,0.117763,0.000000,0.000000,0.000000
3,0.008403,0.000000,1.000000,0.179391,0.011294,0.000000,0.072246,0.000000,0.000000,0.000000,...,0.006560,0.000000,0.068686,0.000000,0.0,0.020322,0.022142,0.000000,0.000000,0.089849
4,0.040571,0.000000,0.179391,1.000000,0.054530,0.000000,0.348828,0.000000,0.000000,0.000000,...,0.031674,0.101979,0.567487,0.000000,0.0,0.098119,0.106908,0.365843,0.000000,0.433821
5,0.011755,0.000000,0.011294,0.054530,1.000000,0.000000,0.640342,0.000000,0.000000,0.000000,...,0.009177,0.000000,0.096091,0.000000,0.0,0.028429,0.030976,0.000000,0.000000,0.125697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.348295,0.108082,0.020322,0.098119,0.028429,0.211466,0.039515,0.000000,0.397065,0.213809,...,0.206804,0.631077,0.172901,0.657123,0.0,1.000000,0.917791,0.000000,0.767770,0.226174
193583,0.379492,0.117763,0.022142,0.106908,0.030976,0.000000,0.043055,0.000000,0.000000,0.000000,...,0.169874,0.687605,0.188388,0.715984,0.0,0.917791,1.000000,0.000000,0.612800,0.246433
193585,0.000000,0.000000,0.000000,0.365843,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278750,0.644671,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.232553,0.000000,0.000000,0.000000,0.000000,0.275428,0.000000,0.000000,0.517166,0.278480,...,0.247849,0.821961,0.000000,0.855885,0.0,0.767770,0.612800,0.000000,1.000000,0.000000


In [22]:
print(cs_df[1].sort_values(ascending=False))

2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [23]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[126142])
print(movies_df.loc[2043])
print(movies_df.loc[2399])

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     The Cave of the Golden Rose (1991)
genres            Adventure|Children|Fantasy
Name: 126142, dtype: object
title     Darby O'Gill and the Little People (1959)
genres                   Adventure|Children|Fantasy
Name: 2043, dtype: object
title     Santa Claus: The Movie (1985)
genres       Adventure|Children|Fantasy
Name: 2399, dtype: object


## ContentBasedProfile(Just Test)

- make an user profile(user-featrue)
- make an item profile(item-feature)
cosine similarity(user_profile , item_profile) -> recommend

In [76]:
sparse_matrix = ratings_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x['userId'])).unstack()
sparse_matrix.index.name = 'movieId'
sparse_matrix = sparse_matrix.fillna(0)
sparse_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
movie_representation

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
'''
Matching the number of movies in the movie representation and the number of movies in the rating matirx are different.
'''
s_index = set(sparse_matrix.index)
m_index = set(movie_representation.index)
diff_index = list(set.difference(m_index, s_index))

for i in movie_representation.index:
    if i in diff_index:
        idx = list(movie_representation.index).index(i)
        movie_representation = movie_representation.drop([movie_representation.index[idx]])
        
movie_representation

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
sparse_matrix.T

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
user_profile = np.dot(sparse_matrix.T, movie_representation)

In [89]:
final_predict = cosine_similarity(user_profile, movie_representation)
final_predict = pd.DataFrame(data=final_predict, index=[sparse_matrix.T.index], columns=sparse_matrix.T.columns)
final_predict.shape

(610, 9724)

In [90]:
final_predict.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.326487,0.167216,0.058691,0.224195,0.023591,0.452139,0.070058,0.482399,0.361051,0.581497,...,0.149534,0.2389,0.232004,0.208902,0.0,0.492775,0.380712,0.13732,0.36552,0.187684
2,0.021095,0.010315,0.010488,0.104404,0.009293,0.341952,0.020394,0.042829,0.201637,0.243567,...,0.071558,0.040964,0.151255,0.0,0.111551,0.096784,0.018219,0.146957,0.10428,0.07393
3,0.119213,0.078649,0.007407,0.052715,0.013282,0.339685,0.025666,0.203916,0.401897,0.530236,...,0.182455,0.038466,0.06137,0.026605,0.0,0.284594,0.136212,0.046333,0.230618,0.041207
4,0.167581,0.082826,0.091503,0.564427,0.050355,0.332315,0.187284,0.211836,0.13855,0.332902,...,0.079139,0.157103,0.481389,0.066302,0.024721,0.282835,0.248228,0.335173,0.1284,0.347058
5,0.221879,0.07053,0.026343,0.176445,0.010984,0.205454,0.051225,0.196155,0.082602,0.154752,...,0.062072,0.159433,0.153587,0.126938,0.0,0.213407,0.196786,0.134624,0.151364,0.087381


In [119]:
final_predict.iloc[0].sort_values(ascending=False)[:10]

movieId
546       0.699974
27155     0.694434
26590     0.694434
26340     0.686178
51939     0.686178
164226    0.677864
2005      0.674516
3440      0.674516
79139     0.674516
117646    0.670261
Name: (1,), dtype: float64

In [122]:
ratings_df[ratings_df['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
227,1,3744,4.0,964980694
228,1,3793,5.0,964981855
229,1,3809,4.0,964981220
230,1,4006,4.0,964982903


In [118]:
movies_df.loc[ratings_df[ratings_df['userId']==1]['movieId']]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
6,Heat (1995),Action|Crime|Thriller
47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...
3744,Shaft (2000),Action|Crime|Thriller
3793,X-Men (2000),Action|Adventure|Sci-Fi
3809,What About Bob? (1991),Comedy
4006,Transformers: The Movie (1986),Adventure|Animation|Children|Sci-Fi


In [121]:
movies_df.loc[final_predict.iloc[0].sort_values(ascending=False)[:10].index]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
546,Super Mario Bros. (1993),Action|Adventure|Children|Comedy|Fantasy|Sci-Fi
27155,"Batman/Superman Movie, The (1998)",Action|Adventure|Animation|Children|Fantasy|Sc...
26590,G.I. Joe: The Movie (1987),Action|Adventure|Animation|Children|Fantasy|Sc...
26340,"Twelve Tasks of Asterix, The (Les douze travau...",Action|Adventure|Animation|Children|Comedy|Fan...
51939,TMNT (Teenage Mutant Ninja Turtles) (2007),Action|Adventure|Animation|Children|Comedy|Fan...
164226,Maximum Ride (2016),Action|Adventure|Comedy|Fantasy|Sci-Fi|Thriller
2005,"Goonies, The (1985)",Action|Adventure|Children|Comedy|Fantasy
3440,Teenage Mutant Ninja Turtles III (1993),Action|Adventure|Children|Comedy|Fantasy
79139,"Sorcerer's Apprentice, The (2010)",Action|Adventure|Children|Comedy|Fantasy
117646,Dragonheart 2: A New Beginning (2000),Action|Adventure|Comedy|Drama|Fantasy|Thriller


솔직히 추천 결과가 좋은지 잘 모르겠다. 정량적인 지표가 필요할듯<br>
나중에 아래쪽 evaluation 에서 코드 가져와서 테스트 해봐야 할 것 같다.<br>

## Evaluation

In [31]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)
test_userids = list(set(test_df.userId.values))

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [123]:
%%time
result_df = pd.DataFrame()

for user_id in test_userids:
    user_record_df = train_df.loc[train_df.userId == int(user_id), :] 
    
    user_sim_df = cs_df.loc[user_record_df['movieId']] 
    user_rating_df = user_record_df[['rating']]  # (n, 1)
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)  # (9742, 1)
    
    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1)
    
    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

Wall time: 7.59 s


In [124]:
result_df

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,151,4.293969,1,5.0,964984041
1,423,4.082067,1,3.0,964982363
2,596,4.182728,1,5.0,964982838
3,673,4.147920,1,3.0,964981775
4,1029,4.264604,1,5.0,964982855
...,...,...,...,...,...
253,152077,3.404142,610,4.0,1493845817
254,156371,3.400804,610,5.0,1479542831
255,158238,3.569544,610,5.0,1479545219
256,160341,3.477674,610,2.5,1479545749
