In [1]:
import os
import tqdm
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix

In [6]:
path = './data/ml-latest-small/'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [10]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Representation using tf-idf

## Genre Representation

In [65]:
unique_genres = list(set([gr for grs in list(map(lambda x: x.split('|'), movies_df['genres'])) for gr in grs]))
total_genres = len(unique_genres)
total_movies = len(movies_df)

print(f'num of moives: {total_movies}\nnum of genres: {total_genres}')

num of moives: 9742
num of genres: 20


In [153]:
genre_count = dict.fromkeys(unique_genres)

for genre_list in movies_df['genres']:
    for genre in genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] += 1
            
print(genre_count)

{'Animation': 611, 'Romance': 1596, 'Action': 1828, 'Documentary': 440, 'Western': 167, 'War': 382, 'Fantasy': 779, 'Sci-Fi': 980, 'Children': 664, 'Drama': 4361, 'Thriller': 1894, '(no genres listed)': 34, 'IMAX': 158, 'Comedy': 3756, 'Crime': 1199, 'Horror': 978, 'Musical': 334, 'Film-Noir': 87, 'Mystery': 573, 'Adventure': 1263}


In [154]:
'''
Only idf is used because the genre appears once for each movie.
'''
for genre in genre_count:
    genre_count[genre] = np.log10(total_movies/genre_count[genre])
    
print(genre_count)

{'Animation': 1.2026069149931968, 'Romance': 0.7856152382210405, 'Action': 0.7266719338379385, 'Documentary': 1.3451954487495636, 'Western': 1.7659316540881678, 'War': 1.4065847623240424, 'Fantasy': 1.0971106675631865, 'Sci-Fi': 0.9974220495432563, 'Children': 1.1664800458677336, 'Drama': 0.3490620385623247, 'Thriller': 0.7112681505684965, '(no genres listed)': 2.457169208193496, 'IMAX': 1.7899910382813284, 'Comedy': 0.4139225416416778, 'Crime': 0.9098289421369025, 'Horror': 0.9983092704481497, 'Musical': 1.4649016584241867, 'Film-Noir': 2.0491288726171324, 'Mystery': 1.2304935032683613, 'Adventure': 0.8872447746804204}


In [155]:
%%time
'''
Create Genre Representation
It takes about 40 seconds
'''
genre_representation = pd.DataFrame(columns=sorted(unique_genres), index=movies_df.index)

for index, row in movies_df.iterrows():
    dict_temp = {g: genre_count[g] for g in row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation.head()

Wall time: 42 s


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,


## Tag Representation
- TfidfVectorizer (wrong case)
- Module not used (only use python)

In [156]:
tag_list = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([t for tags in tag_list for t in tags])))))
total_tags = len(unique_tags)

print(f'num of tags:{len(tag_list)} \nnum of unique tags: {total_tags}')

num of tags:3683 
num of unique tags: 1589


In [157]:
start_id = 60756
corpus = ['']
cnt = 0
for _, rows in tags_df[['movieId','tag']].iterrows():
    m_id, tag = rows
    if start_id == m_id:
        corpus[cnt] = corpus[cnt] + ' ' + tag
    else:
        corpus.append('')
        start_id = m_id
        cnt+=1
        corpus[cnt] = corpus[cnt] + ' ' + tag
        
print(corpus)

[' funny Highly quotable will ferrell', ' Boxing story MMA Tom Hardy', ' drugs Leonardo DiCaprio Martin Scorsese', ' way too long', ' Al Pacino gangster mafia', ' Al Pacino Mafia', ' holocaust true story', ' twist ending', ' Anthony Hopkins courtroom drama twist ending', ' britpop indie record label music', ' dumpster diving Sustainability', ' romantic comedy wedding', ' painter', ' bloody', ' black hole sci-fi time-travel', ' fantasy magic board game Robin Williams', ' beautiful scenery epic historical inspirational Medieval mel gibson Oscar (Best Cinematography) revenge sword fight', ' black comedy Christina Ricci Christopher Lloyd dark comedy family gothic', ' Al Pacino Andy Garcia Classic Francis Ford Coppola mafia', ' black comedy Christina Ricci Christopher Lloyd Family gothic quirky', ' family funny Macaulay Culkin sequel', ' animation Disney funny original Pixar sequel Tom Hanks', ' ancient Rome Epic history imdb top 250 revenge Rome Russell Crowe', ' Ed Harris Jude Law Rachel 

In [158]:
'''
using TfidfVectorizer
'''

from sklearn.feature_extraction.text import TfidfVectorizer

tag_representation = TfidfVectorizer().fit_transform(corpus)

In [159]:
tag_representation

<1771x1744 sparse matrix of type '<class 'numpy.float64'>'
	with 5434 stored elements in Compressed Sparse Row format>

<b>tags are not in word units<b> <br>
TfidfVectorizer splits tags made up of sentences and uses them.<br> 
Therefore, it does not match the existing number of unique tags.<br>
    
- origin num of unique tags : 1589
- TfidfVectorizer tags : 1744

In [160]:
'''
Compute Idf for tag (Module not used)
'''
movie_tag_count = len(set(tags_df['movieId']))
tag_count_dict = dict.fromkeys(unique_tags)

for movie_tag_list in tags_df['tag']:
    for tag in movie_tag_list.split(','):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1
            
tag_idf = dict()
for tag in tag_count_dict:
    tag_idf[tag] = np.log10(movie_tag_count / tag_count_dict[tag])

print(tag_idf)

{'cerebral': 2.4183012913197452, 'narrated': 2.895422546039408, 'Modern war': 3.196452541703389, 'Cult classic': 3.196452541703389, 'ethics': 3.196452541703389, 'Great performances': 3.196452541703389, 'Robert Penn Warren': 3.196452541703389, 'ironic': 2.895422546039408, 'Steve Carell': 2.7193312869837265, 'menacing': 3.196452541703389, 'Gary Oldman': 3.196452541703389, 'immortality': 3.196452541703389, 'scary': 2.895422546039408, 'technology': 3.196452541703389, 'Funny': 3.196452541703389, 'Horrible directing': 3.196452541703389, 'jay and silent bob': 3.196452541703389, 'characters': 2.895422546039408, 'Wesley Snipes': 3.196452541703389, 'subway': 3.196452541703389, 'butler': 3.196452541703389, 'spacecraft': 3.196452541703389, 'Doc Ock': 3.196452541703389, 'M. Night Shyamalan': 2.895422546039408, 'creepy': 2.2422100322640643, 'violent': 2.5943925503754266, 'comics': 3.196452541703389, 'President': 3.196452541703389, 'psychedelic': 2.7193312869837265, 'Gal Gadot': 3.196452541703389, 'g

In [161]:
%%time
'''
Create Tag Representation
It takes about 4 minutes
'''
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tags_df.groupby(by='movieId'):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for tag_list in temp_list for tag in tag_list])))))
    
    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)
    
tag_representation = tag_representation.sort_index(0)

Wall time: 3min 40s


In [162]:
tag_representation

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


## Concat the Representation

In [163]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)

In [164]:
movie_representation

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [217]:
%%time
def custom_cosine_similarity(x, mode='similarity'):
    if mode == 'similarity':
        return np.dot(x, x.T) / (np.linalg.norm(x, axis=1)[:, np.newaxis] * np.linalg.norm(x, axis=1)[np.newaxis, :])
    if mode == 'distance':
        return 1 - np.dot(x, x.T) / (np.linalg.norm(x, axis=1)[:, np.newaxis] * np.linalg.norm(x, axis=1)[np.newaxis, :])
    
result  = custom_cosine_similarity(movie_representation)
result_df = pd.DataFrame(data=result, index=[movie_representation.index])

Wall time: 1.85 s


In [219]:
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.000000,0.124438,0.008403,0.040571,0.011755,0.000000,0.016339,0.331122,0.000000,0.131794,...,0.064466,0.260941,0.071492,0.271710,0.0,0.348295,0.379492,0.000000,0.232553,0.093519
2,0.124438,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.240843,0.000000,0.095861,...,0.000000,0.000000,0.000000,0.000000,0.0,0.108082,0.117763,0.000000,0.000000,0.000000
3,0.008403,0.000000,1.000000,0.179391,0.011294,0.000000,0.072246,0.000000,0.000000,0.000000,...,0.006560,0.000000,0.068686,0.000000,0.0,0.020322,0.022142,0.000000,0.000000,0.089849
4,0.040571,0.000000,0.179391,1.000000,0.054530,0.000000,0.348828,0.000000,0.000000,0.000000,...,0.031674,0.101979,0.567487,0.000000,0.0,0.098119,0.106908,0.365843,0.000000,0.433821
5,0.011755,0.000000,0.011294,0.054530,1.000000,0.000000,0.640342,0.000000,0.000000,0.000000,...,0.009177,0.000000,0.096091,0.000000,0.0,0.028429,0.030976,0.000000,0.000000,0.125697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.348295,0.108082,0.020322,0.098119,0.028429,0.211466,0.039515,0.000000,0.397065,0.213809,...,0.206804,0.631077,0.172901,0.657123,0.0,1.000000,0.917791,0.000000,0.767770,0.226174
193583,0.379492,0.117763,0.022142,0.106908,0.030976,0.000000,0.043055,0.000000,0.000000,0.000000,...,0.169874,0.687605,0.188388,0.715984,0.0,0.917791,1.000000,0.000000,0.612800,0.246433
193585,0.000000,0.000000,0.000000,0.365843,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278750,0.644671,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.232553,0.000000,0.000000,0.000000,0.000000,0.275428,0.000000,0.000000,0.517166,0.278480,...,0.247849,0.821961,0.000000,0.855885,0.0,0.767770,0.612800,0.000000,1.000000,0.000000


In [171]:
from sklearn.metrics.pairwise import cosine_similarity
'''
Same result as custom function
'''


def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    cs_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return cs_df


cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.000000,0.124438,0.008403,0.040571,0.011755,0.000000,0.016339,0.331122,0.000000,0.131794,...,0.064466,0.260941,0.071492,0.271710,0.0,0.348295,0.379492,0.000000,0.232553,0.093519
2,0.124438,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.240843,0.000000,0.095861,...,0.000000,0.000000,0.000000,0.000000,0.0,0.108082,0.117763,0.000000,0.000000,0.000000
3,0.008403,0.000000,1.000000,0.179391,0.011294,0.000000,0.072246,0.000000,0.000000,0.000000,...,0.006560,0.000000,0.068686,0.000000,0.0,0.020322,0.022142,0.000000,0.000000,0.089849
4,0.040571,0.000000,0.179391,1.000000,0.054530,0.000000,0.348828,0.000000,0.000000,0.000000,...,0.031674,0.101979,0.567487,0.000000,0.0,0.098119,0.106908,0.365843,0.000000,0.433821
5,0.011755,0.000000,0.011294,0.054530,1.000000,0.000000,0.640342,0.000000,0.000000,0.000000,...,0.009177,0.000000,0.096091,0.000000,0.0,0.028429,0.030976,0.000000,0.000000,0.125697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.348295,0.108082,0.020322,0.098119,0.028429,0.211466,0.039515,0.000000,0.397065,0.213809,...,0.206804,0.631077,0.172901,0.657123,0.0,1.000000,0.917791,0.000000,0.767770,0.226174
193583,0.379492,0.117763,0.022142,0.106908,0.030976,0.000000,0.043055,0.000000,0.000000,0.000000,...,0.169874,0.687605,0.188388,0.715984,0.0,0.917791,1.000000,0.000000,0.612800,0.246433
193585,0.000000,0.000000,0.000000,0.365843,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278750,0.644671,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.232553,0.000000,0.000000,0.000000,0.000000,0.275428,0.000000,0.000000,0.517166,0.278480,...,0.247849,0.821961,0.000000,0.855885,0.0,0.767770,0.612800,0.000000,1.000000,0.000000


In [221]:
print(cs_df[1].sort_values(ascending=False))

2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [222]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[126142])
print(movies_df.loc[2043])
print(movies_df.loc[2399])

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     The Cave of the Golden Rose (1991)
genres            Adventure|Children|Fantasy
Name: 126142, dtype: object
title     Darby O'Gill and the Little People (1959)
genres                   Adventure|Children|Fantasy
Name: 2043, dtype: object
title     Santa Claus: The Movie (1985)
genres       Adventure|Children|Fantasy
Name: 2399, dtype: object
