In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

from scipy.sparse import coo_matrix, hstack

from src.text import clean_text
from src.factorization import describe_csr_matrix

In [2]:
movies = pd.read_csv('data/movie_info.csv', index_col='id')
tags = pd.read_csv('data/movie_tags.csv', index_col='id')
keywords = pd.read_csv('data/movie_keywords.csv', index_col='id')

movies = movies.join(tags)
movies = movies.join(keywords)

movies.fillna('', inplace=True)
movies.reset_index(drop=False, inplace=True)
movies['idx'] = movies.index

movies.head()

Unnamed: 0,id,original_title,title,overview,tagline,tags,keywords,idx
0,2,Jumanji,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,fantasy adapted from:book animals bad cgi base...,board game disappearance based on children's b...,0
1,3,Grumpier Old Men,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,moldy old Ann Margaret Burgess Meredith Daryl ...,fishing best friend duringcreditsstinger old men,1
2,4,Waiting to Exhale,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,characters girl movie characters chick flick b...,based on novel interracial relationship single...,2
3,5,Father of the Bride Part II,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,steve martin steve martin pregnancy remake agi...,baby midlife crisis confidence aging daughter ...,3
4,6,Heat,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,overrated bank robbery crime heists relationsh...,robbery detective bank obsession chase shootin...,4


In [3]:
%%time

movies['text'] = movies['title'] + " " + movies['overview'] + " " + movies['tagline'] + " " + movies['tags'] + " " + movies['keywords']
text = list(map(clean_text, movies['text']))

CPU times: user 5.15 s, sys: 28 ms, total: 5.18 s
Wall time: 5.18 s


In [4]:
text[0]

'jumanji siblings judy peter discover enchanted board game opens door magical world unwittingly invite alan adult trapped inside game years living room alan hope freedom finish game proves risky three find running giant rhinoceroses evil monkeys terrifying creatures roll dice unleash excitement fantasy adapted book animals bad cgi based book board game childhood recaptured children chris van allsburg fantasy filmed bc jungle kid flick kirsten dunst monkey robin williams saturn award best special effects saturn award best supporting actress scary time time travel fantasy robin williams adapted book childish children kid flick time travel robin williams time travel robin williams joe johnston robin williams children kid flick itaege fantasy robin williams scary time travel game animals comedy fiction thrill dynamic cgi action bad cgi horrifying horror genre kirsten dunst magic board game monkey kids based children book board game disappearance giant insect new home recluse animals fantas

In [5]:
%%time

vectorizer = TfidfVectorizer(lowercase=False, max_df=0.5, min_df=5, ngram_range=(1, 2), 
                             norm='l2', max_features=30000, sublinear_tf=True)
tfidf = vectorizer.fit_transform(text)

print(describe_csr_matrix(tfidf))

20664 x 30000 sparse matrix with 99.79% of sparsity.
CPU times: user 7.66 s, sys: 212 ms, total: 7.87 s
Wall time: 7.87 s


In [6]:
%%time

N_COMPONENTS = 300

svd = TruncatedSVD(n_components=N_COMPONENTS, algorithm="arpack", random_state=0)
tfidf_embedding = svd.fit_transform(tfidf)
tfidf_embedding = normalize(tfidf_embedding, norm="l2", axis=1, copy=False)

CPU times: user 2min 52s, sys: 2.49 s, total: 2min 55s
Wall time: 15.5 s


In [7]:
tfidf_embedding[:10, :5]

array([[ 0.25978422,  0.14438212, -0.05602641, -0.22529994, -0.04495186],
       [ 0.2858111 , -0.0013326 , -0.01666825, -0.07195877, -0.13320343],
       [ 0.26561558, -0.00429922,  0.04043506, -0.1946472 , -0.00089588],
       [ 0.26571431, -0.0101457 ,  0.04745808, -0.08960982, -0.10626855],
       [ 0.28616025,  0.06104085, -0.18023526, -0.02229071,  0.08331679],
       [ 0.24919263,  0.02468183,  0.05931765, -0.22802884, -0.01663561],
       [ 0.24453729,  0.06750767, -0.05225763, -0.13217729, -0.00044391],
       [ 0.10017961,  0.0318589 , -0.04592212,  0.02716878, -0.03254294],
       [ 0.19016097,  0.10069776, -0.11507438, -0.01157223,  0.01068907],
       [ 0.25756446,  0.08071997,  0.03394685, -0.08249145, -0.03711651]])

In [8]:
%%time

movies[['id','title']].to_csv('output/content_embedding_meta.tsv', sep='\t', header=True, index=False)
pd.DataFrame(tfidf_embedding).to_csv('output/tfidf_embedding_vectors.tsv', sep='\t', 
                                     float_format='%.5f', header=False, index=False)

CPU times: user 8.93 s, sys: 56 ms, total: 8.99 s
Wall time: 9.03 s


![](img/tfidf_lord_rings.png)

![](img/tfidf_star_wars.png)

![](img/tfidf_pulp_fiction.png)

In [27]:
crew = pd.read_csv('data/movie_producer.csv', index_col='id')

crew = crew.join(movies[['id','idx']].set_index('id'), how='inner')
crew.reset_index(drop=True, inplace=True)
crew.rename(columns={'idx':'movie_idx'}, inplace=True)

crew['person_id'] = crew['person_id'].astype('category')
crew['person_idx'] = crew['person_id'].cat.codes

crew.sort_values(['person_idx','movie_idx'], inplace=True)

crew.head()

Unnamed: 0,person_id,job,person_name,num_movies,movie_idx,person_idx
361,1,Director,George Lucas,61,204,0
362,1,Writer,George Lucas,61,204,0
3409,1,Director,George Lucas,61,1929,0
4347,1,Director,George Lucas,61,2469,0
6826,1,Director,George Lucas,61,3933,0


In [28]:
crew = crew.groupby(['person_idx','movie_idx']).agg(
    count=pd.NamedAgg(column='movie_idx', aggfunc='count')
)
crew.reset_index(drop=False, inplace=True)

crew.head()

Unnamed: 0,person_idx,movie_idx,count
0,0,204,2
1,0,1929,1
2,0,2469,1
3,0,3933,2
4,0,4774,1


In [29]:
values = crew['count'].values
idx = (crew.movie_idx.values, crew.person_idx.values,)
dim = (movies.idx.max()+1, crew.person_idx.max()+1)
x_crew = coo_matrix((values, idx), shape=dim).tocsr()

print(describe_csr_matrix(x_crew))

20664 x 3635 sparse matrix with 99.97% of sparsity.


In [12]:
x_crew = normalize(x_crew, norm="l2", axis=1, copy=False)
x_tfidf_crew = hstack([tfidf, x_crew])

print(describe_csr_matrix(x_tfidf_crew))

20664 x 34106 sparse matrix with 99.82% of sparsity.


In [13]:
%%time

tfidf_crew_embedding = svd.fit_transform(x_tfidf_crew)
tfidf_crew_embedding = normalize(tfidf_crew_embedding, norm="l2", axis=1, copy=False)

CPU times: user 2min 50s, sys: 2.54 s, total: 2min 52s
Wall time: 15.3 s


In [14]:
tfidf_crew_embedding[:10, :5]

array([[ 0.31318117,  0.17605868, -0.08106005, -0.24501315,  0.05572816],
       [ 0.33767752, -0.00823395,  0.00432691, -0.06634586, -0.13689336],
       [ 0.32999252, -0.01437527,  0.06563404, -0.20575035, -0.05288874],
       [ 0.30538574, -0.03881612,  0.08001554, -0.10589796, -0.08918625],
       [ 0.16065123,  0.04394259, -0.09994559, -0.03926394, -0.0276426 ],
       [ 0.12164504,  0.02239622,  0.03740184, -0.12167827, -0.04634859],
       [ 0.32886804,  0.09096829, -0.05632673, -0.14993616, -0.03390621],
       [ 0.06832814,  0.02244006, -0.04589063,  0.02329674, -0.00306025],
       [ 0.17951467,  0.10554795, -0.12176603, -0.02243625,  0.01868125],
       [ 0.15990407,  0.04954473,  0.03144718, -0.07532962, -0.04347711]])

In [15]:
%%time

pd.DataFrame(tfidf_crew_embedding).to_csv('output/tfidf_crew_embedding_vectors.tsv', sep='\t', 
                                          float_format='%.5f', header=False, index=False)

CPU times: user 9.02 s, sys: 40 ms, total: 9.06 s
Wall time: 9.06 s


![](img/tfidf_crew_lord_rings.png)

![](img/tfidf_crew_star_wars.png)

![](img/tfidf_crew_pulp_fiction.png)

In [25]:
cast = pd.read_csv('data/movie_actor.csv', index_col='id')

cast = cast.join(movies[['id','idx']].set_index('id'), how='inner')
cast.reset_index(drop=True, inplace=True)
cast.rename(columns={'idx':'movie_idx'}, inplace=True)

cast['actor_id'] = cast['actor_id'].astype('category')
cast['actor_idx'] = cast['actor_id'].cat.codes
cast['count'] = 1.0

cast.sort_values(['actor_idx','movie_idx'], inplace=True)

cast.head()

Unnamed: 0,actor_id,actor_name,num_movies,movie_idx,actor_idx,count
14430,1,George Lucas,20,1522,0,1.0
23696,1,George Lucas,20,2555,0,1.0
55828,1,George Lucas,20,6410,0,1.0
59623,1,George Lucas,20,6955,0,1.0
85152,1,George Lucas,20,10229,0,1.0


In [26]:
values = cast['count'].values
idx = (cast.movie_idx.values, cast.actor_idx.values,)
dim = (movies.idx.max()+1, cast.actor_idx.max()+1)
x_cast = coo_matrix((values, idx), shape=dim).tocsr()

print(describe_csr_matrix(x_cast))

20664 x 10269 sparse matrix with 99.94% of sparsity.


In [30]:
x_tfidf_crew_cast = hstack([tfidf, x_crew, x_cast])

print(describe_csr_matrix(x_tfidf_crew_cast))

20664 x 43904 sparse matrix with 99.84% of sparsity.


In [31]:
%%time

tfidf_crew_cast_embedding = svd.fit_transform(x_tfidf_crew_cast)
tfidf_crew_cast_embedding = normalize(tfidf_crew_cast_embedding, norm="l2", axis=1, copy=False)

CPU times: user 3min 34s, sys: 3.19 s, total: 3min 37s
Wall time: 19.2 s


In [32]:
tfidf_crew_cast_embedding[:10, :5]

array([[ 4.64751395e-04,  6.54836918e-02,  4.19997931e-02,
        -1.80599902e-01, -1.34363617e-02],
       [ 1.41679640e-03,  4.27266195e-02,  4.72426610e-02,
        -9.92891462e-02, -6.80458834e-03],
       [ 1.52663667e-03,  7.60918158e-02,  6.50928235e-02,
        -2.23117202e-01, -1.46940540e-02],
       [ 3.21382295e-04,  4.70928298e-02,  3.37889305e-02,
        -1.23476697e-01, -9.18367327e-03],
       [ 4.65341012e-04,  4.71901327e-02,  3.07430766e-02,
        -1.51399891e-01, -1.17035544e-02],
       [ 7.01697209e-04,  4.10423261e-02,  2.82187929e-02,
        -1.15515118e-01, -8.55346765e-03],
       [ 1.60814257e-03,  1.63717682e-01,  1.53390957e-01,
        -3.90336691e-01, -2.03963537e-02],
       [ 6.54653172e-04,  4.34461847e-02,  3.97507282e-02,
        -1.26215998e-01, -3.51817374e-03],
       [ 4.94849818e-04,  3.29496022e-02,  2.48962696e-02,
        -9.81895522e-02, -7.30288853e-03],
       [ 7.74255137e-04,  4.70633978e-02,  3.45051163e-02,
        -1.32088878e-01

In [34]:
%%time

pd.DataFrame(tfidf_crew_cast_embedding).to_csv('output/tfidf_crew_cast_embedding_vectors.tsv', sep='\t', 
                                          float_format='%.5f', header=False, index=False)

CPU times: user 8.72 s, sys: 136 ms, total: 8.85 s
Wall time: 8.85 s


![](img/tfidf_crew_cast_lord_rings.png)

![](img/tfidf_crew_cast_star_wars.png)

![](img/tfidf_crew_cast_pulp_fiction.png)