In [212]:
import pandas as pd
import numpy as np
from sklearn import cluster, metrics
from IPython.display import clear_output
import seaborn as sns
import time

# Jaccard clustering

In order to cluster the text/set data we need an appropriate distance metric. We chose the Jaccard distance: $d(A, B) = 1 - \frac{|A \cap B|}{|A \cup B|}$

In [267]:
# Load the final movie dataset (movie metadata together with the IMDb
# `averageRating` / `numVotes` columns) and display a preview.

imdb_data = pd.read_csv('data/movie_data_imdbscores.csv')
imdb_data

Unnamed: 0,movie_id,freebase_movie_id,release_date,box_office_revenue,runtime,languages,countries,genres,english language,german language,...,Romance Film,Romantic drama,Comedy film,Documentary,plot_summary,F_gender_porportion,M_gender_porportion,actor_age_at_movie_release,averageRating,numVotes
0,30332673,/m/0crs0hx,2010-01-01,,90.0,[''],['united states of america'],"['Comedy', 'Comedy film', 'Sex comedy', 'Sport...",False,False,...,False,False,True,False,Two horny college guys get summer jobs at a ch...,0.666667,0.333333,38.333333,3.7,3222
1,4213160,/m/0bq8q8,1971-12-17,,119.0,['english language'],['united states of america'],"['Action', 'Action/Adventure', 'Comedy', 'Crim...",True,False,...,False,False,False,False,"Set in Hamburg, West Germany, several criminal...",0.250000,0.750000,41.625000,6.3,2631
2,20624798,/m/05222ld,2008-01-01,,78.0,['english language'],"['australia', 'israel']","['Animation', 'Drama', 'Indie', 'Stop motion',...",True,False,...,False,False,False,False,The film mainly focuses on 28-year-old Dave Pe...,0.111111,0.888889,41.250000,7.2,22
3,2250713,/m/06z7m4,1988-01-01,,98.0,['english language'],"['hungary', 'united states of america']","['Coming of age', 'Drama', 'Family Drama', 'Pe...",True,False,...,False,False,False,False,The father escaped the Soviet invasion of Buda...,0.000000,1.000000,42.000000,5.9,82
4,25969588,/m/02pmmn1,2007-01-01,,,['english language'],['united states of america'],"['Comedy', 'Drama', 'Indie']",True,False,...,False,False,False,False,,0.200000,0.800000,42.000000,6.4,346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49780,7211363,/m/0kv0kn,2004-06-01,,87.0,['english language'],['united states of america'],['Documentary'],True,False,...,False,False,False,True,,0.000000,1.000000,,6.5,76
49781,2453726,/m/07f6by,1991-01-01,,108.0,[''],"['croatia', 'yugoslavia']","['Crime Drama', 'Drama']",False,False,...,False,False,False,False,,0.000000,1.000000,38.000000,7.2,453
49782,2123739,/m/06ntrs,1999-12-07,,123.0,['croatian language'],['croatia'],['Drama'],False,False,...,False,False,False,False,,1.000000,0.000000,39.000000,4.1,760
49783,23767055,/m/06zpj24,1965-01-01,,81.0,['serbo-croatian language'],['yugoslavia'],"['Art film', 'Drama', 'Romance Film', 'World c...",False,False,...,True,False,False,False,Highly skilled engineer Jan Rudinski comes to...,0.400000,0.600000,30.000000,7.1,1227


For the plot summaries and the titles we use the data that has been run through our NLP pipeline (lemmatization and stopword filtering).

In [None]:
# Preprocessed plot summaries: one row per movie_id, with the summary stored as a
# single space-separated string of tokens.
plot_data = pd.read_csv('data/plot_summaries_preprocessed.csv')
plot_data

Unnamed: 0,movie_id,plot_summary
0,23890098,taxi saxophonist different hardworking develop...
1,31186339,haymitchs overhearing dodge pin supply wealth ...
2,20663735,aishwarya party daughter accused sentenced dys...
3,2231378,outside charmed free visit retirement reimburs...
4,595909,new daughter accused sentenced country insiste...
...,...,...
42298,34808485,poetry muslim english malayalam medium young m...
42299,1096473,soldier token cousin amusement dressing differ...
42300,35102018,requires testament aspiring never different ta...
42301,8628195,daughter amina realises decides also good sell...


In [174]:
# Preprocessed titles: one row per movie_id, with the title stored as a
# space-separated string of tokens in `preprocessed_title`.
title_data = pd.read_csv('data/titles_preprocessed.csv')
title_data

Unnamed: 0,movie_id,preprocessed_title
0,975900,mar ghost
1,3196793,ramsey mystery away getting jonbenét murder
2,28463795,bitter brun
3,9363483,eye white
4,261236,flame woman
...,...,...
81736,35228177,body found mermaid
81737,34980460,knuckle
81738,9971909,another mess nice
81739,913762,dimension macro fortress lover ii super


In [285]:
# List the available columns; the boolean language/genre indicator columns
# (e.g. 'Horror', 'Documentary') are used below to select a subset to cluster.
imdb_data.columns

Index(['movie_id', 'freebase_movie_id', 'release_date', 'box_office_revenue',
       'runtime', 'languages', 'countries', 'genres', 'english language',
       'german language', 'silent film language', 'spanish language',
       'japanese language', 'italian language', 'tamil language',
       'hindi language', 'malayalam language', 'mandarin language',
       'french language', 'Action', 'Adventure', 'Horror', 'Thriller', 'Drama',
       'Crime Fiction', 'Black-and-white', 'Comedy', 'Indie', 'Short Film',
       'Silent film', 'Family Film', 'World cinema', 'Musical',
       'Action/Adventure', 'Romance Film', 'Romantic drama', 'Comedy film',
       'Documentary', 'plot_summary', 'F_gender_porportion',
       'M_gender_porportion', 'actor_age_at_movie_release', 'averageRating',
       'numVotes'],
      dtype='object')

In [367]:
# Attach the preprocessed plot summary to every movie that has one (inner join
# on `movie_id`). Both frames carry a `plot_summary` column, so after the merge
# the raw text is `plot_summary_x` and the preprocessed text is `plot_summary_y`.
# To cluster on titles instead, merge with `title_data` and use its
# `preprocessed_title` column in the following cells.
cluster_data = pd.merge(imdb_data, plot_data, on='movie_id')
cluster_data

Unnamed: 0,movie_id,freebase_movie_id,release_date,box_office_revenue,runtime,languages,countries,genres,english language,german language,...,Romantic drama,Comedy film,Documentary,plot_summary_x,F_gender_porportion,M_gender_porportion,actor_age_at_movie_release,averageRating,numVotes,plot_summary_y
0,30332673,/m/0crs0hx,2010-01-01,,90.0,[''],['united states of america'],"['Comedy', 'Comedy film', 'Sex comedy', 'Sport...",False,False,...,False,True,False,Two horny college guys get summer jobs at a ch...,0.666667,0.333333,38.333333,3.7,3222,even stripper girl head different mistaken com...
1,4213160,/m/0bq8q8,1971-12-17,,119.0,['english language'],['united states of america'],"['Action', 'Action/Adventure', 'Comedy', 'Crim...",True,False,...,False,False,False,"Set in Hamburg, West Germany, several criminal...",0.250000,0.750000,41.625000,6.3,2631,u deposit new together filled box yellow sitti...
2,20624798,/m/05222ld,2008-01-01,,78.0,['english language'],"['australia', 'israel']","['Animation', 'Drama', 'Indie', 'Stop motion',...",True,False,...,False,False,False,The film mainly focuses on 28-year-old Dave Pe...,0.111111,0.888889,41.250000,7.2,22,search employment apartment looking find low j...
3,2250713,/m/06z7m4,1988-01-01,,98.0,['english language'],"['hungary', 'united states of america']","['Coming of age', 'Drama', 'Family Drama', 'Pe...",True,False,...,False,False,False,The father escaped the Soviet invasion of Buda...,0.000000,1.000000,42.000000,5.9,82,hungarian soviet son also finally outlaw kenne...
4,10331139,/m/02q8q5n,1974-01-01,,91.0,['english language'],['united states of america'],"['Action', 'Drama', 'Thriller']",True,False,...,False,False,False,The film follows a poacher named Desiree who l...,1.000000,0.000000,24.000000,5.4,1277,accidentally party search land bob deep shoote...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31286,1719500,/m/05qyl8,1995-01-01,,115.0,['french language'],['france'],"['Drama', 'Road movie']",False,False,...,False,False,False,Marie is a teenage girl living a semi-criminal...,0.666667,0.333333,31.000000,6.6,2048,girl deep never one realises explainclarify de...
31287,31799966,/m/075qqrl,1988-01-01,,,[''],['italy'],['Horror'],False,False,...,False,False,False,The American ballerina Claire Hamilton travels...,1.000000,0.000000,29.000000,5.9,698,hamilton posse american web ballet died travel...
31288,13983035,/m/03cq2ws,1988-03-10,,88.0,['german language'],['germany'],['Comedy'],False,True,...,False,False,False,Paul Winkelmann is single and running the fami...,0.500000,0.500000,54.500000,7.4,3982,psychologist coffee margarethes pastry margare...
31289,25818705,/m/09v52pt,2008-10-24,,110.0,['portuguese language'],['brazil'],"['Drama', 'World cinema']",False,False,...,False,False,False,The last hours of the life of Sandro do Nascim...,0.500000,0.500000,25.500000,7.1,5251,tv last fictional leaving hour adoptive hijack...


In [368]:
# Reduce the merged frame to the horror movies and turn each preprocessed
# summary into a set of words, which is the input the Jaccard distance needs.
# Written as one chain so that no column is assigned on a filtered slice
# (which would trigger pandas' SettingWithCopyWarning).
cluster_data = (
    cluster_data[['movie_id', 'plot_summary_y', 'Horror']]
    .dropna()
    # keep only the rows flagged as Horror
    .loc[lambda df: df['Horror']]
    # space-separated token string -> set of distinct tokens
    .assign(plot_summary_y=lambda df: df['plot_summary_y'].apply(lambda s: set(s.split(' '))))
    # renumber rows 0..n-1; the old row labels are kept in the `index` column
    .reset_index()
)
#cluster_data = cluster_data.head(2000) # work on smaller subset for speed
cluster_data

Unnamed: 0,index,movie_id,plot_summary_y,Horror
0,38,8422241,"{wearily, put, torn, visit, decided, inspired,...",True
1,40,12138219,"{flying, notice, need, exorcising, battered, d...",True
2,43,20989785,"{worked, seal, apparently, center, everyone, m...",True
3,46,34364391,"{act, fueled, circus, trapped, warehouse, late...",True
4,58,33388421,"{demon, put, clock, visit, david, support, fre...",True
...,...,...,...,...
3330,31247,9757243,"{formula, professor, dog, wrath, brought, awak...",True
3331,31248,16171201,"{chemical, action, understand, finish, cancer,...",True
3332,31251,21696582,"{ecological, milius, chance, lead, interpol, h...",True
3333,31276,3477744,"{address, formula, posse, visit, really, von, ...",True


As the sklearn implementation of the Jaccard distance operates on boolean vectors of equal length rather than on sets of varying size, we define our own function.

In [360]:
def jaccard_distance(a: set, b: set) -> float:
    """Return the Jaccard distance 1 - |a ∩ b| / |a ∪ b| between two sets.

    Parameters
    ----------
    a, b : set
        The two sets to compare (e.g. the distinct words of two documents).

    Returns
    -------
    float
        0 for identical sets, 1 for disjoint sets. Two empty sets are
        treated as identical and give 0.0 instead of dividing by zero.
    """
    aub = a.union(b)
    if not aub:
        # Both sets are empty: the ratio is undefined, by convention distance 0.
        return 0.0
    anb = a.intersection(b)
    return 1 - len(anb)/len(aub)

array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90])

In [361]:
# Build the full pairwise Jaccard distance matrix D (n x n, symmetric, zero
# diagonal). Only the strict lower triangle is computed; it is mirrored at the end.
n = len(cluster_data.index)
# Rows are addressed by position below, so the index has to be exactly 0..n-1
# (which is what the reset_index() in the preparation cell produces).
assert cluster_data.index.equals(pd.RangeIndex(n)), "cluster_data must have a 0..n-1 index"
# Pull the word sets out of the DataFrame once; this avoids two pandas `.loc`
# lookups for every one of the n*(n-1)/2 pairs.
word_sets = cluster_data['plot_summary_y'].tolist()
D = np.zeros((n, n))

print("Creating distance matrix:")
step = 100  # report progress every `step` rows
stops = np.arange(step,n,step)
N = n*(n-1)//2  # total number of pairs to evaluate
t0 = time.time()
for i in range(n):
    if i in stops:
        k = i*(i-1)/2  # pairs already evaluated (rows 0..i-1)
        t = time.time()
        T_est = (t-t0)*N/k  # total runtime extrapolated from the pairs done so far
        print(f"{100*k/N:.1f}%,\t{t-t0:.1f}s elapsed,\testimated duration {T_est:.1f},\tETF {T_est - (t-t0)}s")
    for j in range(i):
        D[i,j] = jaccard_distance(word_sets[i], word_sets[j])
# Mirror the lower triangle into the upper one (the diagonal stays 0).
D = D + D.T
D

Creating distance matrix:
0.1%,	0.3s elapsed,	estimated duration 367.0,	ETF 366.6752938374124s
0.4%,	1.5s elapsed,	estimated duration 414.2,	ETF 412.70490326366235s
0.8%,	2.9s elapsed,	estimated duration 354.7,	ETF 351.85208333638997s
1.4%,	5.0s elapsed,	estimated duration 348.5,	ETF 343.4581372531584s
2.2%,	8.0s elapsed,	estimated duration 355.2,	ETF 347.20886180092197s
3.2%,	11.3s elapsed,	estimated duration 351.0,	ETF 339.6608018585093s
4.4%,	15.3s elapsed,	estimated duration 347.6,	ETF 332.3431813860583s
5.7%,	19.6s elapsed,	estimated duration 340.7,	ETF 321.06682172815624s
7.3%,	24.4s elapsed,	estimated duration 334.7,	ETF 310.3507173038298s
9.0%,	29.9s elapsed,	estimated duration 332.7,	ETF 302.8238386812153s
10.9%,	36.3s elapsed,	estimated duration 333.5,	ETF 297.24280389130524s
12.9%,	43.3s elapsed,	estimated duration 334.4,	ETF 291.1330181260771s
15.2%,	50.9s elapsed,	estimated duration 335.2,	ETF 284.2635773394525s
17.6%,	59.0s elapsed,	estimated duration 335.1,	ETF 276.09904

array([[0.        , 0.88297872, 0.96116505, ..., 0.97832817, 0.95081967,
        0.99677419],
       [0.88297872, 0.        , 0.95111111, ..., 0.98918919, 0.96421053,
        0.99433428],
       [0.96116505, 0.95111111, 0.        , ..., 0.97619048, 0.96428571,
        0.98675497],
       ...,
       [0.97832817, 0.98918919, 0.97619048, ..., 0.        , 0.98947368,
        1.        ],
       [0.95081967, 0.96421053, 0.96428571, ..., 0.98947368, 0.        ,
        0.99418605],
       [0.99677419, 0.99433428, 0.98675497, ..., 1.        , 0.99418605,
        0.        ]])

In [362]:
# Complete-linkage agglomerative clustering on the precomputed Jaccard distances.
# NOTE(review): `affinity` was renamed to `metric` in scikit-learn 1.2 and removed
# in 1.4 -- switch the keyword when running on a newer scikit-learn.
clust = cluster.AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=5)
# fit_predict returns one label per row of D; rows of D follow cluster_data's 0..n-1 index.
cpred = clust.fit_predict(D)[cluster_data.index]
cluster_data['cluster'] = cpred
clusters = cluster_data.groupby('cluster')
# The score is printed on its own (no leftover loop variable from the
# distance-matrix cell), in the same format as the KMeans cell.
print(f"Silhouette score: {metrics.silhouette_score(D, metric='precomputed', labels=cpred)}")
# Number of movies per cluster.
clusters['movie_id'].describe()['count']

3334, Silhouette score: 0.011566945320453752


cluster
0    1734.0
1    1574.0
2      16.0
3       7.0
4       4.0
Name: count, dtype: float64

In [363]:
# K-means for comparison. KMeans has no precomputed-distance mode: each row of D
# (a movie's distances to all other movies) is used as that movie's feature vector.
# random_state is fixed so that the cluster assignment is reproducible across runs.
clust = cluster.KMeans(n_clusters=5, random_state=0)
# fit_predict returns one label per row of D; rows of D follow cluster_data's 0..n-1 index.
cpred = clust.fit_predict(D)[cluster_data.index]
cluster_data['cluster'] = cpred
clusters = cluster_data.groupby('cluster')
# The silhouette is still evaluated on the Jaccard distances themselves.
print(f"Silhouette score: {metrics.silhouette_score(D, metric='precomputed', labels=cpred)}")
# Number of movies per cluster.
clusters['movie_id'].describe()['count']

Silhouette score: -0.0036115605203038783


cluster
0    552.0
1    767.0
2    704.0
3    623.0
4    689.0
Name: count, dtype: float64

In [365]:
# Density-based clustering on the precomputed Jaccard distances: two movies are
# neighbours when their distance is at most eps.
clust = cluster.DBSCAN(metric='precomputed', eps = 0.9)
# fit_predict returns one label per row of D; rows of D follow cluster_data's 0..n-1 index.
cpred = clust.fit_predict(D)[cluster_data.index]
cluster_data['cluster'] = cpred
clusters = cluster_data.groupby('cluster')
# NOTE(review): DBSCAN labels noise points as -1, and they are passed to
# silhouette_score as if they formed one cluster -- interpret the score with care.
# The score is printed on its own (no leftover loop variable from the
# distance-matrix cell), in the same format as the KMeans cell.
print(f"Silhouette score: {metrics.silhouette_score(D, metric='precomputed', labels=cpred)}")
# Number of movies per cluster (-1 = noise).
clusters['movie_id'].describe()['count']

3334, Silhouette score: 0.0119435196444607


cluster
-1    1723.0
 0    1529.0
 1      38.0
 2       9.0
 3       7.0
 4       6.0
 5       8.0
 6       5.0
 7       6.0
 8       4.0
Name: count, dtype: float64