In [212]:
import pandas as pd
import numpy as np
from sklearn import cluster, metrics
from IPython.display import clear_output
import seaborn as sns
import time

# Jaccard clustering

In order to do clustering on the text/set data we need an appropriate distance metric, we chose the Jaccard distance: $d(A, B) = 1 - \frac{A\cup B}{A \cap B}$

In [267]:
# import the final movie dataset

imdb_data = pd.read_csv('data/movie_data_imdbscores.csv')
imdb_data

Unnamed: 0,movie_id,freebase_movie_id,release_date,box_office_revenue,runtime,languages,countries,genres,english language,german language,...,Romance Film,Romantic drama,Comedy film,Documentary,plot_summary,F_gender_porportion,M_gender_porportion,actor_age_at_movie_release,averageRating,numVotes
0,30332673,/m/0crs0hx,2010-01-01,,90.0,[''],['united states of america'],"['Comedy', 'Comedy film', 'Sex comedy', 'Sport...",False,False,...,False,False,True,False,Two horny college guys get summer jobs at a ch...,0.666667,0.333333,38.333333,3.7,3222
1,4213160,/m/0bq8q8,1971-12-17,,119.0,['english language'],['united states of america'],"['Action', 'Action/Adventure', 'Comedy', 'Crim...",True,False,...,False,False,False,False,"Set in Hamburg, West Germany, several criminal...",0.250000,0.750000,41.625000,6.3,2631
2,20624798,/m/05222ld,2008-01-01,,78.0,['english language'],"['australia', 'israel']","['Animation', 'Drama', 'Indie', 'Stop motion',...",True,False,...,False,False,False,False,The film mainly focuses on 28-year-old Dave Pe...,0.111111,0.888889,41.250000,7.2,22
3,2250713,/m/06z7m4,1988-01-01,,98.0,['english language'],"['hungary', 'united states of america']","['Coming of age', 'Drama', 'Family Drama', 'Pe...",True,False,...,False,False,False,False,The father escaped the Soviet invasion of Buda...,0.000000,1.000000,42.000000,5.9,82
4,25969588,/m/02pmmn1,2007-01-01,,,['english language'],['united states of america'],"['Comedy', 'Drama', 'Indie']",True,False,...,False,False,False,False,,0.200000,0.800000,42.000000,6.4,346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49780,7211363,/m/0kv0kn,2004-06-01,,87.0,['english language'],['united states of america'],['Documentary'],True,False,...,False,False,False,True,,0.000000,1.000000,,6.5,76
49781,2453726,/m/07f6by,1991-01-01,,108.0,[''],"['croatia', 'yugoslavia']","['Crime Drama', 'Drama']",False,False,...,False,False,False,False,,0.000000,1.000000,38.000000,7.2,453
49782,2123739,/m/06ntrs,1999-12-07,,123.0,['croatian language'],['croatia'],['Drama'],False,False,...,False,False,False,False,,1.000000,0.000000,39.000000,4.1,760
49783,23767055,/m/06zpj24,1965-01-01,,81.0,['serbo-croatian language'],['yugoslavia'],"['Art film', 'Drama', 'Romance Film', 'World c...",False,False,...,True,False,False,False,Highly skilled engineer Jan Rudinski comes to...,0.400000,0.600000,30.000000,7.1,1227


For the titles we use the data that has been run through our NLP pipeline (lemmatization and stopword filtering).

In [182]:
plot_data = pd.read_csv('data/plot_summaries_preprocessed.csv')
plot_data

Unnamed: 0,movie_id,plot_summary
0,23890098,taxi saxophonist different hardworking develop...
1,31186339,haymitchs overhearing dodge pin supply wealth ...
2,20663735,aishwarya party daughter accused sentenced dys...
3,2231378,outside charmed free visit retirement reimburs...
4,595909,new daughter accused sentenced country insiste...
...,...,...
42298,34808485,poetry muslim english malayalam medium young m...
42299,1096473,soldier token cousin amusement dressing differ...
42300,35102018,requires testament aspiring never different ta...
42301,8628195,daughter amina realises decides also good sell...


In [174]:
title_data = pd.read_csv('data/titles_preprocessed.csv')
title_data

Unnamed: 0,movie_id,preprocessed_title
0,975900,mar ghost
1,3196793,ramsey mystery away getting jonbenét murder
2,28463795,bitter brun
3,9363483,eye white
4,261236,flame woman
...,...,...
81736,35228177,body found mermaid
81737,34980460,knuckle
81738,9971909,another mess nice
81739,913762,dimension macro fortress lover ii super


In [285]:
imdb_data.columns

Index(['movie_id', 'freebase_movie_id', 'release_date', 'box_office_revenue',
       'runtime', 'languages', 'countries', 'genres', 'english language',
       'german language', 'silent film language', 'spanish language',
       'japanese language', 'italian language', 'tamil language',
       'hindi language', 'malayalam language', 'mandarin language',
       'french language', 'Action', 'Adventure', 'Horror', 'Thriller', 'Drama',
       'Crime Fiction', 'Black-and-white', 'Comedy', 'Indie', 'Short Film',
       'Silent film', 'Family Film', 'World cinema', 'Musical',
       'Action/Adventure', 'Romance Film', 'Romantic drama', 'Comedy film',
       'Documentary', 'plot_summary', 'F_gender_porportion',
       'M_gender_porportion', 'actor_age_at_movie_release', 'averageRating',
       'numVotes'],
      dtype='object')

In [315]:
# transform data into word lists for clustering

#cluster_data = pd.merge(imdb_data, title_data, left_on='movie_id', right_on='movie_id')[['movie_id', 'preprocessed_title','Documentary']].dropna()
#cluster_data = plot_data.copy()
cluster_data = pd.merge(imdb_data, title_data, left_on='movie_id', right_on='movie_id')[['movie_id', 'plot_summary','Silent film']].dropna()
cluster_data = cluster_data[cluster_data['Silent film']]
cluster_data['plot_summary'] = cluster_data['plot_summary'].apply(lambda s: set(s.split(' ')))
cluster_data = cluster_data.reset_index()
#cluster_data = cluster_data.head(2000) # work on smaller subset for speed
cluster_data

Unnamed: 0,index,movie_id,plot_summary,Silent film
0,6,13504095,"{, who, and, indelible, Prevost,, 1918, an, th...",True
1,464,1812238,"{, Master, British, ago, spies, caught, family...",True
2,470,7218457,"{, who, agrees, do, the, days,, At, as, There,...",True
3,507,1568715,"{off, attack, falls, another, knocking, and, t...",True
4,512,13911206,"{, attempts, who, and, falls, the, The, Jorge,...",True
...,...,...,...,...
634,49046,767193,"{, who, entrusted, place,, agrees, confidant, ...",True
635,49082,35031759,"{, recovering, who, the, considers, comes, wis...",True
636,49165,5604231,"{perch., put, gentleman,, the, still, barn, na...",True
637,49511,15149007,"{plenty, who, the, Peggy, spinning, At, steed,...",True


As the sklearn implementation of Jaccard distance requires sets to be of the same size, we define our own function.

In [270]:
def jaccard_distance(a, b):
    aub = a.union(b)
    anb = a.intersection(b)
    return 1 - len(anb)/len(aub)

array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90])

In [316]:
D = np.zeros((cluster_data.index[-1]+1, cluster_data.index[-1]+1))

print("Creating distance matrix:")
n = len(cluster_data.index)
step = 100
stops = np.arange(step,n,step)
N = n*(n-1)//2
t0 = time.time()
for i in cluster_data.index:
    if i in stops:
        k = i*(i-1)/2
        t = time.time()
        T_est = (t-t0)*N/k
        print(f"{100*k/N:.1f}%,\t{t-t0:.1f}s elapsed,\testimated duration {T_est:.1f},\tETF {T_est - (t-t0)}s")
    for j in cluster_data.index[:i]:
        D[i,j] = jaccard_distance(cluster_data['plot_summary'].loc[i], cluster_data['plot_summary'].loc[j])
D = D + D.T
D

Creating distance matrix:
2.4%,	0.4s elapsed,	estimated duration 16.4,	ETF 15.994973797798156s
9.8%,	1.5s elapsed,	estimated duration 15.0,	ETF 13.49608375989013s
22.0%,	2.9s elapsed,	estimated duration 13.3,	ETF 10.362939267142561s
39.1%,	5.2s elapsed,	estimated duration 13.3,	ETF 8.106903648215129s
61.2%,	7.9s elapsed,	estimated duration 12.9,	ETF 5.009860179931701s
88.2%,	11.1s elapsed,	estimated duration 12.5,	ETF 1.4851980816223378s


array([[0.        , 0.95342466, 0.9379562 , ..., 0.96033994, 0.92553191,
        0.94623656],
       [0.95342466, 0.        , 0.89561587, ..., 0.9       , 0.92420538,
        0.97247706],
       [0.9379562 , 0.89561587, 0.        , ..., 0.90405117, 0.89206349,
        0.94396552],
       ...,
       [0.96033994, 0.9       , 0.90405117, ..., 0.        , 0.90745501,
        0.96784566],
       [0.92553191, 0.92420538, 0.89206349, ..., 0.90745501, 0.        ,
        0.93877551],
       [0.94623656, 0.97247706, 0.94396552, ..., 0.96784566, 0.93877551,
        0.        ]])

In [326]:
clust = cluster.AgglomerativeClustering(affinity='precomputed', linkage='complete', n_clusters=5)
cpred = clust.fit_predict(D)[cluster_data.index]
cluster_data['cluster'] = cpred
#cluster_data[cluster_data['cluster']==2]
clusters = cluster_data.groupby('cluster')
print(f"{i}, Silhouette score: {metrics.silhouette_score(D, metric='precomputed', labels=cpred)}")
clusters['movie_id'].describe()['count']

638, Silhouette score: 0.018326288092678975


cluster
0    127.0
1    512.0
Name: count, dtype: float64

In [329]:
clust = cluster.KMeans(n_clusters=5)
cpred = clust.fit_predict(D)[cluster_data.index]
cluster_data['cluster'] = cpred
#cluster_data[cluster_data['cluster']==2]
clusters = cluster_data.groupby('cluster')
print(f"Silhouette score: {metrics.silhouette_score(D, metric='precomputed', labels=cpred)}")
clusters['movie_id'].describe()['count']

Silhouette score: 0.013925443866057955


cluster
0    249.0
1    172.0
2    218.0
Name: count, dtype: float64