In [1]:
import pandas as pd
import numpy as np
from sklearn import cluster

# Jaccard clustering

In order to do clustering on the text/set data we need an appropriate distance metric, we chose the Jaccard distance: $d(A, B) = 1 - \frac{A\cup B}{A \cap B}$

In [2]:
# import the final movie dataset

imdb_data = pd.read_csv('data/movie_data_imdbscores.csv')
imdb_data

Unnamed: 0,movie_id,freebase_movie_id,release_date,box_office_revenue,runtime,languages,countries,genres,english language,german language,...,Comedy film,Documentary,plot_summary,F_gender_porportion,M_gender_porportion,positive_count,negative_count,pn,averageRating,numVotes
0,30332673,/m/0crs0hx,2010,,90.0,[''],['united states of america'],"['Comedy', 'Comedy film', 'Sex comedy', 'Sport...",False,False,...,True,False,Two horny college guys get summer jobs at a ch...,0.666667,0.333333,9,9,0.500000,3.7,3222
1,4213160,/m/0bq8q8,1971-12-17,,119.0,['english language'],['united states of america'],"['Action', 'Action/Adventure', 'Comedy', 'Crim...",True,False,...,False,False,"Set in Hamburg, West Germany, several criminal...",0.250000,0.750000,16,46,0.258065,6.3,2631
2,20624798,/m/05222ld,2008,,78.0,['english language'],"['australia', 'israel']","['Animation', 'Drama', 'Indie', 'Stop motion',...",True,False,...,False,False,The film mainly focuses on 28-year-old Dave Pe...,0.111111,0.888889,5,6,0.454545,7.2,22
3,2250713,/m/06z7m4,1988,,98.0,['english language'],"['hungary', 'united states of america']","['Coming of age', 'Drama', 'Family Drama', 'Pe...",True,False,...,False,False,The father escaped the Soviet invasion of Buda...,0.000000,1.000000,2,22,0.083333,5.9,82
4,10331139,/m/02q8q5n,1974,,91.0,['english language'],['united states of america'],"['Action', 'Drama', 'Thriller']",True,False,...,False,False,The film follows a poacher named Desiree who l...,1.000000,0.000000,2,9,0.181818,5.4,1277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32714,31799966,/m/075qqrl,1988,,,[''],['italy'],['Horror'],False,False,...,False,False,The American ballerina Claire Hamilton travels...,1.000000,0.000000,1,7,0.125000,5.9,698
32715,13983035,/m/03cq2ws,1988-03-10,,88.0,['german language'],['germany'],['Comedy'],False,True,...,False,False,Paul Winkelmann is single and running the fami...,0.500000,0.500000,8,17,0.320000,7.4,3982
32716,25818705,/m/09v52pt,2008-10-24,,110.0,['portuguese language'],['brazil'],"['Drama', 'World cinema']",False,False,...,False,False,The last hours of the life of Sandro do Nascim...,0.500000,0.500000,1,12,0.076923,7.1,5251
32717,23767055,/m/06zpj24,1965,,81.0,['serbo-croatian language'],['yugoslavia'],"['Art film', 'Drama', 'Romance Film', 'World c...",False,False,...,False,False,Highly skilled engineer Jan Rudinski comes to...,0.400000,0.600000,15,39,0.277778,7.1,1227


For the titles we use the data that has been run through our NLP pipeline (lemmatization and stopword filtering).

In [3]:
title_data = pd.read_csv('data/titles_preprocessed.csv')
title_data

Unnamed: 0,movie_id,preprocessed_title
0,975900,mar ghost
1,3196793,ramsey mystery away getting jonbenét murder
2,28463795,bitter brun
3,9363483,eye white
4,261236,flame woman
...,...,...
81736,35228177,body found mermaid
81737,34980460,knuckle
81738,9971909,another mess nice
81739,913762,dimension macro fortress lover ii super


In [4]:
# transform data into word lists for clustering

cluster_data = pd.merge(imdb_data, title_data, left_on='movie_id', right_on='movie_id')[['movie_id', 'preprocessed_title']].dropna()
cluster_data['preprocessed_title'] = cluster_data['preprocessed_title'].apply(lambda s: set(s.split(' ')))
cluster_data = cluster_data.head(200) # work on smaller subset for speed

As the sklearn implementation of Jaccard distance requires sets to be of the same size, we define our own function.

In [5]:
def jaccard_distance(a, b):
    aub = a.union(b)
    anb = a.intersection(b)
    return 1 - len(anb)/len(aub)

In [6]:
D = np.ones((cluster_data.index[-1]+1, cluster_data.index[-1]+1))

for i in cluster_data.index:
    for j in cluster_data.index:
        D[i,j] = jaccard_distance(cluster_data['preprocessed_title'].loc[i], cluster_data['preprocessed_title'].loc[j])

In [7]:
clust = cluster.AgglomerativeClustering(affinity='precomputed', linkage='average', n_clusters=2)
cpred = clust.fit_predict(D)[cluster_data.index]
cluster_data['cluster'] = cpred

In [8]:
cluster_data

Unnamed: 0,movie_id,preprocessed_title,cluster
0,30332673,"{cheerleader, camp}",0
4,10331139,"{gator, bait}",0
5,13504095,{jim},0
6,3610422,"{sky, neath, arizona}",0
7,8098268,"{mother, night}",1
...,...,...,...
243,5600244,"{later, year}",1
245,36036304,{degree},0
246,25397721,"{elm, way}",0
247,1468747,{pickup},0


In [None]:
import seaborn as sns

sns.hist