# K-means Clustering in sci-kit learn

This example uses a dataset downloaded from https://www.opensubtitles.org/en/search/vip and the raw data at opus.lingfil.uu.se/OpenSubtitles2016/raw/en. Metadata such as title actor and director was scraped from IMDB and is not guaranteed to be complete. This example uses the last 5000 most recent movies. The full archive (1.1 Gig) is [here](https://www.dropbox.com/s/db9d6765zbjru5x/openSubtitles.json.zip?dl=0).

The code does the following:
1. counts words 
2. builds a TFIDF weighted vocabulary
3. Applies the TFIDF weights to the word counts to create a sparce matrix
4. Runs K-means clustering on the sparce matrix
5. Prints top words for each cluster using the largest features in the cluster centroid

Be sure to install the following:
1. `pip3 install sklearn`
2. `pip3 install pandas`
2. `pip3 install scipy`


In [7]:
import pandas as pd 

import sys
sys.version 

'3.6.2 (default, Jul 30 2017, 14:53:19) \n[GCC 4.2.1 Compatible Apple LLVM 8.1.0 (clang-802.0.42)]'

## Unarchive

In [8]:
import tempfile
import zipfile
import os.path

zipFile = "./openSubtitles-5000.json.zip"

print( "Unarchiving ...")
temp_dir = tempfile.mkdtemp()
zip_ref = zipfile.ZipFile(zipFile, 'r')
zip_ref.extractall(temp_dir)
zip_ref.close()

openSubtitlesFile = os.path.join(temp_dir, "openSubtitles-5000.json")
print ("file unarchived to:" + openSubtitlesFile)


Unarchiving ...
file unarchived to:/var/folders/9l/w4_vhqyn5rz64fh1x9zzcsvr0000gn/T/tmpo0amxaix/openSubtitles-5000.json


## Tokenizing and Filtering a Vocabulary

In [9]:

import json
from sklearn.feature_extraction.text import CountVectorizer
#from log_progress import log_progress

maxDocsToload = 500

titles = []
def make_corpus(file):
    with open(file) as f:
        for i, line in enumerate(f):
            doc = json.loads(line)
            titles.append(doc.get('Title',''))
            #if 'Sci-Fi' not in doc.get('Genre',''):
            #    continue
            if i % 100 == 0:
                print ("%d " % i, end='') 
            yield doc.get('Text','')
            if i == maxDocsToload:
                break
                
print ("Starting load ...")                
textGenerator = make_corpus(openSubtitlesFile)              
count_vectorizer = CountVectorizer(min_df=2, max_df=0.75, ngram_range=(1,2), max_features=50000,
                                   stop_words='english', analyzer="word", token_pattern="[a-zA-Z]{3,}")
term_freq_matrix = count_vectorizer.fit_transform(textGenerator)
print ("Done.")
print ( "term_freq_matrix shape = %s" % (term_freq_matrix.shape,) )
print ("term_freq_matrix = \n%s" % term_freq_matrix)


Starting load ...
0 100 200 300 400 500 Done.
term_freq_matrix shape = (501, 50000)
term_freq_matrix = 
  (0, 43931)	1
  (0, 14435)	1
  (0, 44223)	1
  (0, 21102)	1
  (0, 4396)	1
  (0, 10337)	1
  (0, 16837)	1
  (0, 35135)	1
  (0, 43734)	1
  (0, 9200)	1
  (0, 28708)	1
  (0, 31177)	1
  (0, 49339)	1
  (0, 3889)	1
  (0, 23628)	1
  (0, 9280)	1
  (0, 27272)	1
  (0, 24024)	1
  (0, 27130)	1
  (0, 20614)	1
  (0, 24280)	1
  (0, 25037)	1
  (0, 48354)	1
  (0, 25995)	1
  (0, 36082)	1
  :	:
  (500, 28698)	2
  (500, 31748)	2
  (500, 17049)	1
  (500, 14885)	1
  (500, 7897)	2
  (500, 6095)	2
  (500, 15268)	15
  (500, 47476)	1
  (500, 30933)	11
  (500, 2334)	2
  (500, 14585)	1
  (500, 19297)	5
  (500, 6831)	1
  (500, 32108)	1
  (500, 47106)	5
  (500, 14120)	4
  (500, 33632)	1
  (500, 26140)	1
  (500, 7759)	1
  (500, 19195)	3
  (500, 37041)	1
  (500, 37935)	3
  (500, 18517)	3
  (500, 2129)	5
  (500, 5772)	1


## Feature Vocabulary

In [10]:
print( "Vocabulary length = ", len(count_vectorizer.vocabulary_))
word = "data";
rainingIndex = count_vectorizer.vocabulary_[word];
print( "token index for \"%s\" = %d" % (word,rainingIndex))
feature_names = count_vectorizer.get_feature_names()
print( "feature_names[%d] = %s" % (rainingIndex, feature_names[rainingIndex]))


Vocabulary length =  50000
token index for "data" = 8373
feature_names[8373] = data


In [11]:
for i in range(0,1000):
    print( "feature_names[%d] = %s" % (i, feature_names[i]))

feature_names[0] = aaaaaah
feature_names[1] = aaaah
feature_names[2] = aaah
feature_names[3] = aaargh
feature_names[4] = aafrin
feature_names[5] = aah
feature_names[6] = aah aah
feature_names[7] = aah doing
feature_names[8] = aah god
feature_names[9] = aah grunts
feature_names[10] = aah hell
feature_names[11] = aah hey
feature_names[12] = aah ohh
feature_names[13] = aargh
feature_names[14] = aaron
feature_names[15] = abandon
feature_names[16] = abandoned
feature_names[17] = abandoning
feature_names[18] = abandonment
feature_names[19] = abba
feature_names[20] = abbey
feature_names[21] = abbies
feature_names[22] = abbott
feature_names[23] = abby
feature_names[24] = abby abby
feature_names[25] = abc
feature_names[26] = abdomen
feature_names[27] = abdominal
feature_names[28] = abducted
feature_names[29] = abducted aliens
feature_names[30] = abduction
feature_names[31] = abdul
feature_names[32] = abercrombie
feature_names[33] = aberrations
feature_names[34] = abide
feature_names[35] = abiga

feature_names[783] = alignment
feature_names[784] = alike
feature_names[785] = alimony
feature_names[786] = alison
feature_names[787] = alistair
feature_names[788] = alive
feature_names[789] = alive alive
feature_names[790] = alive come
feature_names[791] = alive doesn
feature_names[792] = alive don
feature_names[793] = alive good
feature_names[794] = alive got
feature_names[795] = alive just
feature_names[796] = alive know
feature_names[797] = alive left
feature_names[798] = alive let
feature_names[799] = alive like
feature_names[800] = alive long
feature_names[801] = alive right
feature_names[802] = alive shit
feature_names[803] = alive thank
feature_names[804] = alive think
feature_names[805] = alive today
feature_names[806] = alive want
feature_names[807] = alive yeah
feature_names[808] = alive yes
feature_names[809] = allah
feature_names[810] = allah allah
feature_names[811] = allegations
feature_names[812] = alleged
feature_names[813] = allegedly
feature_names[814] = allegiance
f

## TDIF Weighting
This applys the TFIDF weight to the matrix

tfidf value = word count / number of documents word is in

The document vectors are also normalized so they have a euclidian magnitude of 1.0.

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(norm="l2")
tfidf.fit(term_freq_matrix)

tf_idf_matrix = tfidf.transform(term_freq_matrix)
print( tf_idf_matrix)

  (0, 1064)	0.102437933997
  (0, 47073)	0.00430884877549
  (0, 46233)	0.0086707711391
  (0, 47608)	0.0146182084176
  (0, 36960)	0.00534832943381
  (0, 914)	0.00659483301489
  (0, 28553)	0.0131531015348
  (0, 37151)	0.424385726557
  (0, 11384)	0.0684477619527
  (0, 48378)	0.00342467379024
  (0, 2710)	0.0192243348403
  (0, 5772)	0.00888012786058
  (0, 45001)	0.00940098966201
  (0, 28531)	0.0104408813188
  (0, 33113)	0.0141685243295
  (0, 2129)	0.00991998214818
  (0, 3718)	0.0105198839887
  (0, 12321)	0.0158761931809
  (0, 20460)	0.546864264415
  (0, 18517)	0.0127099870225
  (0, 37935)	0.0319354603694
  (0, 42137)	0.00896207663239
  (0, 27419)	0.00861861190876
  (0, 49887)	0.00495999107409
  (0, 38489)	0.00473583983605
  :	:
  (500, 26469)	0.0281321057249
  (500, 30790)	0.0292512564546
  (500, 46106)	0.0272176929976
  (500, 46813)	0.0562642114497
  (500, 34010)	0.0562642114497
  (500, 21607)	0.0281321057249
  (500, 29778)	0.0281321057249
  (500, 11287)	0.122776370756
  (500, 49317)	0.0292

## K-Means

In [13]:
%%time
from sklearn.cluster import KMeans
import numpy

num_clusters = 5
km = KMeans(n_clusters=num_clusters, verbose=True, init='k-means++', n_init=3, n_jobs=-1)
km.fit(tf_idf_matrix)

clusters = km.labels_.tolist()
print ("cluster id for each document = %s" % clusters)

print()
# sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

        


Initialization complete
Initialization complete
Initialization complete
Iteration  0, inertia 909.823
Iteration  0, inertia 903.653
Iteration  0, inertia 894.973
Iteration  1, inertia 463.493
Iteration  1, inertia 464.222
Iteration  1, inertia 463.352
Iteration  2, inertia 461.795
Iteration  2, inertia 461.767
Iteration  2, inertia 463.018
Iteration  3, inertia 461.390
Iteration  3, inertia 462.093
Iteration  3, inertia 461.086
Iteration  4, inertia 460.837
Iteration  4, inertia 461.159
Iteration  4, inertia 461.181
Iteration  5, inertia 460.553
Iteration  5, inertia 460.626
Iteration  5, inertia 461.014
Iteration  6, inertia 460.918
Iteration  6, inertia 460.336
Iteration  6, inertia 460.595
Iteration  7, inertia 460.838
Iteration  7, inertia 460.230
Iteration  7, inertia 460.511
Iteration  8, inertia 460.774
Iteration  8, inertia 460.120
Iteration  8, inertia 460.395
Iteration  9, inertia 460.612
Iteration  9, inertia 459.984
Iteration  9, inertia 460.271
Iteration 10, inertia 460.09

In [24]:
labels = pd.DataFrame(clusters, columns=['Cluster Labels'])
counts = pd.DataFrame(labels['Cluster Labels'].value_counts().sort_index())
counts.columns=['Document Count']
display(counts)

Unnamed: 0,Document Count
0,24
1,207
2,10
3,56
4,204


In [43]:
topNWords = 50

df = pd.DataFrame()

for i in range(num_clusters):
    clusterWords = []
    for topWordIndex,ind in enumerate(order_centroids[i, :topNWords]):   
        clusterWords.append( feature_names[ind] )
    df['Cluster %d' % i] = pd.Series(clusterWords)
        #dtype='object', data= [''] * topNWords)
        #print(topWordIndex)        
        #print(ind)
        #print(feature_names[ind])

df.style.set_properties(**{'text-align': 'right'})
df

Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4
0,fuck,sighs,police,sir,mom
1,fucking,chuckles,car,king,guys
2,shit,music,guy,father,dad
3,gotta,door,phone,lord,baby
4,wanna,grunts,case,men,guy
5,guy,grunting,killed,mary,ooh
6,money,indistinct,kill,majesty,cause
7,fucked,laughs,sir,queen,school
8,jesus,screaming,money,brother,laughs
9,guys,growling,dead,mother,girl


In [44]:

titlesFrame = pd.DataFrame()
titlesFrame['Labels']=km.labels_
titlesFrame['Titles']=titles

sort = titlesFrame.sort_values(by=['Labels','Titles'])
for i in range(num_clusters):
    display( sort.query('Labels == %d' % i) )

Unnamed: 0,Labels,Titles
2762,0,"""19-2"" Babylon (TV Episode 2015)"
1633,0,"""19-2"" Borders (TV Episode 2015)"
2553,0,"""19-2"" Bridges (TV Episode 2015)"
870,0,"""19-2"" Disorder (TV Episode 2015)"
2755,0,"""19-2"" Orphans (TV Episode 2015)"
1815,0,"""19-2"" Property Line (TV Episode 2015)"
1676,0,"""19-2"" Rock Garden (TV Episode 2015)"
950,0,"""19-2"" School (TV Episode 2015)"
1814,0,"""19-2"" Tables (TV Episode 2015)"
1381,0,"""19-2"" Tribes (TV Episode 2015)"


Unnamed: 0,Labels,Titles
606,1,
1808,1,"""12 Monkeys"" Arms of Mine (TV Episode 2015)"
1540,1,"""12 Monkeys"" Atari (TV Episode 2015)"
1539,1,"""12 Monkeys"" Cassandra Complex (TV Episode 2015)"
1624,1,"""12 Monkeys"" Divine Move (TV Episode 2015)"
1537,1,"""12 Monkeys"" Mentally Divergent (TV Episode 2015)"
1807,1,"""12 Monkeys"" Paradox (TV Episode 2015)"
1538,1,"""12 Monkeys"" Pilot (TV Episode 2015)"
1806,1,"""12 Monkeys"" Shonin (TV Episode 2015)"
1543,1,"""12 Monkeys"" The Keys (TV Episode 2015)"


Unnamed: 0,Labels,Titles
1479,2,"""Agent Carter"" A Sin to Err (TV Episode 2015)"
1478,2,"""Agent Carter"" The Iron Ceiling (TV Episode 2015)"
4242,2,"""Agents of S.H.I.E.L.D."" 4,722 Hours (TV Episo..."
4164,2,"""Agents of S.H.I.E.L.D."" Purpose in the Machin..."
1452,2,"""Agents of S.H.I.E.L.D."" Who You Really Are (T..."
3071,2,"""Alaska: Earth's Frozen Kingdom"" Spring (TV Ep..."
3072,2,"""Alaska: Earth's Frozen Kingdom"" Winter (TV Ep..."
3501,2,"""Allegiance"" A Convenient Place to Die (TV Epi..."
3502,2,"""Allegiance"" Blowback (TV Episode 2015)"
1789,2,"""Allegiance"" Chasing Ghosts (TV Episode 2015)"


Unnamed: 0,Labels,Titles
4282,3,"""A Place to Call Home"" In the Heat of the Nigh..."
4278,3,"""A Place to Call Home"" L'chaim, to Life (TV Ep..."
4281,3,"""A Place to Call Home"" Living in the Shadow (T..."
4283,3,"""A Place to Call Home"" Sins of the Father (TV ..."
4279,3,"""A Place to Call Home"" Somewhere Beyond the Se..."
4280,3,"""A Place to Call Home"" Too Old to Dream (TV Ep..."
2815,3,"""A.D. The Bible Continues"" Brothers in Arms (T..."
2816,3,"""A.D. The Bible Continues"" Rise Up (TV Episode..."
2814,3,"""A.D. The Bible Continues"" Saul's Return (TV E..."
2817,3,"""A.D. The Bible Continues"" The Abomination (TV..."


Unnamed: 0,Labels,Titles
4375,4,
4376,4,
4377,4,
4378,4,
4379,4,
4380,4,
4381,4,
4382,4,
4383,4,
2849,4,"""2 Broke Girls"" And the Crime Ring (TV Episode..."


# The End ...