# K-means Clustering in sci-kit learn

This example uses a dataset downloaded from https://www.opensubtitles.org/en/search/vip and the raw data at opus.lingfil.uu.se/OpenSubtitles2016/raw/en. Metadata such as title actor and director was scraped from IMDB and is not guaranteed to be complete. This example uses the last 5000 most recent movies. The full archive (1.1 Gig) is [here](https://www.dropbox.com/s/db9d6765zbjru5x/openSubtitles.json.zip?dl=0).

The code does the following:
1. counts words 
2. builds a TFIDF weighted vocabulary
3. Applies the TFIDF weights to the word counts to create a sparce matrix
4. Runs K-means clustering on the sparce matrix
5. Prints top words for each cluster using the largest features in the cluster centroid

Be sure to install the following:
1. `pip3 install sklearn`
2. `pip3 install pandas`
2. `pip3 install scipy`


In [1]:
import sys
sys.version 

'3.6.2 (default, Jul 30 2017, 14:53:19) \n[GCC 4.2.1 Compatible Apple LLVM 8.1.0 (clang-802.0.42)]'

## Unarchive

In [2]:
import tempfile
import zipfile
import os.path

zipFile = "./openSubtitles-5000.json.zip"

print( "Unarchiving ...")
temp_dir = tempfile.mkdtemp()
zip_ref = zipfile.ZipFile(zipFile, 'r')
zip_ref.extractall(temp_dir)
zip_ref.close()

openSubtitlesFile = os.path.join(temp_dir, "openSubtitles-5000.json")
print ("file unarchived to:" + openSubtitlesFile)


Unarchiving ...
file unarchived to:/var/folders/9l/w4_vhqyn5rz64fh1x9zzcsvr0000gn/T/tmpq8qj8sxt/openSubtitles-5000.json


## Tokenizing and Filtering a Vocabulary

In [3]:

import json
from sklearn.feature_extraction.text import CountVectorizer
#from log_progress import log_progress

maxDocsToload = 2000

titles = []
def make_corpus(file):
    with open(file) as f:
        for i, line in enumerate(f):
            doc = json.loads(line)
            titles.append(doc.get('Title',''))
            if i % 100 == 0:
                print ("%d " % i, end='') 
            yield doc.get('Text','')
            if i == maxDocsToload:
                break
                
print ("Starting load ...")                
textGenerator = make_corpus(openSubtitlesFile)              
count_vectorizer = CountVectorizer(min_df=2, max_df=0.5, stop_words='english', analyzer="word", token_pattern="[a-zA-Z]{3,}")
term_freq_matrix = count_vectorizer.fit_transform(textGenerator)
print ("Done.")
print ("term_freq_matrix = \n%s" % term_freq_matrix)


Starting load ...
0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 Done.
term_freq_matrix = 
  (0, 1802)	1
  (0, 6086)	1
  (0, 27353)	1
  (0, 26072)	1
  (0, 34030)	1
  (0, 18759)	1
  (0, 33974)	1
  (0, 33756)	1
  (0, 28386)	1
  (0, 17081)	1
  (0, 33623)	1
  (0, 13828)	1
  (0, 4332)	1
  (0, 11248)	1
  (0, 3335)	1
  (0, 4561)	1
  (0, 40462)	1
  (0, 7422)	1
  (0, 41881)	1
  (0, 33561)	2
  (0, 29493)	1
  (0, 16605)	1
  (0, 17746)	1
  (0, 4581)	1
  (0, 22126)	1
  :	:
  (2000, 1822)	1
  (2000, 3290)	1
  (2000, 13298)	1
  (2000, 1523)	2
  (2000, 20442)	1
  (2000, 13719)	1
  (2000, 20474)	1
  (2000, 930)	2
  (2000, 24734)	1
  (2000, 11685)	2
  (2000, 4877)	1
  (2000, 17576)	5
  (2000, 13730)	1
  (2000, 14408)	1
  (2000, 15428)	1
  (2000, 29621)	1
  (2000, 4776)	5
  (2000, 20125)	1
  (2000, 7233)	3
  (2000, 35200)	1
  (2000, 41957)	1
  (2000, 22842)	1
  (2000, 32580)	2
  (2000, 4305)	1
  (2000, 6398)	7


## Feature Vocabulary

In [4]:
print( "Vocabulary length = ", len(count_vectorizer.vocabulary_))
word = "data";
rainingIndex = count_vectorizer.vocabulary_[word];
print( "token index for \"%s\" = %d" % (word,rainingIndex))
feature_names = count_vectorizer.get_feature_names()
print( "feature_names[%d] = %s" % (rainingIndex, feature_names[rainingIndex]))


Vocabulary length =  42141
token index for "data" = 9324
feature_names[9324] = data


In [5]:
for i in range(0,1000):
    print( "feature_names[%d] = %s" % (i, feature_names[i]))

feature_names[0] = aaa
feature_names[1] = aaaaaaaaaaaaaaah
feature_names[2] = aaaaaaaaaah
feature_names[3] = aaaaaaagh
feature_names[4] = aaaaaaah
feature_names[5] = aaaaaah
feature_names[6] = aaaaah
feature_names[7] = aaaah
feature_names[8] = aaaargh
feature_names[9] = aaagh
feature_names[10] = aaah
feature_names[11] = aaahh
feature_names[12] = aaargh
feature_names[13] = aadhar
feature_names[14] = aafrin
feature_names[15] = aagh
feature_names[16] = aah
feature_names[17] = aahh
feature_names[18] = aahhh
feature_names[19] = aak
feature_names[20] = aamir
feature_names[21] = aargh
feature_names[22] = aaron
feature_names[23] = aarp
feature_names[24] = aback
feature_names[25] = abacus
feature_names[26] = abaddon
feature_names[27] = abalone
feature_names[28] = abandon
feature_names[29] = abandoned
feature_names[30] = abandoning
feature_names[31] = abandonment
feature_names[32] = abandons
feature_names[33] = abate
feature_names[34] = abattoir
feature_names[35] = abba
feature_names[36] = abbad

feature_names[936] = allahu
feature_names[937] = allan
feature_names[938] = allay
feature_names[939] = allegation
feature_names[940] = allegations
feature_names[941] = alleged
feature_names[942] = allegedly
feature_names[943] = alleges
feature_names[944] = allegiance
feature_names[945] = allegiances
feature_names[946] = alleging
feature_names[947] = allegory
feature_names[948] = allegro
feature_names[949] = allen
feature_names[950] = allerdyce
feature_names[951] = allergen
feature_names[952] = allergic
feature_names[953] = allergies
feature_names[954] = allergy
feature_names[955] = alleviate
feature_names[956] = alleviating
feature_names[957] = alley
feature_names[958] = alleys
feature_names[959] = alleyway
feature_names[960] = alleyways
feature_names[961] = allez
feature_names[962] = alliance
feature_names[963] = alliances
feature_names[964] = allie
feature_names[965] = allied
feature_names[966] = allies
feature_names[967] = alligator
feature_names[968] = alligators
feature_names[969]

## TDIF Weighting
This applys the TFIDF weight to the matrix

tfidf value = word count / number of documents word is in

The document vectors are also normalized so they have a euclidian magnitude of 1.0.

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(norm="l2")
tfidf.fit(term_freq_matrix)

tf_idf_matrix = tfidf.transform(term_freq_matrix)
print( tf_idf_matrix)

  (0, 1413)	0.104081639293
  (0, 40730)	0.00446522116367
  (0, 40432)	0.00981866060544
  (0, 31882)	0.00591001483191
  (0, 1092)	0.00710680029374
  (0, 32032)	0.436620103368
  (0, 11990)	0.0765188333169
  (0, 41552)	0.00352597811216
  (0, 3216)	0.0213834585029
  (0, 6398)	0.00912543656725
  (0, 38345)	0.0100068071902
  (0, 23818)	0.0108731394864
  (0, 27990)	0.0131045702889
  (0, 4305)	0.0111252531832
  (0, 19767)	0.521567697972
  (0, 32580)	0.0330213392989
  (0, 36665)	0.00969018050357
  (0, 22842)	0.00927490334951
  (0, 41957)	0.00557309620917
  (0, 4916)	0.00428518454776
  (0, 13355)	0.00995832642766
  (0, 8417)	0.00340097124456
  (0, 38155)	0.00683485970033
  (0, 36068)	0.0101113130265
  (0, 4923)	0.00433710749129
  :	:
  (2000, 19946)	0.342110171287
  (2000, 32321)	0.0718038509358
  (2000, 6953)	0.0743473912331
  (2000, 32609)	0.0822483490482
  (2000, 15305)	0.068813246568
  (2000, 38758)	0.0379347359336
  (2000, 20119)	0.0379347359336
  (2000, 20344)	0.0855275428217
  (2000, 7050

## K-Means

In [7]:
%%time
from sklearn.cluster import KMeans
import numpy

num_clusters = 5
km = KMeans(n_clusters=num_clusters, verbose=True, init='k-means++', n_init=3, n_jobs=-1)
km.fit(tf_idf_matrix)

clusters = km.labels_.tolist()
print ("cluster id for each document = %s" % clusters)

print("Top terms per cluster:")
print()
# sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

        


Initialization complete
Initialization complete
Initialization complete
Iteration  0, inertia 3720.881
Iteration  0, inertia 3741.166
Iteration  0, inertia 3781.998
Iteration  1, inertia 1921.909
Iteration  1, inertia 1925.495
Iteration  1, inertia 1924.453
Iteration  2, inertia 1918.455
Iteration  2, inertia 1916.371
Iteration  2, inertia 1921.222
Iteration  3, inertia 1915.582
Iteration  3, inertia 1914.365
Iteration  3, inertia 1917.946
Iteration  4, inertia 1914.240
Iteration  4, inertia 1913.567
Iteration  4, inertia 1915.082
Iteration  5, inertia 1913.628
Iteration  5, inertia 1913.241
Iteration  5, inertia 1913.532
Iteration  6, inertia 1913.432
Iteration  6, inertia 1913.088
Iteration  6, inertia 1912.717
Iteration  7, inertia 1913.326
Iteration  7, inertia 1912.847
Iteration  7, inertia 1912.393
Iteration  8, inertia 1913.193
Iteration  8, inertia 1912.682
Iteration  8, inertia 1912.275
Iteration  9, inertia 1913.036
Iteration  9, inertia 1912.235
Iteration  9, inertia 1912.44

In [8]:
topNWords = 50
#clusterWords = numpy.zeros((topNWords,num_clusters))


import pandas as pd   
#from IPython.display import HTML, display

df = pd.DataFrame()

for i in range(num_clusters):
    clusterWords = []
    for topWordIndex,ind in enumerate(order_centroids[i, :topNWords]):   
        clusterWords.append( feature_names[ind] )
    df['Cluster %d' % i] = pd.Series(clusterWords)
        #dtype='object', data= [''] * topNWords)
        #print(topWordIndex)        
        #print(ind)
        #print(feature_names[ind])

df.style.set_properties(**{'text-align': 'right'})
df

Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4
0,grunting,cole,sighs,josh,fuck
1,peter,dahlia,chuckles,sighs,fucking
2,pablo,klaus,king,april,shit
3,fisk,rick,police,jane,gotta
4,grunts,ramse,brother,school,wanna
5,spanish,rebekah,mrs,laughs,danny
6,speaking,cassie,laughs,ooh,jesus
7,elena,snarling,shit,chuckles,fucked
8,malcolm,elijah,john,cool,ain
9,sighs,hayley,ben,wanna,sighs


In [9]:

titlesFrame = pd.DataFrame()
titlesFrame['Labels']=km.labels_
titlesFrame['Titles']=titles

sort = titlesFrame.sort_values(by=['Labels','Titles'])
for i in range(num_clusters):
    display( sort.query('Labels == %d' % i) )

Unnamed: 0,Labels,Titles
606,0,
1902,0,"""A.D. The Bible Continues"" The Body Is Gone (T..."
1903,0,"""A.D. The Bible Continues"" The Spirit Arrives ..."
1901,0,"""A.D. The Bible Continues"" The Tomb Is Open (T..."
1464,0,"""Agents of S.H.I.E.L.D."" S.O.S. Part 1 (TV Epi..."
1411,0,"""Agents of S.H.I.E.L.D."" S.O.S. Part 2 (TV Epi..."
568,0,"""Bitten"" Bad Blood (TV Episode 2015)"
574,0,"""Bitten"" Bad Dreams (TV Episode 2015)"
575,0,"""Bitten"" Dark Arts (TV Episode 2015)"
571,0,"""Bitten"" Dead Meat (TV Episode 2015)"


Unnamed: 0,Labels,Titles
1808,1,"""12 Monkeys"" Arms of Mine (TV Episode 2015)"
1540,1,"""12 Monkeys"" Atari (TV Episode 2015)"
1539,1,"""12 Monkeys"" Cassandra Complex (TV Episode 2015)"
1624,1,"""12 Monkeys"" Divine Move (TV Episode 2015)"
1537,1,"""12 Monkeys"" Mentally Divergent (TV Episode 2015)"
1807,1,"""12 Monkeys"" Paradox (TV Episode 2015)"
1538,1,"""12 Monkeys"" Pilot (TV Episode 2015)"
1806,1,"""12 Monkeys"" Shonin (TV Episode 2015)"
1543,1,"""12 Monkeys"" The Keys (TV Episode 2015)"
1541,1,"""12 Monkeys"" The Night Room (TV Episode 2015)"


Unnamed: 0,Labels,Titles
1569,2,"""2 Broke Girls"" And the Fun Factory (TV Episod..."
1908,2,"""A to Z"" J Is for Jan Vaughan (TV Episode 2015)"
1583,2,"""A to Z"" K Is for Keep Out (TV Episode 2015)"
1708,2,"""A to Z"" L Is for Likability (TV Episode 2015)"
1711,2,"""About a Boy"" About a Hook (TV Episode 2015)"
1495,2,"""About a Boy"" About a Manniversary (TV Episode..."
1479,2,"""Agent Carter"" A Sin to Err (TV Episode 2015)"
1475,2,"""Agent Carter"" Bridge and Tunnel (TV Episode 2..."
1060,2,"""Agent Carter"" Now Is Not the End (TV Episode ..."
1480,2,"""Agent Carter"" SNAFU (TV Episode 2015)"


Unnamed: 0,Labels,Titles
1709,3,"""A to Z"" M Is for Meant to Be (TV Episode 2015)"
1570,3,"""About a Boy"" About a Boy Becoming a Man (TV E..."
557,3,"""Austin & Ally"" Buzzcuts & Beginnings (TV Epis..."
1417,3,"""Austin & Ally"" Dancers & Ditzes (TV Episode 2..."
1420,3,"""Austin & Ally"" Grand Openings & Great Expecta..."
1416,3,"""Austin & Ally"" Homework & Hidden Talents (TV ..."
1419,3,"""Austin & Ally"" Karaoke & Kalamity (TV Episode..."
1418,3,"""Austin & Ally"" Wedding Bells & Wacky Birds (T..."
757,3,"""Awkward."" Prank Amateurs (TV Episode 2015)"
1520,3,"""Bad Judge"" The Fixer (TV Episode 2015)"


Unnamed: 0,Labels,Titles
1633,4,"""19-2"" Borders (TV Episode 2015)"
870,4,"""19-2"" Disorder (TV Episode 2015)"
1815,4,"""19-2"" Property Line (TV Episode 2015)"
1676,4,"""19-2"" Rock Garden (TV Episode 2015)"
950,4,"""19-2"" School (TV Episode 2015)"
1814,4,"""19-2"" Tables (TV Episode 2015)"
1381,4,"""19-2"" Tribes (TV Episode 2015)"
413,4,"""Ballers"" Pilot (TV Episode 2015)"
1904,4,"""Banana"" Episode #1.6 (TV Episode 2015)"
732,4,"""Banshee"" A Fixer of Sorts (TV Episode 2015)"


# The End ...