In [1]:
import numpy as np
import pandas as pd
import math

from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

%config IPCompleter.greedy=True
%matplotlib inline
np.set_printoptions(precision=5, suppress=True)

In [2]:
location = '/Users/marrowgari/tensorflow/pron.txt'
data = pd.read_csv(location, names=('Words', 'Phones'))
words = data['Words']
phones = data['Phones']

print("Data shape --->", data.shape)
print("Words shape --->", words.shape)
print("Phones shape --->", phones.shape)
print('----------')
print("Data Sample:","\n", data[:6])

Data shape ---> (63708, 2)
Words shape ---> (63708,)
Phones shape ---> (63708,)
----------
Data Sample: 
    Words                    Phones
0     2.                      T UW
1     3D              TH R IY D IY
2     3M              TH R IY EH M
3   401K     F OW R OW W AH N K EY
4  401KS   F OW R OW W AH N K EY Z
5  90210  N AY N OW T UW W AH N OW


In [29]:
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))
X = vectorizer.fit_transform(phones)

In [30]:
X.shape

(63708, 7597)

In [36]:
true_k = 100
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=5000, n_init=1)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=5000,
    n_clusters=100, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [37]:
from sklearn.pipeline import Pipeline

# combine vectorizer and cluster model into one object
phone_cluster_model = Pipeline(steps=[('encode', vectorizer), ('model', model)])

In [59]:
lookup = {}
alist = lookup.get('cow', [])
alist.append('moo')
lookup['cow'] = alist

In [48]:
times_two = [a * 2 for a in range(10)]
print(list(times_two))

times_two_dict = {'word' + str(a): a*2 for a in range(10)}
print(times_two_dict)
print(times_two_dict['word2'])

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
{'word0': 0, 'word1': 2, 'word2': 4, 'word3': 6, 'word4': 8, 'word5': 10, 'word6': 12, 'word7': 14, 'word8': 16, 'word9': 18}
4


In [None]:
word_idx_lookup = {}
for w, idx in enumerate(words):
    word_idx_lookup[w] = idx

In [38]:
word_idx_lookup = {w: idx for idx, w in enumerate(words)}

def cluster_for_word(word):
    """
    For a given word, lookup phones and pass to clustering model, return cluster
    """
    word_idx = word_idx_lookup[word]
    phones_for_word = phones[word_idx]
    cluster = phone_cluster_model.predict([phones_for_word])[0]
    return cluster

In [39]:
def group_words_by_cluster(words):
    """
    Create groups of words by cluster (from phones)
    """
    words_by_cluster = {}
    for word in words:
        cluster = cluster_for_word(word)
        word_set = words_by_cluster.get(cluster, set())  # get the set of words, or empty set if not exists
        word_set.add(word)

        words_by_cluster[cluster] = word_set # save the modified set back into the grouping dict
    return words_by_cluster

words_by_cluster = group_words_by_cluster(words)

In [41]:
# for each cluster, print the cluster and 10 words in that cluster in no particular order
for cluster, cluster_words in words_by_cluster.items():
    cluster_words = list(cluster_words)
    words_in_cluster = len(cluster_words)
    print("Cluster[{} len{}]:{}".format(cluster, words_in_cluster, cluster_words[:10]))

Cluster[57 len3376]:['COURTROOM', 'FROWN', 'PEERS', 'BEARER', 'BROWSED', 'COUPS', 'LORD', 'BREACH', "HAIR'S", 'FRUTZ']
Cluster[77 len616]:["PEVIE'S", 'DILLYDALLY', 'NEARY', 'TEEPEE', 'LINGUINE', 'WEIRDLY', 'PEEZIE', 'MEDIATOR', 'GEDS', "GRISTITI'S"]
Cluster[13 len816]:['ACCIDENTLY', "DENNY'S", 'STEADY', 'STEPHIE', 'VEGGIES', 'MIDDLEBURY', 'TRENEY', 'CLEANLIEST', 'CORONARYS', "PENNY'S"]
Cluster[55 len1848]:['COCO', 'ROAR', 'BOLD', 'LOE', "SMOKERS'", 'SCHROEDER', 'STROLLED', 'DSOS', 'SAILBOATS', 'MOLE']
Cluster[58 len172]:['BELAIR', 'WEBPAGE', 'ENRAGE', 'ALLENDE', 'HEADWAYS', 'BELGRADE', 'STRANGENESS', 'LASEYETTE', 'WEBPAGES', 'X-RAY']
Cluster[70 len1802]:["LANDSCAPER'S", 'MAILERS', "CRANE'S", 'TRAINERS', 'OCTANE', 'UYAY', 'ACED', 'TEENAGEHOOD', 'GAGE', 'CLAIMED']
Cluster[83 len422]:['MICHIGANIAN', 'ECHOLALIA', 'IGLESIAS', "BABIES'", "CHANEY'S", 'BABYISH', 'PENNSYLVANIAN', "LADIES'", 'MANIACS', 'LADYBIRD']
Cluster[26 len91]:['DECONTAMINATE', 'UPGRADED', 'PROMULGATE', 'SOMEDAYS', 'MAKE-UP

In [24]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

Top terms per cluster:
Cluster 0:
 ih
 oh
 oh ih
 ih oh
 hh oh
 hh
 ch
 ih jh
 ix
 sh
Cluster 1:
 ax
 eh
 eh ax
 ae
 hh
 hh eh
 iy
 jh
 axr
 uw
Cluster 2:
 ae
 iy
 ae iy
 hh
 hh ae
 eh
 ow
 ax
 axr
 iy ow
Cluster 3:
 oh
 uw
 ao
 ey
 eh
 sh
 eh oh
 ch
 uw oh
 ix
Cluster 4:
 ix
 ih
 ih ix
 iy
 ix ih
 axr
 ix iy
 ix ix
 ey
 ix ey
Cluster 5:
 uw
 eh
 ow
 eh uw
 th
 iy
 uw th
 iy uw
 uw ow
 ae
Cluster 6:
 ow
 ah
 ah ow
 th
 aw
 ch
 ay
 ao
 ow ay
 ow ao
Cluster 7:
 ax
 iy
 iy ax
 ax iy
 ax ax
 ae
 ae ax
 ah
 ei
 jh
Cluster 8:
 ax
 uh
 sh
 ao
 sh ax
 uh ax
 er
 ax ao
 ax er
 ao sh
Cluster 9:
 aw
 ey
 sh
 ey sh
 aw ey
 sh ax
 ax
 ch er
 ch ow
 dh ao
Cluster 10:
 ix
 aa
 ix nx
 nx
 aa ix
 ax
 sh
 ix aa
 sh ix
 hh aa
Cluster 11:
 ax
 iy
 iy ax
 ay
 ei iy
 ei
 oh
 ax ay
 axr
 th
Cluster 12:
 eh
 ch eh
 ch
 axr
 eh axr
 ih
 ao
 ae
 ey ch
 ah
Cluster 13:
 iy
 aw
 ao
 iy ao
 aw iy
 ei
 ah
 er
 iy ah
 th
Cluster 14:
 ix
 ay
 ix ay
 eh
 eh ix
 ay ix
 ah
 axr
 ah ix
 iy
Cluster 15:
 ey
 ay
 ih ey
 ih
 

 ow
 sh
 oh
 sh ax
 ow ax
 eh jh
 jh
Cluster 153:
 ax
 dx
 ix
 dx ix
 eh
 eh dx
 ax eh
 ix ax
 iy
 jh
Cluster 154:
 ix
 ax
 er
 er ix
 ix ax
 ax er
 ix er
 nx
 iy
 ix nx
Cluster 155:
 ix
 ax
 ey dx
 er ax
 ax ey
 dx
 er
 dx ix
 ey
 ix nx
Cluster 156:
 eh
 ix
 eh ix
 ow
 jh
 axr
 ix eh
 jh eh
 ix ow
 eh eh
Cluster 157:
 ax
 ix
 eh
 eh ax
 ax ix
 nx
 ix nx
 iy
 ae
 ix eh
Cluster 158:
 ax
 sh
 uw sh
 sh ax
 uw
 ix
 ix uw
 oh ix
 oh
 ih
Cluster 159:
 ix
 uh
 uh ix
 ix uh
 ax
 ix ax
 sh
 uh ax
 ix sh
 dx
Cluster 160:
 ow
 ae
 ae ow
 ax
 sh
 ax ae
 sh ae
 ow ax
 ay
 ow ay
Cluster 161:
 axr
 dh axr
 dh
 eh
 eh dh
 aa
 aa dh
 ae
 ao
 ae dh
Cluster 162:
 ix
 dx
 dx ix
 ix dx
 eh
 eh ix
 oh
 oh ix
 nx
 ix nx
Cluster 163:
 ey
 ix
 jh
 ey jh
 jh ix
 ix nx
 nx
 ch
 ch ey
 eh
Cluster 164:
 ae
 axr
 ch
 ch axr
 ae ch
 ax
 uw
 axr ax
 ae uw
 ix
Cluster 165:
 ih
 ih ih
 ax
 iy
 ih iy
 ih ax
 ax ih
 ey
 ih ey
 ae
Cluster 166:
 ax
 ae
 ax ae
 ae ax
 iy
 ae ch
 ax iy
 ch
 axr
 eh
Cluster 167:
 ae
 nx
 ae 

 ao iy
 ah
 ih ih
 ae ih
Cluster 289:
 ix
 aw
 aw ix
 ix nx
 nx
 ax
 ax aw
 iy aw
 hh
 iy
Cluster 290:
 ey ay
 ay
 ey
 hh ey
 hh
 axr
 ix ey
 ix
 jh
 er
Cluster 291:
 ix
 uw
 uw ix
 ix uw
 ix ix
 iy
 eh
 ix eh
 ix iy
 ow
Cluster 292:
 eh
 ae
 ae eh
 axr
 eh axr
 eh ae
 hh
 er
 hh ae
 ey
Cluster 293:
 ix
 ae
 ix ae
 iy
 ae iy
 ow
 er ix
 axr ix
 axr
 iy ow
Cluster 294:
 ow
 iy ow
 iy
 ch iy
 ch
 ei iy
 ei
 jh iy
 jh
 sh
Cluster 295:
 ax
 ay
 ax ay
 ay ax
 ae
 ax ax
 ix
 ae ax
 ax ix
 iy
Cluster 296:
 ix
 dx ix
 ay
 ix ay
 dx
 nx
 ix nx
 ay dx
 ay ix
 ae dx
Cluster 297:
 ax
 sh
 er
 er sh
 sh ax
 ax er
 ix
 ih
 ix er
 ow er
Cluster 298:
 ix
 dx
 axr
 ey dx
 ix ey
 dx axr
 ey
 ih
 ih ix
 eh
Cluster 299:
 ax
 ix
 oh
 ax ax
 oh ax
 ix oh
 jh
 oh ix
 jh ax
 ix jh
Cluster 300:
 dx
 aa
 aa dx
 ix
 dx ix
 ax
 nx
 ix nx
 ix ax
 dx ax
Cluster 301:
 ix
 ix eh
 eh
 oh
 aa
 aa ix
 ax ix
 ax
 eh oh
 oh ix
Cluster 302:
 ax
 ax ax
 oh
 oh ax
 ax oh
 iy
 ao
 ay
 axr
 ax ay
Cluster 303:
 eh
 ix
 ax
 ix e

 ow
Cluster 470:
 ix
 oh
 ix oh
 oh ix
 ax
 sh
 ix ix
 dx
 ix dx
 ix ax
Cluster 471:
 ax
 ax ay
 uw
 uw ax
 ay
 ih
 eh
 hh uw
 hh
 ch uw
Cluster 472:
 jh
 iy
 jh iy
 ax
 iy jh
 iy ax
 ao
 ae
 ax jh
 ah jh
Cluster 473:
 dx axr
 axr
 dx
 uw
 uw dx
 ax
 ax uw
 ix
 sh
 oh
Cluster 474:
 iy
 ow
 aa
 aa iy
 iy ow
 ow aa
 ax
 iy ax
 ae
 iy iy
Cluster 475:
 ax
 ih
 ih ax
 ax ih
 eh
 ix
 ax eh
 ae
 ix ax
 eh ix
Cluster 476:
 ah
 ix
 ah ix
 jh
 jh ah
 ix jh
 hh
 hh ah
 ey
 iy
Cluster 477:
 ax
 sh
 ow
 ow sh
 sh ax
 ix
 ax ow
 ix ow
 iy
 ay
Cluster 478:
 ix
 ae
 ae ch
 ch ix
 ch
 ix nx
 nx
 ax
 ix ix
 ax ae
Cluster 479:
 ax
 ix
 oh
 dx
 oh ix
 dx ax
 ix dx
 ax oh
 iy
 hh
Cluster 480:
 ax
 ey
 ax ax
 ey ax
 ix ey
 ix
 ah
 ay ow
 ow
 ah ey
Cluster 481:
 ax
 ax ow
 ix
 ow
 ow ix
 nx
 ix nx
 iy
 ay ax
 iy ax
Cluster 482:
 ih
 iy
 ix
 ix dx
 dx
 dx iy
 ih ix
 ae
 ae ih
 eh
Cluster 483:
 ax
 uh ey
 sh ax
 jh uh
 uh
 ey sh
 sh
 jh
 ey
 eh
Cluster 484:
 ae
 oh ae
 oh
 ix
 ae ix
 ix oh
 ae nx
 ey oh
 ih
 e

In [14]:
Y = vectorizer.transform(['sh'])
prediction = model.predict(Y)
print(prediction)

[4]
