In [64]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords 
from nltk.corpus import wordnet 
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) 
punctuation = "-!?;:\"\'.,"

def getPOS(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def getFeatures(text):
    features = {}
    tokens = nltk.word_tokenize(text.lower())
    tot_cnt = len(tokens)
    for word in tokens:        
        if word.isnumeric():
            label = '[NUMBER]'
            if label not in features:
                features[label] = 1/float(tot_cnt)    
            else:
                features[label] += 1/float(tot_cnt)    
        elif word not in stop_words and word not in punctuation:
            w = wordnet_lemmatizer.lemmatize(word, getPOS(word))
            if w not in features:
                features[w] = 1/float(tot_cnt)
            else:
                features[w] += 1/float(tot_cnt)
    return features


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
import numpy as np
import random, sys, copy
from collections import Counter

def initCentroids(vec_data, k):
    centroidID = np.random.permutation(len(vec_data))[:k]
    # print(centroidID)
    return [vec_data[cID] for cID in centroidID]

def calculateDistance(v_dict, u_dict):
    v = list(v_dict.values())
    u = list(u_dict.values())
    total = 0
    for i in range(0, len(v)):
        total += ((v[i] - u[i]) ** 2)
    return np.sqrt(total)


# takes a vector dictionary and a set of all the keys
# returns a new vector dictionary with all the keys
def reshapeVectDict(vect_dict, keys):
    result = copy.deepcopy(vect_dict)
    for key in keys:
        if key not in result:
            result[key] = 0.0
    return result

# For a given point, find the closest centroid from the list, return centroid's index
def closestCentroid(point, k, data):
    # Get all the keys (words) in the point and centroids
    # This will be used to transform them to the same dimension so we can do math
    all_keys = []
    all_keys.extend(point)        
    for cID in range(0,k):        
        all_keys.extend(data[cID].keys())
    all_keys = set(all_keys)

    closestCentroid = 0
    minDist = sys.maxsize
    point_reshape = reshapeVectDict(point, all_keys)

    # print()
    # print("point: ", point_reshape)
    for cID in range(0,k):
        centroid = data[cID]
        centroid_reshape = reshapeVectDict(centroid, all_keys)
        dist = calculateDistance(point_reshape, centroid_reshape)

        # print("dist: ", dist, "centroid: ", centroid)
        if (dist < minDist):
            minDist = dist          
            closestCentroid = cID

    # print("closestCentroid: ", closestCentroid, "minDist: ", minDist)
    return closestCentroid

def recalculateCentroid(cluster, data):
    new_centroid = {}
    for idx in cluster:
        point = data[idx]
        new_centroid = dict(Counter(new_centroid) + Counter(point))

    for key in new_centroid:
        new_centroid[key] /= len(cluster)
    # print("new_centroid: ", new_centroid)
    return new_centroid

def calculateConvergence(k, t, M):
    difference = 0
    for i in range(0, k):
        v = list(M[t][i].values())
        u = list(M[t-1][i].values())
        diff = (np.mean(u) - np.mean(v)) ** 2
        difference += diff    
    print(difference)
    return difference

def k_means(vec_data, k, e):
    M = []
    M.append([0 for i in range(0, k)])

    t = 0
    M[t] = initCentroids(vec_data, k)

    cluster = []
    condition = True
    while(condition):
        # print("iteration: ", t)
        t += 1
        M.append([0 for i in range(0, k)])    # M needs t rows
        C = []
        for i in range(0, k):
            C.append([])
        # Centroid assignment
        for point in vec_data:
            clusterID = closestCentroid(point, k, M[t-1])                
            C[clusterID].append(vec_data.index(point))    # keep track of the point's as indices
        
        # Centroid update
        for i in range(0, k):
            new_centroid = recalculateCentroid(C[i], vec_data)
            M[t][i] = new_centroid 

        cluster = C
        condition = calculateConvergence(k, t, M) > e
    
    return cluster

In [0]:
# Takes a list of vect-dicts that describe the cluster, and converts the data to a list of their cluster labels

def printClusters(clusters):
    result = []
    for IDX in range(0, len(clusters)):
        print("Cluster ", IDX, " contains:")
        samples = [vec_data[i] for i in clusters[IDX]]
        print(samples)
        result.append(samples)
    return result
    
def getClusterLabel(item, clusters):
    for cluster in clusters:
        if item in cluster:
            return clusters.index(cluster)

def summarizeClusters(clusters):
    summary = []
    for cluster in clusters:
        summary.append(len(cluster))
    return [cid for cid in range(0, len(clusters))], summary

In [0]:
import pandas as pd

FILE = "committee_utterances.tsv"
path = "drive/My Drive/Colab Notebooks/466-proj2/"

df = pd.read_csv(path + FILE, sep='\t')

In [0]:
#Select a random 25% (1/4) of the content
records = list(df.text)
number_selected = len(records) // 16  #TODO CHANGE THIS VALUE TO 4
selected_records = random.sample(records, number_selected) #get random sample of number_selected records without replacement

In [0]:
data = selected_records

# vectorize data
vec_data = [getFeatures(text) for text in data]

In [145]:
# k-means clustering
K = 5
E = .00000001 # threshold
clusters = k_means(vec_data, K, E)

0.11307669567610816
nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [91]:
cluster_labels, cluster_cnts = summarizeClusters(clusters)
print("Cluster IDs: ", cluster_labels)
print("# of items in each:")
for idx in range(len(cluster_cnts)):
    print(idx, "-", cluster_cnts[idx])

Cluster IDs:  [0, 1, 2, 3, 4]
# of items in each:
0 - 1196
1 - 6
2 - 53
3 - 670
4 - 3


#Evluation

In [0]:
# Vectorize features
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()
X_vec = vectorizer.fit_transform(map(getFeatures, data))

In [0]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, max_iter=600, algorithm = 'auto')
fitted = kmeans.fit(X_vec)


In [102]:
# set up contingency table
contingency_table = []
for t in range(0, len(clusters)):
    row = [0 for c in range(0, kmeans.n_clusters)]
    contingency_table.append(row)

inverse = vectorizer.inverse_transform(X_vec)

# add counts to contingency table
for item_idx in range(0, len(vec_data)):
    t_label = getClusterLabel(item_idx, clusters)
    c_label = fitted.labels_[inverse.index(vec_data[item_idx])]
    if t_label is not None and c_label is not None:   # i have an error where some cluster labels are NONE
        contingency_table[t_label][c_label] +=1


[[29, 0, 1159, 0, 0],
 [0, 0, 1, 0, 0],
 [2, 0, 50, 0, 0],
 [41, 2, 514, 4, 34],
 [0, 0, 3, 0, 0]]

In [108]:
from tabulate import tabulate

print(tabulate(contingency_table))

--  -  ----  -  --
29  0  1159  0   0
 0  0     1  0   0
 2  0    50  0   0
41  2   514  4  34
 0  0     3  0   0
--  -  ----  -  --


In [111]:
summarizeClusters(clusters)

([0, 1, 2, 3, 4], [1196, 6, 53, 670, 3])

In [121]:
result = printClusters(clusters)

Cluster  0  contains:
[{'course': 0.018518518518518517, 'disparity': 0.018518518518518517, 'non-arc': 0.018518518518518517, 'county': 0.018518518518518517, 'relative': 0.037037037037037035, 'non-relatives': 0.018518518518518517, 'even': 0.018518518518518517, 'extreme': 0.018518518518518517, 'make': 0.018518518518518517, 'little': 0.018518518518518517, 'sense': 0.018518518518518517, 'deny': 0.018518518518518517, 'specialized': 0.037037037037037035, 'funding': 0.018518518518518517, "'s": 0.018518518518518517, 'care': 0.018518518518518517, 'system': 0.018518518518518517, 'allows': 0.018518518518518517, 'family': 0.018518518518518517, 'go': 0.037037037037037035, 'heroic': 0.018518518518518517, 'length': 0.018518518518518517, "'re": 0.018518518518518517, 'ask': 0.018518518518518517}, {'wo': 0.013888888888888888, "n't": 0.013888888888888888, 'go': 0.013888888888888888, 'anywhere': 0.013888888888888888, 'near': 0.013888888888888888, '[NUMBER]': 0.013888888888888888, 'mile': 0.0138888888888888

In [126]:
pprint(result[0])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  'california': 0.013513513513513514,
  'care': 0.013513513513513514,
  'comment': 0.013513513513513514,
  'deny': 0.013513513513513514,
  'drug': 0.013513513513513514,
  'echo': 0.013513513513513514,
  'employer': 0.013513513513513514,
  'formulary': 0.013513513513513514,
  'get': 0.013513513513513514,
  'injured': 0.013513513513513514,
  'like': 0.013513513513513514,
  'look': 0.013513513513513514,
  'medical': 0.013513513513513514,
  'modify': 0.013513513513513514,
  'prescribe': 0.013513513513513514,
  'prescription': 0.013513513513513514,
  'previous': 0.013513513513513514,
  'prompt': 0.013513513513513514,
  'proscription': 0.013513513513513514,
  'provider': 0.013513513513513514,
  'review': 0.013513513513513514,
  'speaker': 0.013513513513513514,
  'stewart': 0.013513513513513514,
  'subject': 0.013513513513513514,
  'thompson': 0.013513513513513514,
  'treatment': 0.013513513513513514,
  'utilization': 0.01351351

In [0]:
from collections import Counter
cluster_0 = {}
for text_id in clusters[0]:
    cluster_0 = Counter(vec_data[text_id]) + Counter(cluster_0)

In [135]:
sorted(cluster_0.items(), key=lambda x: x[1], reverse=True)

[("'s", 11.005048820313998),
 ('[NUMBER]', 8.95062359193865),
 ('bill', 5.393565674430774),
 ('state', 4.47675245250124),
 ("'re", 4.325961208543084),
 ('thank', 4.267849432606562),
 ('would', 3.787969806647757),
 ('california', 3.6932107954172992),
 ('think', 3.586942170288466),
 ('go', 3.553910782119976),
 ('member', 3.4604959252642598),
 ('need', 3.070762060991401),
 ('year', 3.016343187875341),
 ("n't", 2.9388468900911393),
 ('work', 2.832340663361117),
 ('support', 2.789712767795095),
 ('aye', 2.693065802793715),
 ("'m", 2.686193053618665),
 ('make', 2.67371656083444),
 ('one', 2.618630056304353),
 ('get', 2.605181033303454),
 ('mr.', 2.5747603991022587),
 ('also', 2.4241915910670895),
 ('program', 2.4121505424176903),
 ('item', 2.269131944863456),
 ('know', 2.132412045228046),
 ("'ve", 2.066422758200474),
 ('time', 2.054281633519878),
 ('want', 1.9613615974740393),
 ('committee', 1.9580236408479654),
 ('see', 1.9207376943964847),
 ('people', 1.903053107868201),
 ('like', 1.880531

In [0]:

cluster_2 = {}
for text_id in clusters[2]:
    cluster_2 = Counter(vec_data[text_id]) + Counter(cluster_2)

In [140]:
sorted(cluster_2.items(), key=lambda x: x[1], reverse=True)

[('aye', 1.6823070716091308),
 ("'s", 0.44429452948230547),
 ('[NUMBER]', 0.44036235361762405),
 ("'re", 0.25439135261093215),
 ('think', 0.21876792200103828),
 ('state', 0.2015952838809699),
 ('(', 0.18181818181818182),
 (')', 0.18181818181818182),
 ('would', 0.1743150789998466),
 ('know', 0.16230860547539203),
 ('thank', 0.16072288564548626),
 ('get', 0.15295997066272446),
 ('thing', 0.15132247557652267),
 ('need', 0.13910338382844778),
 ('issue', 0.13495255393283415),
 ('mitchell', 0.13245088245088243),
 ('way', 0.12472785665990535),
 ('mr.', 0.1111111111111111),
 ('people', 0.11030591215273018),
 ('say', 0.10459388157778833),
 ('take', 0.10382395382395382),
 ('bill', 0.10221445869473891),
 ('deal', 0.10021713087750822),
 ('question', 0.09973604826546002),
 ('member', 0.09947954662977095),
 ('monning', 0.09845559845559845),
 ('right', 0.08940077082125569),
 ('hope', 0.0871473354231975),
 ('cannella', 0.08636977058029689),
 ('yeah', 0.0856359176503849),
 ("n't", 0.08534144487449685),

In [0]:

cluster_3 = {}
for text_id in clusters[3]:
    cluster_3 = Counter(vec_data[text_id]) + Counter(cluster_3)

In [142]:
sorted(cluster_3.items(), key=lambda x: x[1], reverse=True)

[('thank', 21.66980550724199),
 ('aye', 11.022337786171459),
 ('okay', 7.109552947052946),
 ('[NUMBER]', 7.013941641430823),
 ('mr.', 6.202469225820618),
 ('support', 5.766489513355606),
 ('bill', 5.49267126418288),
 ("'s", 5.108454769930103),
 ('witness', 4.123967517016256),
 ('senator', 3.815048401432196),
 ('yes', 3.784403393541325),
 ('file', 3.4670347768173855),
 ('much', 3.3769965305585656),
 ('next', 3.1076330532212886),
 ('member', 2.804680698222654),
 ('please', 2.734920634920635),
 ('california', 2.643613384746949),
 ('question', 2.458521567095421),
 ('item', 2.119138539640963),
 ('move', 2.0825275736257747),
 ("'re", 2.0455138468179204),
 ("'ll", 1.9142640925245227),
 ('assembly', 1.842416733593204),
 ('opposition', 1.8361518156924892),
 ('go', 1.7354801806756364),
 ('comment', 1.7251946543567436),
 ('committee', 1.6989282429502186),
 ('ab', 1.6961449640158082),
 ('would', 1.6819919494684057),
 ('yeah', 1.6757323232323231),
 ('one', 1.548398923167231),
 ('vote', 1.4244849527

In [0]:

cluster_4 = {}
for text_id in clusters[4]:
    cluster_4 = Counter(vec_data[text_id]) + Counter(cluster_4)

In [144]:
sorted(cluster_4.items(), key=lambda x: x[1], reverse=True)

[('one', 0.038461538461538464),
 ('way', 0.038461538461538464),
 ('first', 0.038461538461538464),
 ('meeting', 0.038461538461538464),
 ('grower', 0.038461538461538464),
 ('look', 0.038461538461538464),
 ('stress', 0.038461538461538464),
 ('real', 0.038461538461538464),
 ('collaborative', 0.038461538461538464),
 ('process', 0.038461538461538464),
 ('reaction', 0.038461538461538464),
 ('school', 0.038461538461538464),
 ('side', 0.038461538461538464),
 ('serve', 0.037037037037037035),
 ("'re", 0.037037037037037035),
 ('consider', 0.037037037037037035),
 ('technically', 0.037037037037037035),
 ('foster', 0.037037037037037035),
 ('youth', 0.037037037037037035),
 ("'s", 0.037037037037037035)]