In [117]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [118]:
bill_data = pd.read_csv("./word_Foreign_congress_104_108.csv")

In [119]:
bill_titles =  bill_data["bill_title"] #first 10 titles
content = bill_data["bill_long_text"]
years = bill_data["year"]

In [120]:
stopwords = nltk.corpus.stopwords.words('english')

In [121]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [122]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [123]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in content:
    
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [125]:
# keeping only the first mapping of a stemmed to a tokanized word
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
str(vocab_frame.shape[0])

'397590'

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(content) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 17.6 s, sys: 390 ms, total: 18 s
Wall time: 17.8 s
(96, 4392)


In [127]:
terms = tfidf_vectorizer.get_feature_names()

In [128]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [129]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 472 ms, sys: 7.03 ms, total: 479 ms
Wall time: 278 ms


In [131]:
from sklearn.externals import joblib

# joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [132]:
bills = { 'title': bill_titles, 'content': content, 'cluster': clusters, 'year': years}

frame = pd.DataFrame(bills, index = [clusters] , columns = ['title', 'cluster', 'year'])

In [133]:
frame['cluster'].value_counts()

1    45
4    22
0    16
3    11
2     2
Name: cluster, dtype: int64

In [134]:
grouped = frame['year'].groupby(frame['cluster']) #groupby cluster for aggregation purposes

grouped.mean()

cluster
0    1995
1    1995
2    1996
3    1996
4    1996
Name: year, dtype: int64

In [135]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: consideration, rule, foreign, open, waives, conferences,

Cluster 0 titles: s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908,

Cluster 1 words: sec, assistance, funding, countries, any, international,

Cluster 1 titles: s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908,

Cluster 2 words: gross, trade, qualified, transactions, clause, property,

Cluster 2 titles: hr3540, hr3540,

Cluster 3 words: sec, assistance, countries, nato, funding, development,

Cluster 3 titles: hr3540, hr3540, hr3540, hr3540, hr3540, hr3540, hr3540, hr3540, hr3540, hr3540, hr3540,

Cluster 4 words: sec, u.s., secretary, international, requirement, nations,

Cluster 4 titles: hr3540, hr3540, hr3540, hr3540, hr35