In [2]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

In [3]:
unique_immigration_df = pd.read_csv("./unique_immigration_104_107.csv")

In [106]:
bill_nums = unique_immigration_df["bill_title"] 
bill_titles = unique_immigration_df["question"]
content = unique_immigration_df["bill_long_text"]
years = unique_immigration_df["year"]

In [30]:
stopwords = nltk.corpus.stopwords.words('english')

In [31]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [32]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [33]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in content:
    
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [34]:
# keeping only the first mapping of a stemmed to a tokanized word
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
str(vocab_frame.shape[0])

'564919'

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(content) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 26.5 s, sys: 503 ms, total: 27 s
Wall time: 27.1 s
(114, 1409)


In [37]:
terms = tfidf_vectorizer.get_feature_names()

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [39]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 290 ms, sys: 6.1 ms, total: 296 ms
Wall time: 187 ms


In [41]:
from sklearn.externals import joblib

# joblib.dump(km, 'bill_cluster.pkl')

km = joblib.load('bill_cluster.pkl')
clusters = km.labels_.tolist()

In [108]:
bills = { 'number': bill_nums, 'content': content, 'cluster': clusters, 'year': years, 'title':bill_titles}

frame = pd.DataFrame(bills, index = [clusters] , columns = ['title', 'number', 'cluster', 'year'])
print (clusters)
# frame.head(125)

[4, 3, 0, 3, 3, 1, 2, 0, 3, 1, 1, 1, 3, 3, 2, 2, 0, 2, 1, 4, 4, 1, 3, 0, 0, 0, 1, 4, 3, 4, 2, 1, 3, 0, 3, 2, 4, 4, 4, 3, 1, 1, 3, 0, 1, 0, 4, 3, 0, 4, 0, 1, 0, 2, 0, 3, 0, 3, 3, 4, 3, 3, 2, 4, 0, 3, 1, 1, 2, 4, 2, 1, 0, 4, 1, 4, 2, 4, 1, 0, 1, 3, 2, 2, 1, 3, 2, 1, 2, 4, 0, 0, 4, 2, 2, 2, 1, 4, 0, 0, 4, 0, 3, 0, 3, 1, 4, 4, 4, 4, 2, 1, 0, 4]


In [113]:
frame.number.unique()

array(['s1956', 's1357', 's908', 's143', 's1664'], dtype=object)

In [120]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
title_cluster_list = []
clusters_words = []
cluster_titles = []
cluster_center_words = []

for i in range(num_clusters):
    
    tmp_cluster_center_words = []
    for ind in order_centroids[i, :300]:
        tmp_cluster_center_words.append(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'))
    cluster_center_words.append(tmp_cluster_center_words)
    
    clusterWord = []
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :30]:
        clusterWord.append(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'))
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    clusters_words.append(clusterWord)
    print()
    print()
    
    c_num = frame.ix[i]['number'].values.tolist()
    title_list = frame.ix[i]['title'].values.tolist()
    title_cluster_list.append(title_list)
    cluster_titles.append(c_num)
    print("Cluster %d title:" % i, end='')
    for num in frame.ix[i]['number'].values.tolist():
        print(' %s,' % num, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: alien, visa, amends, immigrant, nations, immigrant, amends, attorney, attorney, petition, directs, directs, nonimmigrant, deportation, border, consideration, illegal, illegal, forth, set, child, adopt, u.s., set, age, closed, respect, natural, sponsor, parents,

Cluster 0 title: s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908, s908,

Cluster 1 words: funding, department, federal, appropriations, administration, agencies, title, officer, make, secretary, supplemental, assistance, available, fy, prohibitions, emergency, supplemental, agriculture, health, related, loan, chapter, educational, sec, rescinds, management, defense, development, housing, commissions,

Cluster 1 title: s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143, s143,

Cluster 2 words: federal, court, funding, atto