In [1]:
import json
import urllib2
import numpy as np
import cStringIO as StringIO
import codecs
import nltk
import re

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


In [7]:
# load data
source_url = r'https://data.seattle.gov/api/views/mags-97de/rows.json?accessType=DOWNLOAD'

# close the HTTP response explicitly, even if reading it fails part-way
response = urllib2.urlopen(source_url)
try:
    source = response.read()
finally:
    response.close()
full_data = json.loads(source)

# split into metadata (column name descriptions, etc) and actual data
meta = full_data['meta']
data = full_data['data']

# column names available (lower-cased)
names = [m['name'].lower() for m in meta['view']['columns']]

# lower-cased column name -> position of that column within a data row;
# if a name occurs more than once, the last occurrence wins
name_to_idx_dict = {name: idx for idx, name in enumerate(names)}

# indices of column names we are interested in
# (a KeyError here means the expected column is absent from the metadata)
description_idx = name_to_idx_dict['description']
category_idx    = name_to_idx_dict['category']
id_idx          = name_to_idx_dict['id']
lon_idx         = name_to_idx_dict['longitude']
lat_idx         = name_to_idx_dict['latitude']
value_idx       = name_to_idx_dict['value']

In [8]:
# map building permit categories to numeric id's
def category_to_id(category):
    """Return the numeric id used for a building permit category.

    'SINGLE FAMILY / DUPLEX' maps to 1, 'MULTIFAMILY' maps to 2 and every
    other value maps to 0.
    """
    known_categories = (
        ('SINGLE FAMILY / DUPLEX', 1),
        ('MULTIFAMILY', 2),
    )
    for label, numeric_id in known_categories:
        if category == label:
            return numeric_id
    return 0

In [29]:
# gather, for every row that is not None, the permit id, description text,
# category, fabricated category id, longitude, latitude and value into
# parallel lists (position k in each list refers to the same row)
rows = [record for record in data if record is not None]

id_list          = [row[id_idx] for row in rows]
description_list = [row[description_idx] for row in rows]
category_list    = [row[category_idx] for row in rows]
category_id_list = [category_to_id(category) for category in category_list]
lon_list         = [row[lon_idx] for row in rows]
lat_list         = [row[lat_idx] for row in rows]
value_list       = [row[value_idx] for row in rows]

In [31]:
# regular-expression-based text cleaners
# XXX add more

# get rid of numerals in the terms:
# match() anchors at the start of the term, \D* skips any leading non-digits
# and \d then requires a digit, so the pattern matches exactly those terms
# that contain at least one digit
number_pat = re.compile(r'\D*\d')
def not_match_number(x):
    """Return True if the term x contains no digit, False otherwise.

    Intended as a filter() predicate that keeps only digit-free terms.
    """
    return number_pat.match(x) is None

In [32]:
# the list of clean description texts
text_list = []

# positions (into description_list) of the descriptions that were cleaned
indices = []
# (position, error repr) pairs for the descriptions that could not be cleaned
skipped = []
stemmer = SnowballStemmer("english", ignore_stopwords=True)

# The per-row lists are overwritten with their filtered versions at the end
# of this cell.  Running the cell a second time would therefore index
# already-filtered lists and misalign rows, so refuse to do that.
row_lists = (id_list, category_id_list, lon_list, lat_list, value_list)
if any(len(lst) != len(description_list) for lst in row_lists):
    raise RuntimeError('per-row lists no longer match description_list; '
                       're-run the collection cell before this one')

# XXX investigate exceptions (the failing rows are recorded in `skipped`)
for i, d in enumerate(description_list):
    try:
        words = d.lower().split(' ')
        update = filter(not_match_number, words)
        text = ' '.join([stemmer.stem(word) for word in update])
    except Exception as exc:
        # best effort: drop this row (e.g. a description that is not a string
        # has no .lower()), but keep a record of it and do not swallow
        # KeyboardInterrupt / SystemExit the way a bare except would
        skipped.append((i, repr(exc)))
        continue
    text_list.append(text)
    indices.append(i)

# keep only the rows whose description was cleaned, so that every list stays
# aligned with text_list
identity_list    = list(np.array(id_list)[indices])
category_id_list = list(np.array(category_id_list, dtype=np.int32)[indices])
lon_list         = list(np.array(lon_list)[indices])
lat_list         = list(np.array(lat_list)[indices])
value_list       = list(np.array(value_list)[indices])

In [33]:
# construct the TF-IDF matrix: fit the vectorizer on the cleaned description
# texts and transform them in one step, so X has one row per entry of text_list
# (min_df=1 means a term only has to occur in a single document to be kept)
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(text_list)

# vocabulary: the terms learned by the vectorizer (its feature names)
term_list = vectorizer.get_feature_names()

In [35]:
# save TF-IDF matrix as text: one line per row of X, holding that row's
# category id followed by index:value pairs for its non-zero entries

buf = StringIO.StringIO()

for row, x in enumerate(X):
    # densify the single row to find the (zero-based) non-zero column indices
    dense_x = x.toarray().flatten()
    idx = np.where(dense_x != 0)[0]
    val = dense_x[idx]
    # trailing commas keep all pairs of a row on the same output line
    print >> buf, '%d' % category_id_list[row],
    for i, v in zip(idx, val):
        print >> buf, ' %d:%f' % (i, v),
    # terminate the line for this row
    print >> buf

contents = buf.getvalue()
# the with-block closes the file even if the write fails
with open('seattle_tfidf.txt', 'w') as f:
    f.write(contents)

In [36]:
# save vocabulary terms, one per line, UTF-8 encoded

term_list = vectorizer.get_feature_names()
# the with-block closes the file even if a write fails part-way through
with codecs.open('seattle_terms.txt', 'w', 'utf-8') as f:
    for term in term_list:
        print >> f, '%s' % term

In [37]:
# save building permit id's, one per line, UTF-8 encoded

# the with-block closes the file even if a write fails part-way through
with codecs.open('seattle_identities.txt', 'w', 'utf-8') as f:
    for identity in identity_list:
        print >> f, '%s' % identity

In [38]:
# save building locations as "<longitude> <latitude>", one pair per line

# the with-block closes the file even if a write fails part-way through
with codecs.open('seattle_locations.txt', 'w', 'utf-8') as f:
    for lon, lat in zip(lon_list, lat_list):
        print >> f, '%s %s' % (lon, lat)

In [40]:
# save values, one per line, UTF-8 encoded

# the with-block closes the file even if a write fails part-way through
with codecs.open('seattle_values.txt', 'w', 'utf-8') as f:
    for val in value_list:
        print >> f, '%s' % val

In [41]:
# save a matrix in sparse CSR form
def save_sparse_csr(filename, array):
    """Store the CSR components of `array` in a NumPy .npz archive.

    The archive written to `filename` holds four entries named 'data',
    'indices', 'indptr' and 'shape', taken from the attributes of the same
    name on `array`.
    """
    component_names = ('data', 'indices', 'indptr', 'shape')
    components = dict((key, getattr(array, key)) for key in component_names)
    np.savez(filename, **components)

In [42]:
# write the CSR components of the TF-IDF matrix X to an .npz archive
save_sparse_csr(filename='seattle_tfidf_csr_matrix.npz', array=X)