In [2]:
#Credit: https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from sklearn.cross_validation import train_test_split
from collections import defaultdict
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text 
import re
import findspark
findspark.init()
import pyspark
from pyspark.sql import SQLContext
import nltk
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import ward, dendrogram
stopwords=text.ENGLISH_STOP_WORDS

DATAFILEPATH='/Users/Dilip_MBP/Documents/CS109/PROJECT/data/'

[nltk_data] Downloading package punkt to /Users/Dilip_MBP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Read and store csv data in to Master index

bike_masterdf = pd.read_csv(DATAFILEPATH+'bike_details.csv',encoding='utf-8')
print "Number of samples in the database:", bike_masterdf.shape[0]
bike_masterdf.head(2)

Number of samples in the database: 59619


Unnamed: 0.1,Unnamed: 0,id,title,serial,manufacturer_name,frame_model,year,thumb,large_img,is_stock_img,stolen,stolen_location,date_stolen,registration_created_at,registration_updated_at,url,api_url,manufacturer_id,paint_description,name,frame_size,description,rear_tire_narrow,front_tire_narrow,type_of_cycle,test_bike,rear_wheel_size_iso_bsd,front_wheel_size_iso_bsd,handlebar_type_slug,frame_material_slug,front_gear_type_slug,rear_gear_type_slug,stolen_id,s_date_stolen,s_location,latitude,longitude,theft_description,locking_description,lock_defeat_description,police_report_number,police_report_department,s_rec_created_at,create_open311,sder_formatted_address,sder_street_number,sder_route,sder_postal_code,sder_neighborhood,sder_city,sder_county,sder_state,sder_country,date_stolen_epoch,registration_created_at_epoch,registration_updated_at_epoch,s_rec_created_at_epoch
0,0,50088,Schwinn Gateway,absent,Schwinn,Gateway,,,,False,False,,1970-01-01 00:00:00,2015-07-06 22:27:01,2015-11-03 05:36:54,https://bikeindex.org/bikes/50088,https://bikeindex.org/api/v1/bikes/50088,117,,,,,True,,Bike,False,,,,,,,,,,,,,,,,,1970-01-01 00:00:00,,,,,,,,,,,0,1436221621,1446529014,0
1,1,60140,2016 Specialized Diverge Elite DSW,WSBC601010269K,Specialized,Diverge Elite DSW,2016.0,,,False,False,,1970-01-01 00:00:00,2015-10-08 19:18:50,2015-11-03 05:34:30,https://bikeindex.org/bikes/60140,https://bikeindex.org/api/v1/bikes/60140,307,,,52cm,,True,,Bike,False,,,,aluminum,,,,,,,,,,,,,1970-01-01 00:00:00,,,,,,,,,,,0,1444331930,1446528870,0


In [4]:
# Remove nulls in theft description - only stolen bikes have theft description -> store in filtered dataframe
num_stolen = len(bike_masterdf.stolen[bike_masterdf.stolen == True])
print "Number of bikes reported as stolen: ", num_stolen
print "Proportion of bikes reported as stolen:  %.2f" % (float(num_stolen)/float(bike_masterdf.shape[0]))
desc_mask = bike_masterdf.theft_description.isnull()
bike_subdf = bike_masterdf[~desc_mask]
print "Number of Stolen bikes with theft description:", bike_subdf.shape[0]

bike_subdf.to_csv(DATAFILEPATH+'stolen_bikes.csv',encoding='utf-8')
bike_subdf.head(2)

Number of bikes reported as stolen:  33182
Proportion of bikes reported as stolen:  0.56
Number of Stolen bikes with theft description: 29838


Unnamed: 0.1,Unnamed: 0,id,title,serial,manufacturer_name,frame_model,year,thumb,large_img,is_stock_img,stolen,stolen_location,date_stolen,registration_created_at,registration_updated_at,url,api_url,manufacturer_id,paint_description,name,frame_size,description,rear_tire_narrow,front_tire_narrow,type_of_cycle,test_bike,rear_wheel_size_iso_bsd,front_wheel_size_iso_bsd,handlebar_type_slug,frame_material_slug,front_gear_type_slug,rear_gear_type_slug,stolen_id,s_date_stolen,s_location,latitude,longitude,theft_description,locking_description,lock_defeat_description,police_report_number,police_report_department,s_rec_created_at,create_open311,sder_formatted_address,sder_street_number,sder_route,sder_postal_code,sder_neighborhood,sder_city,sder_county,sder_state,sder_country,date_stolen_epoch,registration_created_at_epoch,registration_updated_at_epoch,s_rec_created_at_epoch
3,3,11542,2013 Trek 4300,WTU124C5170G,Trek,4300,2013,,,False,True,"San Jose, CA, 95126",2013-06-10 06:00:00,2013-08-20 06:00:00,2015-11-03 05:11:14,https://bikeindex.org/bikes/11542,https://bikeindex.org/api/v1/bikes/11542,47,,,,"Trek 4300, Mountain bicycle, disc brakes, blac...",True,,Bike,False,,,,,,,7889,1370844000,"San Jose, CA, 95126",37.329012,-121.916021,bike rack,,,T13006853,San Jose,2014-06-14 17:52:49,False,"1323 Martin Ave, San Jose, CA 95126, USA",1323,Martin Avenue,95126,Shasta-Hanchett Park,San Jose,Santa Clara County,California,United States,1370844000,1376978400,1446527474,1402768369
4,4,13359,2010 Schwinn Katana,absent,Schwinn,Katana,2010,https://files.bikeindex.org/uploads/Pu/4903/sm...,https://files.bikeindex.org/uploads/Pu/4903/la...,False,True,"Sandusky, OH, 44870",2010-07-10 06:00:00,2010-07-11 06:00:00,2015-11-03 05:17:53,https://bikeindex.org/bikes/13359,https://bikeindex.org/api/v1/bikes/13359,117,,,58cm,"The bike is a grey, blue, and black Schwinn Ka...",True,,Bike,False,,,,,,,9705,1278741600,"Sandusky, OH, 44870",41.440356,-82.768114,car bike rack Reward: $75,,,2010004433,Sandusky,2014-06-14 18:17:08,False,"4614 Venice Heights Blvd, Sandusky, OH 44870, USA",4614,Venice Heights Boulevard,44870,,Sandusky,Erie County,Ohio,United States,1278741600,1278828000,1446527873,1402769828


In [5]:
# Create function to parse incoming text, tokenize ALL words - no filtering 
regex1=re.compile(r"\.{2,}")
regex2=re.compile(r"\-{2,}")
regex3=re.compile(r"\\\\")
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')
stemmer = SnowballStemmer("english")

#use nltk tokenizer to tokenize and stem words in incoming sentence 
def clean_and_tokenize(intext):
    intext=re.sub(regex1, ' ', intext)
    intext=re.sub(regex2, ' ', intext)
    intext=re.sub(regex3, ' ', intext)
    
    #tokenize sentence
    dirty_tokens = [word.lower() for sent in nltk.sent_tokenize(intext) for word in nltk.word_tokenize(sent)]
    tokens=[]
    for eacht in dirty_tokens:
        if eacht not in punctuation:
            tokens.append(eacht)
    stemmed_tokens = [stemmer.stem(t) for t in tokens]
    
    return " ".join(stemmed_tokens)
    
    


In [6]:
# Create function to parse incoming text, tokenize and check for  ******* Nouns
regex1=re.compile(r"\.{2,}")
regex2=re.compile(r"\-{2,}")
regex3=re.compile(r"\\\\")
punctuation = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')

def gather_nouns(thetext):
    thetext=re.sub(regex1, ' ', thetext)
    thetext=re.sub(regex2, ' ', thetext)
    thetext=re.sub(regex3, ' ', thetext)

    nouns=[]
    for sentence in parse(thetext, tokenize=True, lemmata = True, encoding = 'utf-8').split():
        for token in sentence:
            #print token
            if len(token[4]) >0:
                if token[1] in ['NN', 'NNS']:
                    if token[4] in stopwords or token[4][0] in punctuation or token[4][-1] in punctuation or len(token[4])==1:
                        continue
                    nouns.append(token[4])
    nouns2=[]

    for n in nouns:
        if len(n)!=0:
            nouns2.append(n)
        else:
            print "************** WE HAVE A PROBLEM **************"
    return ' '.join(nouns2)

In [None]:
atext = "The bikes were locked to a sign pole in my apartment by the corner of the superstore. The lock was a combination lock and i had \
tied chained my helmet to the bike. We have fairly good security in the block of apartments here. The bike was of make 'GT' with shimano\
gears and had a custom seat. The apartment's security office told me that i have no chance of retrieving the bike"

print clean_and_tokenize(atext)
print "\n"
print gather_nouns(atext)

the bike were lock to a sign pole in my apart by the corner of the superstor the lock was a combin lock and i had tie chain my helmet to the bike we have fair good secur in the block of apart here the bike was of make gt ' with shimanogear and had a custom seat the apart 's secur offic told me that i have no chanc of retriev the bike


bike sign pole apartment corner superstore lock combination lock helmet bike security block apartment bike make shimanogear custom seat apartment security office chance bike


In [None]:
#bike_subdf['Clean_theft_description'] = bike_subdf.theft_description.map(clean_and_tokenize)
bike_subdf['Noun_parsed_theft_description'] = bike_subdf.theft_description.map(gather_nouns)
bike_subdf.to_csv(DATAFILEPATH+'stolen_bikes_tokenized_nouns.csv',encoding='utf-8')

In [None]:
#Tokenize each theft description to return a list of nouns, proper nouns etc for each sample (row)
# We will remove words whose word count is less than 2
vectorizer = CountVectorizer(min_df=2, stop_words='english')
text_sparse_matrix = vectorizer.fit_transform(bike_subdf.Noun_parsed_theft_description)    #************* May need to use mask for train/test 
id2words= dict((v, k) for k, v in vectorizer.vocabulary_.iteritems())
corpus_gensim = gensim.matutils.Sparse2Corpus(text_sparse_matrix, documents_columns=False)

In [None]:
print "Corpus shape:", corpus_gensim.sparse.get_shape()
print "Number of words in vocabulary:", len(vectorizer.vocabulary_)


In [None]:
#Use sparse matrix to initiate K-means clustering


from sklearn.cluster import KMeans

num_clusters = np.arange(2,11)
kmeans_model={}

for i in num_clusters:
    kmeans_model[i] = KMeans(init='k-means++', n_clusters=i, n_init=10)
    %time kmeans_model[i].fit(text_sparse_matrix)



In [None]:
for i in num_clusters:
    print "Printing cluster centers for Kmeans with ", i, " of clusters"
    print kmeans_model[i].cluster_centers_

In [None]:
len(kmeans_model[2].cluster_centers_[0])

In [1]:
# Using Cosine similarity instead of euclidean distance
from sklearn.metrics.pairwise import cosine_similarity

#Convert sparse matrix to full array
text_full_matrix = text_sparse_matrix.toarray()

#%time distance = 1 - cosine_similarity(text_full_matrix)

NameError: name 'text_sparse_matrix' is not defined

In [None]:
%time linkage_matrix = ward(distance) #define the linkage_matrix using ward clustering pre-computed distances


In [None]:
fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right");

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.tight_layout() #show plot with tight layout