In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
import numpy as np
import time
import datetime

In [2]:
business_df = pd.read_csv('../data/yelp_training_set_business.csv').set_index('business_id')
business_df[:3]

Unnamed: 0_level_0,categories,city,full_address,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
rncjoVoEFUJGCUoC1JgnUA,"Accountants,Professional Services,Tax Services...",Peoria,"8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345",33.581867,-112.241596,Peoria Income Tax Service,,True,3,5.0,AZ,business
0FNFSzCFP_rGUoJx8W7tJg,"Sporting Goods,Bikes,Shopping",Phoenix,"2149 W Wood Dr\nPhoenix, AZ 85029",33.604054,-112.105933,Bike Doctor,,True,5,5.0,AZ,business
3f_lyB6vFK48ukH6ScvLHg,,Phoenix,"1134 N Central Ave\nPhoenix, AZ 85004",33.460526,-112.073933,Valley Permaculture Alliance,,True,4,5.0,AZ,business


In [3]:
business_df.columns

Index([u'categories', u'city', u'full_address', u'latitude', u'longitude',
       u'name', u'neighborhoods', u'open', u'review_count', u'stars', u'state',
       u'type'],
      dtype='object')

In [4]:
business_df = business_df.drop(['city','full_address', #'latitude','longitude', # keep for location grouping
                      'name','neighborhoods','state','type'],axis=1)
business_df.columns

Index([u'categories', u'latitude', u'longitude', u'open', u'review_count',
       u'stars'],
      dtype='object')

In [6]:
vect = CountVectorizer(tokenizer=lambda text: text.split(','))
cat_fea = vect.fit_transform(business_df['categories'].fillna(''))
cat_fea

<11537x513 sparse matrix of type '<type 'numpy.int64'>'
	with 31416 stored elements in Compressed Sparse Row format>

In [7]:
cat_fea = cat_fea.todense()
cat_fea

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [8]:
cat_fea[:1]

matrix([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0

In [9]:
cat_fea[cat_fea > 1] = 1

In [10]:
## CATEGORY CLUSTERS
#  Based on the category extracted before, the idea is to create a n clusters to
#  aggregate set of similar categories
for esti in (20,35,50,60,70,80,90,100,110,125):
    km = MiniBatchKMeans(n_clusters=esti, random_state=1377, init_size=esti*10)

    print "fitting "+str(esti)+" clusters - category"
    init_time = time.time()
    km.fit(cat_fea)
    print (time.time()-init_time)/60

    business_df['cat_clust_'+str(esti)] = km.predict(cat_fea)

fitting 20 clusters - category
0.0104936639468
fitting 35 clusters - category




0.00657826662064
fitting 50 clusters - category




0.00511775016785
fitting 60 clusters - category




0.00520635048548
fitting 70 clusters - category




0.0150251865387




fitting 80 clusters - category
0.0176492333412
fitting 90 clusters - category




0.0171066681544
fitting 100 clusters - category




0.0132697184881
fitting 110 clusters - category




0.0193510174751
fitting 125 clusters - category




0.0269546190898




In [11]:
business_df.columns

Index([u'categories', u'latitude', u'longitude', u'open', u'review_count',
       u'stars', u'cat_clust_20', u'cat_clust_35', u'cat_clust_50',
       u'cat_clust_60', u'cat_clust_70', u'cat_clust_80', u'cat_clust_90',
       u'cat_clust_100', u'cat_clust_110', u'cat_clust_125'],
      dtype='object')

In [12]:
## LOCATION CLUSTERS
#  Location cluster, even if only in Phoenix area this might spot interesting patterns
for esti in (5,10,15,20,25,30,40):
    km = MiniBatchKMeans(n_clusters=esti, random_state=1377, init_size=esti*100)

    print "fitting "+str(esti)+" clusters - location"
    init_time = time.time()
    km.fit(business_df.ix[:,['latitude','longitude']])
    print (time.time()-init_time)/60

    business_df['loc_clust_'+str(esti)] = km.predict(business_df.ix[:,['latitude','longitude']])

fitting 5 clusters - location
0.00350896517436
fitting 10 clusters - location
0.00165851910909
fitting 15 clusters - location
0.00091921488444
fitting 20 clusters - location
0.00121919711431
fitting 25 clusters - location
0.00173786878586
fitting 30 clusters - location
0.00281663338343
fitting 40 clusters - location
0.00192678372065


In [13]:
business_df.columns

Index([u'categories', u'latitude', u'longitude', u'open', u'review_count',
       u'stars', u'cat_clust_20', u'cat_clust_35', u'cat_clust_50',
       u'cat_clust_60', u'cat_clust_70', u'cat_clust_80', u'cat_clust_90',
       u'cat_clust_100', u'cat_clust_110', u'cat_clust_125', u'loc_clust_5',
       u'loc_clust_10', u'loc_clust_15', u'loc_clust_20', u'loc_clust_25',
       u'loc_clust_30', u'loc_clust_40'],
      dtype='object')

In [14]:
business_df[:2]

Unnamed: 0_level_0,categories,latitude,longitude,open,review_count,stars,cat_clust_20,cat_clust_35,cat_clust_50,cat_clust_60,...,cat_clust_100,cat_clust_110,cat_clust_125,loc_clust_5,loc_clust_10,loc_clust_15,loc_clust_20,loc_clust_25,loc_clust_30,loc_clust_40
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rncjoVoEFUJGCUoC1JgnUA,"Accountants,Professional Services,Tax Services...",33.581867,-112.241596,True,3,5.0,7,15,14,7,...,55,3,102,3,3,3,19,1,7,36
0FNFSzCFP_rGUoJx8W7tJg,"Sporting Goods,Bikes,Shopping",33.604054,-112.105933,True,5,5.0,4,17,15,13,...,43,70,20,2,5,12,14,22,15,6
