In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split

from categorization import data
from categorization.featurization import SequenceFeaturizer, TfidfBowFeaturizer
from categorization.evaluation import experiment, model_experiment
from categorization.model import DenseTextualModel, RnnTextualModel, SimpleLrBow

## Build the Train and Validation Datasets

In [3]:
# Load the business table through the project's data module and show how many
# entries it contains.
businesses = data.load_business_df()
len(businesses)

158525

In [4]:
# Preview the first rows to check the available columns.
businesses.head()

Unnamed: 0,business_id,business_name,review_count,stars,state,city,categories
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,128,2.5,ON,Mississauga,"[Specialty Food, Restaurants, Dim Sum, Importe..."
1,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,170,4.0,NC,Charlotte,"[Sushi Bars, Restaurants, Japanese]"
2,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,3,5.0,AZ,Goodyear,"[Insurance, Financial Services]"
3,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,4,4.0,NC,Charlotte,"[Plumbing, Shopping, Local Services, Home Serv..."
4,68dUKd8_8liJ7in4aWOSEA,The UPS Store,3,2.5,ON,Mississauga,"[Shipping Centers, Couriers & Delivery Service..."


In [5]:
# Keep only the businesses whose state column is 'PA'.
is_pa = businesses['state'] == 'PA'
pen_businesses = businesses.loc[is_pa]

In [6]:
# Preview the filtered rows to confirm that only state == 'PA' remains.
pen_businesses.head()

Unnamed: 0,business_id,business_name,review_count,stars,state,city,categories
20,1RHY4K3BD22FK7Cfftn8Mg,Marathon Diner,35,4.0,PA,Pittsburgh,"[Sandwiches, Salad, Restaurants, Burgers, Comf..."
43,qWWBVE5T_zMEF7UJ4iTfNw,"DJ Yonish, Inc.",3,2.5,PA,Bethel Park,"[Home Services, Heating & Air Conditioning/HVAC]"
51,dQj5DLZjeDK3KFysh1SYOQ,Apteka,242,4.5,PA,Pittsburgh,"[Nightlife, Bars, Polish, Modern European, Res..."
58,v-scZMU6jhnmV955RSzGJw,No. 1 Sushi Sushi,106,4.5,PA,Pittsburgh,"[Japanese, Sushi Bars, Restaurants]"
61,KFbUQ-RR2UOV62Ep7WnXHw,Westwood Bar & Grill,5,3.0,PA,West Mifflin,"[American (Traditional), Restaurants]"


In [7]:
# Build the category tree once and keep its root categories; they are passed
# to data.load_examples as the accepted categories.
category_tree = data.CategoryTree()
root_categories = category_tree.root_categories

In [8]:
%%time

examples, labels = data.load_examples(
    set(pen_businesses.business_id.unique()),
    accepted_categories=root_categories,
    reviews_per_business=1
)

len(examples), len(labels)

CPU times: user 1min 7s, sys: 4.39 s, total: 1min 11s
Wall time: 1min 14s


(6207, 6207)

In [9]:
# Hold out 20% of the examples for validation. The split is seeded so that
# re-running the notebook reproduces the same partition, which keeps the
# metrics of later cells comparable between runs.
train_examples, validation_examples, train_labels, validation_labels = train_test_split(
    examples,
    labels,
    test_size=.2,
    random_state=42,
)

len(train_examples), len(train_labels), len(validation_examples), len(validation_labels)

(4965, 4965, 1242, 1242)

## Models

In [11]:
%%time

exp = experiment(
    TfidfBowFeaturizer(),
    SimpleLrBow(),
    train_examples,
    train_labels,
    validation_examples,
    validation_labels
)

                              precision    recall  f1-score   support

                 Active Life       1.00      1.00      1.00       222
        Arts & Entertainment       1.00      1.00      1.00        53
                  Automotive       1.00      1.00      1.00       476
               Beauty & Spas       1.00      1.00      1.00       509
                   Education       1.00      1.00      1.00        17
   Event Planning & Services       1.00      1.00      1.00        47
          Financial Services       1.00      1.00      1.00        29
                        Food       1.00      1.00      1.00       459
            Health & Medical       1.00      1.00      1.00       328
               Home Services       1.00      1.00      1.00       363
             Hotels & Travel       1.00      1.00      1.00        76
                Local Flavor       1.00      1.00      1.00        11
              Local Services       1.00      1.00      1.00       144
                  M

In [12]:
# List the 'fn' entries for the 'Beauty & Spas' label (presumably false
# negatives — confirm against `errors_for_label`): true label, predicted
# label, business name and the first 20 characters of the review text.
beauty_errors = exp.errors_for_label('Beauty & Spas', False)
for miss in beauty_errors['fn']:
    fields = [
        miss.label,
        miss.prediction,
        miss.example.business_name,
        miss.example.review.text[:20],
    ]
    print('\t|\t'.join(fields))

Beauty & Spas	|	Pets	|	Supercuts	|	As I sit in my car t
Beauty & Spas	|	Food	|	Lacquered Up	|	Well, the artists ar
Beauty & Spas	|	Health & Medical	|	Shao Ping He Chinese Massage Therapy	|	No frills, just a go
Beauty & Spas	|	Food	|	Bath & Body Works	|	I stopped in to purc
Beauty & Spas	|	Health & Medical	|	Natural Choice	|	I have been to many 
Beauty & Spas	|	Shopping	|	Sugar Spa and Beauty Bar	|	Wonderful ambiance a
Beauty & Spas	|	Health & Medical	|	Massage a La Carter	|	I had upper back pai
Beauty & Spas	|	Home Services	|	Carson Street Tattoo	|	Awesome place! Zack 


In [None]:
# Disabled experiments for the dense and RNN textual models, kept for reference.
# NOTE(review): these calls use names that are not defined in the visible cells
# of this notebook (featurizer, vocab_size, input_length, NUM_CLASSES,
# labelizer, train_features, train_label_sets, validation_features,
# validation_label_sets); define them before uncommenting.

# dense_experiment = model_experiment(
#     featurizer,
#     DenseTextualModel(
#         vocab_size=vocab_size, input_length=input_length,
#         embedding_dimension=50, hidden_dimension=50, num_classes=NUM_CLASSES,
#         learning_rate=0.01, epochs=1, batch_size=256
#     ),
#     labelizer,
#     train_features,
#     train_label_sets, validation_features, validation_label_sets, 
# )

# rnn_experiment = model_experiment(
#     featurizer,
#     RnnTextualModel(
#         vocab_size=vocab_size, input_length=input_length,
#         embedding_dimension=100, rnn_dimension=100, num_classes=NUM_CLASSES,
#         learning_rate=0.001, epochs=200, batch_size=512
#     ),
#     labelizer,
#     train_features,
#     train_label_sets, validation_features, validation_label_sets, 
# )