In [13]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import ElasticNetCV

import pickle

In [14]:
d = pd.read_csv('venue_db.csv')
pn = {1: 'good', 0: 'bad'}
print(f'total rows: {len(d)}')

c = Counter(d['is_ok'])
print({pn[_]: c[_] for _ in c})

total rows: 2884
{'good': 2533, 'bad': 351}


In [15]:
pos_ = d[d['is_ok'] == 1]
neg_ = d[d['is_ok'] == 0]

while len(neg_) < len(pos_):
    neg_ = pd.concat([neg_, neg_.sample(frac=0.2)])
    
d = pd.concat([pos_, neg_]).sample(frac=1.)

In [16]:
X = d['venue']
y = d['is_ok']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)
print(f'training set: {len(X_train)}, testing set: {len(X_test)}')

training set: 3594, testing set: 1541


In [17]:
# this will produce a feature union with components called like 'countvectorizer'
features = make_union(CountVectorizer(strip_accents='ascii', 
                                      lowercase=True, 
                                      stop_words='english', 
                                      ngram_range=(1, 3), 
                                      analyzer='word'))

In [24]:
cls = {'randomforestclassifier': RandomForestClassifier()}

ps = {'randomforestclassifier': {'n_estimators': (100,200,300)},
        'svc': {'class_weight': ('balanced', None),
                'C': (1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1)}}


for cl in cls:
    
    print(f'running {cl.upper()}...')
    
    pipeline = make_pipeline(features, cls[cl])
    
    pars = {f'{cl}__{p}': ps[cl][p] for p in ps[cl]}
    
    grid_search = GridSearchCV(pipeline, param_grid=pars)
    
    grid_search.fit(X_train, y_train.values.ravel())
    
    yh = grid_search.predict(X_test)
    
    print(classification_report(y_test, yh))
    
    print('confusion matrix')
    
    print(confusion_matrix(y_test, yh))
    
    print('incorrectly labelled as BAD')
    
    print(X_test[list(yh - y_test < 0)])
    
    print('incorrectly labelled as GOOD')
    
    print(X_test[list(yh - y_test > 0)])

running RANDOMFORESTCLASSIFIER...
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       781
          1       0.99      0.99      0.99       760

avg / total       0.99      0.99      0.99      1541

confusion matrix
[[773   8]
 [  5 755]]
incorrectly labelled as BAD
2355                  the wharf tavern mooloolaba
1636              river terrace federation square
1748    shipwreck bay holiday park warnambool vic
2187               hard rock cafe darling harbour
1030                       zinc federation square
Name: venue, dtype: object
incorrectly labelled as GOOD
646                      elder park king william road sa
1454    hens pampered package - gift voucher - experienc
550                            ridley reserved elizabeth
646                      elder park king william road sa
787                                 coolangatta beachfnt
1816        wendouree sports and events centre, ballarat
1454    hens pampered package - gif

In [55]:
feature_name_list = grid_search.best_estimator_.named_steps['featureunion'].get_feature_names()
feature_importances = grid_search.best_estimator_.named_steps['randomforestclassifier'].feature_importances_

features_ranked = sorted([(name, round(imp,5)) for name, imp in zip(feature_name_list, feature_importances)], key=lambda x: x[1], reverse=True)

In [58]:
features_ranked[:20]

[('countvectorizer__events', 0.04428),
 ('countvectorizer__festival', 0.02639),
 ('countvectorizer__park', 0.02317),
 ('countvectorizer__reserve', 0.01778),
 ('countvectorizer__wharf', 0.01349),
 ('countvectorizer__club', 0.01233),
 ('countvectorizer__cruises', 0.0103),
 ('countvectorizer__hotel', 0.00776),
 ('countvectorizer__square', 0.00736),
 ('countvectorizer__theatre', 0.00721),
 ('countvectorizer__beach', 0.00715),
 ('countvectorizer__experience', 0.00702),
 ('countvectorizer__circus', 0.00692),
 ('countvectorizer__office', 0.00691),
 ('countvectorizer__ballooning', 0.0067),
 ('countvectorizer__cruise', 0.00662),
 ('countvectorizer__ticketek', 0.00656),
 ('countvectorizer__pier', 0.0061),
 ('countvectorizer__centre', 0.00606),
 ('countvectorizer__balloon', 0.00575)]

In [44]:
pickle.dump(grid_search.best_estimator_, open('badvenue.pkl','wb'))