In [153]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler

In [154]:
d = pd.read_csv('venue_db.csv')
pn = {1: 'good', 0: 'bad'}
print(f'total rows: {len(d)}')

c = Counter(d['is_ok'])
print({pn[_]: c[_] for _ in c})

total rows: 2884
{'good': 2719, 'bad': 165}


In [155]:
pos_ = d[d['is_ok'] == 1]
neg_ = d[d['is_ok'] == 0]

while len(neg_) < len(pos_):
    neg_ = pd.concat([neg_, neg_.sample(frac=0.2)])
    
d = pd.concat([pos_, neg_]).sample(frac=1.)

In [156]:
X = d['venue']
y = d['is_ok']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)
print(f'training set: {len(X_train)}, testing set: {len(X_test)}')

training set: 4045, testing set: 1734


In [157]:
# this will produce a feature union with components called like 'countvectorizer'
features = make_union(CountVectorizer(strip_accents='ascii', 
                                      lowercase=True, 
                                      stop_words='english', 
                                      ngram_range=(1, 3), 
                                      analyzer='word'))

In [158]:
cls = {'randomforestclassifier': RandomForestClassifier(),
       'svc': SVC()}

ps = {'randomforestclassifier': {'n_estimators': (100,200,300)},
        'svc': {'class_weight': ('balanced', None),
                'C': (1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1)}}


for cl in cls:
    
    print(f'running {cl.upper()}...')
    
    pipeline = make_pipeline(features, cls[cl])
    
    pars = {f'{cl}__{p}': ps[cl][p] for p in ps[cl]}
    
    grid_search = GridSearchCV(pipeline, param_grid=pars)
    
    grid_search.fit(X_train, y_train.values.ravel())
    
    yh = grid_search.predict(X_test)
    
    print(classification_report(y_test, yh))
    
    print('confusion matrix')
    
    print(confusion_matrix(y_test, yh))

running RANDOMFORESTCLASSIFIER...
             precision    recall  f1-score   support

          0       0.97      1.00      0.99       918
          1       1.00      0.97      0.98       816

avg / total       0.99      0.99      0.99      1734

confusion matrix
[[918   0]
 [ 25 791]]
running SVC...
             precision    recall  f1-score   support

          0       0.97      0.73      0.84       918
          1       0.77      0.98      0.86       816

avg / total       0.88      0.85      0.85      1734

confusion matrix
[[674 244]
 [ 18 798]]
