In [100]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from collections import Counter

In [57]:
d = pd.read_csv('venue_db.csv')
pn = {1: 'good', 0: 'bad'}
print(f'total rows: {len(d)}')

c = Counter(d['is_ok'])
print({pn[_]: c[_] for _ in c})

total rows: 2884
{'good': 2719, 'bad': 165}


In [94]:
X = d['venue']
y = d['is_ok']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)
print(f'training set: {len(X_train)}, testing set: {len(X_test)}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

training set: 2018, testing set: 866
X_test: (866,)
y_test: (866,)


In [95]:
# this will produce a feature union with components called like 'countvectorizer'
features = make_union(CountVectorizer(strip_accents='ascii', 
                                      lowercase=True, 
                                      stop_words='english', 
                                      ngram_range=(1, 3), 
                                      analyzer='word'))

In [104]:
pipeline = make_pipeline(features, SVC())
pars = {'svc__class_weight': ('balanced',),
       'svc__C': (1e-4, 1e-3, 1e-2, 1e-1, 1, 1e+1)}

In [105]:
grid_search = GridSearchCV(pipeline, param_grid=pars)

In [106]:
grid_search.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
 ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svc__class_weight': ('balanced',), 'svc__C': (0.0001, 0.001, 0.01, 0.1, 1, 10.0)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [108]:
yh = grid_search.predict(X_test)
print(classification_report(y_test, yh))

             precision    recall  f1-score   support

          0       0.60      0.24      0.34        50
          1       0.96      0.99      0.97       816

avg / total       0.93      0.95      0.94       866

