In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [3]:
# Project-relative locations, one level above the notebook's working directory:
# pickled datasets under data/pickle and an images folder (not used in the visible cells).
data_directory_pickle = os.path.join(os.pardir, 'data', 'pickle')
img_directory = os.path.join(os.pardir, 'images')

# Full path of the pickled review dataset that the notebook loads into `df`.
dataset = os.path.join(data_directory_pickle, 'health_data.pkl')

In [4]:
# Load the pickled review DataFrame from the path built above.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle(dataset)

In [5]:
# Inspect column dtypes, non-null counts and memory usage of the loaded reviews.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44918 entries, 0 to 44917
Data columns (total 9 columns):
business_id        44918 non-null object
review_id          44918 non-null object
health_business    44918 non-null object
name               44918 non-null object
stars              44918 non-null int64
text               44918 non-null object
processed          44918 non-null object
polarity           44918 non-null float64
subjectivity       44918 non-null float64
dtypes: float64(2), int64(1), object(6)
memory usage: 3.1+ MB


In [6]:
# Preview the first two reviews.
df.head(2)

Unnamed: 0,business_id,review_id,health_business,name,stars,text,processed,polarity,subjectivity
0,2hpi6pXIFf0taDIYCoNIuw,4ar9LmGU4rQ3vXFj325HCg,urgent care,Healthcare Partner,1,If your aim is to waste hours upon hours of yo...,if your aim is to waste hours upon hours of yo...,-0.062605,0.532773
1,2hpi6pXIFf0taDIYCoNIuw,mZo59NzNBPr9RegkzjIGVA,urgent care,Healthcare Partner,5,Memorial Day Weekend.. I cannot Thank Doctor ...,memorial day weekend i cannot thank doctor shu...,0.28125,0.553125


In [7]:
# Distribution of star ratings across all reviews.
df.stars.value_counts()

5    21365
1    15753
4     3279
2     2778
3     1743
Name: stars, dtype: int64

In [8]:
# Number of reviews per health business category.
df.health_business.value_counts()

hospital              8247
family practice       6869
urgent care           6637
obstetrician          5846
chiropractors         5739
diagnostic service    5011
internal medicine     3577
physical therapy      1818
mental health         1174
Name: health_business, dtype: int64

## Keep 1-, 2- and 5-star reviews (1 and 2 stars are combined as bad reviews, 5 stars are good reviews)

In [9]:
# Keep only the 1-2 star and the 5 star reviews; 3- and 4-star reviews are dropped.
# `DataFrame.ix` was removed in pandas 1.0, so the rows are selected with a boolean
# mask through `.loc` (same rows, original index labels preserved).
# `.copy()` makes `data` an independent frame, so adding columns to it later neither
# raises SettingWithCopyWarning nor writes into a view of `df`.
data = df.loc[(df.stars <= 2) | (df.stars == 5)].copy()

In [10]:
# Verify that only 1-, 2- and 5-star reviews remain after filtering.
data.stars.value_counts()

5    21365
1    15753
2     2778
Name: stars, dtype: int64

In [11]:
# Binary target: 1-2 stars -> 'bad_review', everything else kept in `data` -> 'good_review'.
# `assign` returns a new frame instead of writing a column into a frame that was sliced
# out of `df`, which avoids the SettingWithCopy hazard and keeps the cell safe to re-run.
data = data.assign(
    review=np.where(data.stars <= 2, 'bad_review', 'good_review')
)

In [12]:
# Class balance of the binary review label.
data.review.value_counts()

good_review    21365
bad_review     18531
Name: review, dtype: int64

In [13]:
# Confirm the filtered frame's row count and that it now carries the `review` column.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39896 entries, 0 to 44917
Data columns (total 10 columns):
business_id        39896 non-null object
review_id          39896 non-null object
health_business    39896 non-null object
name               39896 non-null object
stars              39896 non-null int64
text               39896 non-null object
processed          39896 non-null object
polarity           39896 non-null float64
subjectivity       39896 non-null float64
review             39896 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 3.3+ MB


In [14]:
# Preview the first two labelled reviews.
data.head(2)

Unnamed: 0,business_id,review_id,health_business,name,stars,text,processed,polarity,subjectivity,review
0,2hpi6pXIFf0taDIYCoNIuw,4ar9LmGU4rQ3vXFj325HCg,urgent care,Healthcare Partner,1,If your aim is to waste hours upon hours of yo...,if your aim is to waste hours upon hours of yo...,-0.062605,0.532773,bad_review
1,2hpi6pXIFf0taDIYCoNIuw,mZo59NzNBPr9RegkzjIGVA,urgent care,Healthcare Partner,5,Memorial Day Weekend.. I cannot Thank Doctor ...,memorial day weekend i cannot thank doctor shu...,0.28125,0.553125,good_review


# Select features and target

In [16]:
# Inputs and target for the good/bad review classifier:
#   X1 - original review text
#   X2 - the `processed` text column
#   y  - the 'bad_review' / 'good_review' label
X1 = data['text']
X2 = data['processed']
# X3 = data.lemmatize_text  # disabled: the loaded frame has no `lemmatize_text` column
y = data['review']

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y_s, test_size=0.33, random_state=42)

# import libraries

In [19]:
from __future__ import print_function

from pprint import pprint
import time
import logging

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,recall_score,precision_score

##  Define a pipeline combining
## a text feature extractor with a simple classifier



In [18]:
# Text-classification pipeline: raw text -> token counts -> TF-IDF weights -> linear SVM.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used by the grid-search
# parameter keys such as 'vect__max_df' and 'clf__C', so they must not change.
# LinearSVC (dual=True) relies on a random number generator while fitting; fixing
# `random_state` makes the cross-validation scores reproducible between runs.
# This is the only setting that deviates from the estimator defaults.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=42)),
])

In [20]:
#pipeline.get_params()

## Default Parameters

```python
{'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__C': 1.0,
 'clf__class_weight': None,
 'clf__dual': True,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__loss': 'squared_hinge',
 'clf__max_iter': 1000,
 'clf__multi_class': 'ovr',
 'clf__penalty': 'l2',
 'clf__random_state': None,
 'clf__tol': 0.0001,
 'clf__verbose': 0}
 ```

## TEST

In [22]:
# Smoke test: with an empty grid GridSearchCV cross-validates the pipeline once,
# exactly as configured, which gives a baseline score before any tuning.
parameters = {}  # for test
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Banner: pipeline steps and the grid about to be searched.
print('--------------------------------')
print("    GRID SEARCH ACTIVE\n\n")
print('--------------------------------')
print("PIPELINE:", [step_name for step_name, _ in pipeline.steps])
print("PARAMETERS:")
pprint(parameters)
print('--------------------------------')

# Start of the run: raw timestamp for the elapsed time, formatted local time for the log.
t0 = time.time()
start = time.strftime('%m/%d/%Y %H:%M:%S')  # no time tuple given -> current local time
print('start: {}'.format(start))
print('--------------------------------')

grid_search.fit(X1, y)

end = time.strftime('%m/%d/%Y %H:%M:%S')
print('--------------------------------')
print('End: {}'.format(end))
print('--------------------------------')

seconds = time.time() - t0
print("done in %0.3fs" % seconds)
print()

# Best mean cross-validated score, then the winning value of every searched parameter
# (nothing is listed when the grid is empty).
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

--------------------------------
    GRID SEARCH ACTIVE


--------------------------------
PIPELINE: ['vect', 'tfidf', 'clf']
PARAMETERS:
{}
--------------------------------
start: 10/30/2018 20:43:03
--------------------------------
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   48.1s finished


--------------------------------
End: 10/30/2018 20:43:57
--------------------------------
done in 54.303s

Best score: 0.959
Best parameters set:


## User parameters

In [24]:
# Search space for the tuned run: 1 x 3 x 4 x 4 = 48 candidate settings.
parameters = dict(
    vect__ngram_range=((1, 2),),                     # unigrams + bigrams only
    vect__max_df=(0.5, 0.75, 1.0),                   # default: 1.0
    vect__max_features=(None, 5000, 10000, 50000),   # default: None
    clf__C=(1, 5, 10, 15),                           # LinearSVC regularisation parameter C
)

# Cross Validation: 1
- time: about 14 minutes (863 s in the run recorded below)

In [26]:
# Tuned run: searches the `parameters` grid defined in the cell above.
# parameters = {}  # uncomment for a quick smoke test with an empty grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Banner: pipeline steps and the grid about to be searched.
print('--------------------------------')
print("    GRID SEARCH ACTIVE\n\n")
print('--------------------------------')
print("PIPELINE:", [step_name for step_name, _ in pipeline.steps])
print("PARAMETERS:")
pprint(parameters)
print('--------------------------------')

# Start of the run: raw timestamp for the elapsed time, formatted local time for the log.
t0 = time.time()
start = time.strftime('%m/%d/%Y %H:%M:%S')  # no time tuple given -> current local time
print('start: {}'.format(start))
print('--------------------------------')

grid_search.fit(X1, y)

end = time.strftime('%m/%d/%Y %H:%M:%S')
print('--------------------------------')
print('End: {}'.format(end))
print('--------------------------------')

seconds = time.time() - t0
print("done in %0.3fs" % seconds)
print()

# Best mean cross-validated score, then the winning value of every searched parameter.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

--------------------------------
    GRID SEARCH ACTIVE


--------------------------------
PIPELINE: ['vect', 'tfidf', 'clf']
PARAMETERS:
{'clf__C': (1, 5, 10, 15),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 2),)}
--------------------------------
start: 10/30/2018 20:45:12
--------------------------------
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 13.9min finished


--------------------------------
End: 10/30/2018 20:59:35
--------------------------------
done in 863.204s

Best score: 0.971
Best parameters set:
	clf__C: 5
	vect__max_df: 0.5
	vect__max_features: None
	vect__ngram_range: (1, 2)


## Predict `health_business` from the review text
## Select features and target

In [38]:
# Review counts per health business category (the column used as the target below).
df.health_business.value_counts()

hospital              8247
family practice       6869
urgent care           6637
obstetrician          5846
chiropractors         5739
diagnostic service    5011
internal medicine     3577
physical therapy      1818
mental health         1174
Name: health_business, dtype: int64

In [23]:
# Rebind the inputs and target for the health_business task, this time from the
# unfiltered frame `df` (all star ratings):
#   X1 - original review text, X2 - the `processed` text column,
#   y  - health_business labels as a NumPy array.
# NOTE(review): this overwrites the X1 / X2 / y of the review-sentiment task, so the
# cells must be run in notebook order.
X1 = df['text']
X2 = df['processed']
y = df['health_business'].values

## Cross Validation: 1

In [40]:
### TEST ###
# Smoke test on the currently bound X1 / y: with an empty grid GridSearchCV
# cross-validates the pipeline once, exactly as configured, for a baseline score.
parameters = {}  # for test
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Banner: pipeline steps and the grid about to be searched.
print('--------------------------------')
print("    GRID SEARCH ACTIVE\n\n")
print('--------------------------------')
print("PIPELINE:", [step_name for step_name, _ in pipeline.steps])
print("PARAMETERS:")
pprint(parameters)
print('--------------------------------')

# Start of the run: raw timestamp for the elapsed time, formatted local time for the log.
t0 = time.time()
start = time.strftime('%m/%d/%Y %H:%M:%S')  # no time tuple given -> current local time
print('start: {}'.format(start))
print('--------------------------------')

grid_search.fit(X1, y)

end = time.strftime('%m/%d/%Y %H:%M:%S')
print('--------------------------------')
print('End: {}'.format(end))
print('--------------------------------')

seconds = time.time() - t0
print("done in %0.3fs" % seconds)
print()

# Best mean cross-validated score, then the winning value of every searched parameter
# (nothing is listed when the grid is empty).
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

--------------------------------
    GRID SEARCH ACTIVE


--------------------------------
PIPELINE: ['vect', 'tfidf', 'clf']
PARAMETERS:
{}
--------------------------------
start: 10/30/2018 21:07:54
--------------------------------
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.0min finished


--------------------------------
End: 10/30/2018 21:09:08
--------------------------------
done in 74.067s

Best score: 0.602
Best parameters set:


In [41]:
# Search space for the tuned run: 1 x 3 x 4 x 4 = 48 candidate settings.
parameters = dict(
    vect__ngram_range=((1, 2),),                     # unigrams + bigrams only
    vect__max_df=(0.5, 0.75, 1.0),                   # default: 1.0
    vect__max_features=(None, 5000, 10000, 50000),   # default: None
    clf__C=(1, 5, 10, 15),                           # LinearSVC regularisation parameter C
)

In [42]:
# Tuned run on the currently bound X1 / y: searches the `parameters` grid defined in the cell above.
# parameters = {}  # uncomment for a quick smoke test with an empty grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Banner: pipeline steps and the grid about to be searched.
print('--------------------------------')
print("    GRID SEARCH ACTIVE\n\n")
print('--------------------------------')
print("PIPELINE:", [step_name for step_name, _ in pipeline.steps])
print("PARAMETERS:")
pprint(parameters)
print('--------------------------------')

# Start of the run: raw timestamp for the elapsed time, formatted local time for the log.
t0 = time.time()
start = time.strftime('%m/%d/%Y %H:%M:%S')  # no time tuple given -> current local time
print('start: {}'.format(start))
print('--------------------------------')

grid_search.fit(X1, y)

end = time.strftime('%m/%d/%Y %H:%M:%S')
print('--------------------------------')
print('End: {}'.format(end))
print('--------------------------------')

seconds = time.time() - t0
print("done in %0.3fs" % seconds)
print()

# Best mean cross-validated score, then the winning value of every searched parameter.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

--------------------------------
    GRID SEARCH ACTIVE


--------------------------------
PIPELINE: ['vect', 'tfidf', 'clf']
PARAMETERS:
{'clf__C': (1, 5, 10, 15),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 2),)}
--------------------------------
start: 10/30/2018 21:09:23
--------------------------------
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 26.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 45.2min finished


--------------------------------
End: 10/30/2018 21:55:26
--------------------------------
done in 2763.054s

Best score: 0.623
Best parameters set:
	clf__C: 1
	vect__max_df: 1.0
	vect__max_features: None
	vect__ngram_range: (1, 2)


Reference on the significance of the `C` value in support vector machines:
https://medium.com/@pushkarmandot/what-is-the-significance-of-c-value-in-support-vector-machine-28224e852c5a