In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import pickle

In [3]:
data_directory_pickle = os.path.join('..','data','pickle')

dataset = os.path.join(data_directory_pickle,'health_data.pkl')

In [4]:
df = pd.read_pickle(dataset)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44918 entries, 0 to 44917
Data columns (total 9 columns):
business_id        44918 non-null object
review_id          44918 non-null object
health_business    44918 non-null object
name               44918 non-null object
stars              44918 non-null int64
text               44918 non-null object
processed          44918 non-null object
polarity           44918 non-null float64
subjectivity       44918 non-null float64
dtypes: float64(2), int64(1), object(6)
memory usage: 3.1+ MB


In [8]:
df.head(2)

Unnamed: 0,business_id,review_id,health_business,name,stars,text,processed,polarity,subjectivity
0,2hpi6pXIFf0taDIYCoNIuw,4ar9LmGU4rQ3vXFj325HCg,urgent care,Healthcare Partner,1,If your aim is to waste hours upon hours of yo...,if your aim is to waste hours upon hours of yo...,-0.062605,0.532773
1,2hpi6pXIFf0taDIYCoNIuw,mZo59NzNBPr9RegkzjIGVA,urgent care,Healthcare Partner,5,Memorial Day Weekend.. I cannot Thank Doctor ...,memorial day weekend i cannot thank doctor shu...,0.28125,0.553125


# Split data

In [9]:
X = df.text  # 
y_s = df.stars
y_h = df.health_business

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_s, test_size=0.33, random_state=42)

# import libraries

In [27]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,recall_score,precision_score

##  Define a pipeline combining
## a text feature extractor with a simple classifier



In [12]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

In [15]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), preprocessor=None, stop_words=None,
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
        verbose=0))],
 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_ra

## Default Parameters

```JSON
{'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__C': 1.0,
 'clf__class_weight': None,
 'clf__dual': True,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__loss': 'squared_hinge',
 'clf__max_iter': 1000,
 'clf__multi_class': 'ovr',
 'clf__penalty': 'l2',
 'clf__random_state': None,
 'clf__tol': 0.0001,
 'clf__verbose': 0}
 ```

## User parameters

In [31]:
parameters = {
            #### CountVectorizer #########
            #'vect__analyzer':( 'word',),
           # 'vect__binary':( False,),
            #'vect__decode_error': ('strict',),
            #'vect__dtype':( np.int64,),
            #'vect__encoding':( 'utf-8',),
            #'vect__input':( 'content',),
            #'vect__lowercase':( True,),
            'vect__max_df': (0.5, 0.75, 1.0), # 1.0,
            'vect__max_features':  (None, 5000, 10000, 50000), # None,
            'vect__min_df':(1,),
            'vect__ngram_range':((1, 1), (1, 2)),  # unigrams or bigrams (1, 1),
            'vect__preprocessor':( None,),
            'vect__stop_words':( None,),
            'vect__strip_accents': (None,),
            'vect__token_pattern': ('(?u)\\b\\w\\w+\\b',),
            'vect__tokenizer':( None,),
            'vect__vocabulary':( None,),
             #### TfidfTransformer #########
            'tfidf__norm': ('l1', 'l2'), #'l2',
            'tfidf__smooth_idf': (True,),
            'tfidf__sublinear_tf':( False,),
            'tfidf__use_idf': (True, False), # True,
            #### Classifier : LinearSVC #########
            'clf__C': [0.1, 1, 10, 100], # 1.0,
            'clf__class_weight':( None,),
            'clf__dual':( True,),
            'clf__fit_intercept':( True,),
            'clf__intercept_scaling':( 1,),
            'clf__loss':( 'squared_hinge',),
            'clf__max_iter':( 1000,),
            'clf__multi_class':( 'ovr',),
            'clf__penalty':( 'l2',),
            'clf__random_state':( None,),
            'clf__tol': (0.0001,), # 0.0001,
            'clf__verbose': (0,),}

In [18]:
#parameters = {}

In [32]:
grid_search = GridSearchCV(pipeline, 
                           parameters, 
                           cv=5,
                           n_jobs=-1, 
                           verbose=1)
print('--------------------------------')
print("    GRID SEARCH ACTIVE\n\n")
print('--------------------------------')

print("PIPELINE:", [name for name, _ in pipeline.steps])
print("PARAMETERS:")
pprint(parameters)
print('--------------------------------')
t0 = time()
grid_search.fit(X,y_s)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

--------------------------------
    GRID SEARCH ACTIVE


--------------------------------
PIPELINE: ['vect', 'tfidf', 'clf']
PARAMETERS:
{'clf__C': [0.1, 1, 10, 100],
 'clf__class_weight': (None,),
 'clf__dual': (True,),
 'clf__fit_intercept': (True,),
 'clf__intercept_scaling': (1,),
 'clf__loss': ('squared_hinge',),
 'clf__max_iter': (1000,),
 'clf__multi_class': ('ovr',),
 'clf__penalty': ('l2',),
 'clf__random_state': (None,),
 'clf__tol': (0.0001,),
 'clf__verbose': (0,),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__smooth_idf': (True,),
 'tfidf__sublinear_tf': (False,),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__min_df': (1,),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__preprocessor': (None,),
 'vect__stop_words': (None,),
 'vect__strip_accents': (None,),
 'vect__token_pattern': ('(?u)\\b\\w\\w+\\b',),
 'vect__tokenizer': (None,),
 'vect__vocabulary': (None,)}
--------------------------------
Fitt

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 34.9min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed: 74.5min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 189.1min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 218.0min finished


done in 13120.495s

Best score: 0.812
Best parameters set:
	clf__C: 1
	clf__class_weight: None
	clf__dual: True
	clf__fit_intercept: True
	clf__intercept_scaling: 1
	clf__loss: 'squared_hinge'
	clf__max_iter: 1000
	clf__multi_class: 'ovr'
	clf__penalty: 'l2'
	clf__random_state: None
	clf__tol: 0.0001
	clf__verbose: 0
	tfidf__norm: 'l2'
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 1
	vect__ngram_range: (1, 2)
	vect__preprocessor: None
	vect__stop_words: None
	vect__strip_accents: None
	vect__token_pattern: '(?u)\\b\\w\\w+\\b'
	vect__tokenizer: None
	vect__vocabulary: None


# Results

## Out of the box
```
--------------------------------
    GRID SEARCH ACTIVE


--------------------------------
PIPELINE: ['vect', 'tfidf', 'clf']
PARAMETERS:
{}
--------------------------------
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   59.8s finished
done in 70.801s

Best score: 0.788
Best parameters set:

```


## 

```
--------------------------------
    GRID SEARCH ACTIVE


--------------------------------
PIPELINE: ['vect', 'tfidf', 'clf']
PARAMETERS:
{'clf__C': [0.1, 1, 10, 100],
 'clf__class_weight': (None,),
 'clf__dual': (True,),
 'clf__fit_intercept': (True,),
 'clf__intercept_scaling': (1,),
 'clf__loss': ('squared_hinge',),
 'clf__max_iter': (1000,),
 'clf__multi_class': ('ovr',),
 'clf__penalty': ('l2',),
 'clf__random_state': (None,),
 'clf__tol': (0.0001,),
 'clf__verbose': (0,),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__smooth_idf': (True,),
 'tfidf__sublinear_tf': (False,),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__min_df': (1,),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__preprocessor': (None,),
 'vect__stop_words': (None,),
 'vect__strip_accents': (None,),
 'vect__token_pattern': ('(?u)\\b\\w\\w+\\b',),
 'vect__tokenizer': (None,),
 'vect__vocabulary': (None,)}
--------------------------------
Fitting 5 folds for each of 384 candidates, totalling 1920 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 34.9min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed: 74.5min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 189.1min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 218.0min finished
done in 13120.495s

Best score: 0.812
Best parameters set:
	clf__C: 1
	clf__class_weight: None
	clf__dual: True
	clf__fit_intercept: True
	clf__intercept_scaling: 1
	clf__loss: 'squared_hinge'
	clf__max_iter: 1000
	clf__multi_class: 'ovr'
	clf__penalty: 'l2'
	clf__random_state: None
	clf__tol: 0.0001
	clf__verbose: 0
	tfidf__norm: 'l2'
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 1
	vect__ngram_range: (1, 2)
	vect__preprocessor: None
	vect__stop_words: None
	vect__strip_accents: None
	vect__token_pattern: '(?u)\\b\\w\\w+\\b'
	vect__tokenizer: None
	vect__vocabulary: None
```

In [29]:
count_vectorizer = CountVectorizer()
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
svc_count_clf = LinearSVC()
svc_count_clf.fit(count_train, y_train)
pred = svc_count_clf.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.756


In [33]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2),max_df=0.5,)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

svc_tfidf_clf = LinearSVC()
svc_tfidf_clf.fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.827
