In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
path = 'yelp_data/health_text_sentiment.csv'
# Read the review table and discard the 'Unnamed: 0' column in one chained expression.
df = pd.read_csv(path).drop('Unnamed: 0', axis=1)

In [4]:
df.head()

Unnamed: 0,stars,text,sent_value,sent_score,clean_text,sent_value_clean,sent_score_clean
0,1,Please stay away from this place if you can! I...,-0.050645,negative,please stay away place bad care imaginable sta...,-0.036719,negative
1,5,My husband has been a patient of Dr. Byrne for...,0.024962,positive,husband patient dr byrne last year half last m...,0.069479,positive
2,4,Dr. Byrne is a great doctor! She has great bed...,0.513333,positive,dr byrne great doctor great bed side manner ex...,0.317778,positive
3,3,I'm raising my review as Dr Bryne's has been m...,-0.035714,negative,raise review dr bryne receptive daughter go an...,0.002806,positive
4,1,I wish I could give 0 stars. Worst office I've...,-0.048246,negative,wish could give star bad office ever horrible ...,-0.084259,negative


## Split

In [5]:
# Keep only the rating and the two text variants (raw and cleaned).
data = df.loc[:, ['stars', 'text', 'clean_text']]

In [6]:
data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
2,4,Dr. Byrne is a great doctor! She has great bed...,dr byrne great doctor great bed side manner ex...
3,3,I'm raising my review as Dr Bryne's has been m...,raise review dr bryne receptive daughter go an...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...


## Select 1- and 5-star reviews only

In [7]:
# Keep only the extreme ratings (1 and 5 stars).
# A boolean mask replaces the former `.ix[np.where(...)]` lookup: `.ix` was
# deprecated in pandas 0.20 and removed in 1.0, and it silently mixed
# positional and label-based indexing.
# `.copy()` yields an independent frame, so later edits to `data` never touch `df`.
data = data[data['stars'].isin([1, 5])].copy()
data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...
5,1,I went to the emergency room because i was hav...,go emergency room kidney stone attack emergenc...
6,5,Dr. Byrne is an excellent doctor with all the ...,dr byrne excellent doctor right skill include ...


## Convert stars to a binary label

In [8]:
# Recode the rating as a binary label: 1 star -> 0, 5 stars -> 1.
# The dict form applies both substitutions in a single pass, and the result is
# assigned back explicitly. The previous `data.stars.replace(..., inplace=True)`
# modified an intermediate Series, which pandas does not guarantee will update
# `data` (with Copy-on-Write enabled it never does).
# NOTE: run this cell once per kernel session - a second run would map the new 1s to 0.
data = data.assign(stars=data['stars'].replace({1: 0, 5: 1}))

In [9]:
data.head()

Unnamed: 0,stars,text,clean_text
0,0,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,1,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
4,0,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...
5,0,I went to the emergency room because i was hav...,go emergency room kidney stone attack emergenc...
6,1,Dr. Byrne is an excellent doctor with all the ...,dr byrne excellent doctor right skill include ...


# Logistic Regression

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [12]:
# Text features as string arrays: the original review and its cleaned counterpart.
review, review_clean = (data[col].values.astype(str) for col in ('text', 'clean_text'))
# Target vector: the star value of each review.
sentiments = data['stars'].values

In [13]:
# Hold out 33% of the original reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    review,
    sentiments,
    test_size=0.33,
    random_state=42,
)

In [None]:
CountVectorizer?

### `CountVectorizer`
Convert a collection of text documents to a matrix of token counts

This implementation produces a sparse representation of the counts using
scipy.sparse.csr_matrix.


> Init signature:
```python
CountVectorizer(input='content', encoding='utf-8', decode_error='strict', 
                strip_accents=None, lowercase=True, preprocessor=None, 
                tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', 
                ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, 
                max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
```
Docstring:     
Convert a collection of text documents to a matrix of token counts


In [14]:
# Bag-of-words vectorizer with default settings
vectorizer = CountVectorizer()  # counts how often each token occurs in a document
vectorizer.fit(X_train)  # learn the vocabulary from the training reviews only

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
# Encode the training reviews as a document-by-token count matrix
# using the vocabulary fitted above
X_train_vectorized = vectorizer.transform(X_train)

In [16]:
# Fit a default logistic regression on the vectorized training reviews
# (the count matrix, not the raw text in X_train).
classifier = LogisticRegression().fit(X_train_vectorized, y_train)

# Encode the held-out reviews with the vocabulary learned from the training set
X_test_vectorized = vectorizer.transform(X_test)

# Score the classifier on the held-out reviews
score = classifier.score(X_test_vectorized, y_test)
print("Accuracy=", score)

Accuracy= 0.9710030463725601


In [None]:
train_test_split?

## Create a reusable function

In [26]:
def nlp_LogisticRegression(X, y, nrange=0, test_size=0.33, random_state=42):
    '''Train a bag-of-words logistic regression and return its hold-out score.

    The documents are split into a training and a test part, token counts are
    learned from the training part only, a default ``LogisticRegression`` is
    fitted on those counts and then scored on the held-out part.

    Parameters
    ----------
    X : array-like of str
        Text documents.
    y : array-like
        Label of each document, aligned with ``X``.
    nrange : tuple (min_n, max_n), 0 or None, default 0
        Forwarded to ``CountVectorizer`` as ``ngram_range``. ``0`` (the
        original sentinel) or ``None`` keeps the vectorizer's own default.
    test_size : float, default 0.33
        Fraction of the documents held out for scoring.
    random_state : int, default 42
        Seed for the train/test split, so repeated calls use the same split.

    Returns
    -------
    score : float
        Result of ``classifier.score`` on the held-out documents.
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # `0` stays supported for existing callers; `None` is the conventional spelling.
    if nrange is None or nrange == 0:
        vectorizer = CountVectorizer()
    else:
        vectorizer = CountVectorizer(ngram_range=nrange)

    # Learn the vocabulary and encode the training documents in one pass
    # (equivalent to fit() followed by transform(), without tokenizing twice).
    X_train_vectorized = vectorizer.fit_transform(X_train)

    # Train on the count matrix, not on the raw text
    classifier = LogisticRegression()
    classifier.fit(X_train_vectorized, y_train)

    # Encode the held-out documents with the training vocabulary only
    X_test_vectorized = vectorizer.transform(X_test)

    return classifier.score(X_test_vectorized, y_test)

In [27]:
# Inputs for the experiments below: both text variants as string arrays ...
review, review_clean = [data[name].values.astype(str) for name in ('text', 'clean_text')]
# ... and the star value of each review as the target.
sentiments = data['stars'].values

In [28]:
%%time
# Baseline: default vectorizer settings on the original review text
original = nlp_LogisticRegression(review, sentiments)
original

CPU times: user 1min 17s, sys: 9.88 s, total: 1min 27s
Wall time: 22.4 s


In [29]:
original

0.9710030463725601

In [22]:
# Same baseline, this time on the cleaned review text
clean = nlp_LogisticRegression(review_clean, sentiments)
clean

0.9672232878257926

## Result Score


```python
Original text:	  0.9710030463725601
Clean text:	      0.9672232878257926
Difference:	     -0.0037797585467674866
```

In [23]:
# One print call, one line per score; the last line is clean minus original.
print(
    'Original text:\t  {}'.format(original),
    'Clean text:\t  {}'.format(clean),
    'Difference:\t {}'.format(clean - original),
    sep='\n',
)

Original text:	  0.9710030463725601
Clean text:	  0.9672232878257926
Difference:	 -0.0037797585467674866


## Ngrams instead of Words

In [30]:
# Both text variants as string arrays, plus the star value as the target.
review, review_clean = (data[col].values.astype(str) for col in ('text', 'clean_text'))
sentiments = data['stars'].values

# Same 67/33 split of the original reviews as before (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    review,
    sentiments,
    test_size=0.33,
    random_state=42,
)

In [31]:
# Original review text, vectorized with ngram_range=(1, 3)
original = nlp_LogisticRegression(review, sentiments,(1,3))
original

0.9754033622926774

In [32]:
# Cleaned review text, vectorized with ngram_range=(1, 3)
clean = nlp_LogisticRegression(review_clean, sentiments,(1,3))
clean

0.9723569897325962

# Result: `ngrams = (1, 3)`


```python
Original text:	  0.9754033622926774
Clean text:	  0.9723569897325962
Difference:	 -0.003046372560081223
```

In [33]:
# Report both n-gram scores and their gap (clean minus original) in a single call.
print(
    'Original text:\t  {}'.format(original),
    'Clean text:\t  {}'.format(clean),
    'Difference:\t {}'.format(clean - original),
    sep='\n',
)

Original text:	  0.9754033622926774
Clean text:	  0.9723569897325962
Difference:	 -0.003046372560081223


## Pipeline

In [36]:
from sklearn.pipeline import Pipeline

In [37]:
# Pipeline inputs: original and cleaned review text as string arrays ...
review, review_clean = [data[name].values.astype(str) for name in ('text', 'clean_text')]
# ... and the star value used as the label.
sentiments = data['stars'].values

### Original text pipeline

In [38]:
%%time
# Hold-out split of the original review text
X_train, X_test, y_train, y_test = train_test_split(
    review,
    sentiments,
    test_size=0.33,
    random_state=42,
)

# Vectorizer and classifier chained into one estimator, so the raw text
# can be passed straight to fit() and score().
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression()),
])
pipeline.fit(X_train, y_train)

# Score on the held-out reviews
score = pipeline.score(X_test, y_test)
print("Accuracy=", score)

Accuracy= 0.9754033622926774
CPU times: user 4min 48s, sys: 29.2 s, total: 5min 17s
Wall time: 1min 44s


### Clean Text pipeline

In [39]:
%%time
# Hold-out split of the cleaned review text
X_train, X_test, y_train, y_test = train_test_split(
    review_clean,
    sentiments,
    test_size=0.33,
    random_state=42,
)

# Same two-step estimator as for the original text: n-gram counts, then logistic regression.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression()),
])
pipeline.fit(X_train, y_train)

# Score on the held-out reviews
score = pipeline.score(X_test, y_test)
print("Accuracy=", score)

Accuracy= 0.9723569897325962
CPU times: user 2min 55s, sys: 21.6 s, total: 3min 17s
Wall time: 55.1 s


## Cross validation

In [40]:
from sklearn.model_selection import cross_val_score

In [41]:
%%time
# Vectorizer + classifier as one estimator, so each CV fold refits both steps
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression()),
])

# Average score over 5 folds of the original review text
print("MeanAccuracy=", np.mean(cross_val_score(pipeline, review, sentiments, cv=5)))

MeanAccuracy= 0.9778455972678921
CPU times: user 26min 13s, sys: 2min 44s, total: 28min 57s
Wall time: 10min 4s


In [None]:
%%time
# Vectorizer + classifier as one estimator, so each CV fold refits both steps
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression()),
])

# Average score over 5 folds of the cleaned review text
print("MeanAccuracy=", np.mean(cross_val_score(pipeline, review_clean, sentiments, cv=5)))

# Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator: n-gram counts followed by logistic regression
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression()),
])

# Search over the vectorizer settings only; parameters are addressed as
# '<step name>__<parameter>'.
classifier = GridSearchCV(
    pipeline,
    param_grid={
        # candidate n-gram ranges
        'vectorizer__ngram_range': ((1, 2), (2, 3), (1, 3)),
        # does collapsing every non-zero count to 1 make a difference?
        'vectorizer__binary': (True, False),
    },
    n_jobs=-1,
    verbose=True,
    error_score=0.0,
    cv=5,
)

In [None]:
%%time
# Run the grid search on the original review text: each parameter
# combination is fitted and scored with 5-fold cross-validation (cv=5 above)
classifier.fit(review, sentiments)

In [None]:
%%time
# Run the grid search on the cleaned review text.
# The argument previously read `review_cleanlean`, an undefined name that
# raised NameError; the cleaned-text array is `review_clean`.
# Note: this refits the same `classifier` object, replacing the results of any earlier fit.
classifier.fit(review_clean, sentiments)