In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#! pip install nltk
import nltk
nltk.download('stopwords')

In [2]:
import pandas as pd
import numpy as np

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [3]:
stemmer = SnowballStemmer('english')
words = stopwords.words("english")

In [4]:
path = 'storage/yelp_data/health_raw00.csv'

In [5]:
df = pd.read_csv(path)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54756 entries, 0 to 54755
Data columns (total 12 columns):
business_id    54756 non-null object
name           54756 non-null object
categories     54756 non-null object
cool           54756 non-null int64
date           54756 non-null object
funny          54756 non-null int64
review_id      54756 non-null object
stars          54756 non-null int64
text           54756 non-null object
useful         54756 non-null int64
user_id        54756 non-null object
len_text       54756 non-null int64
dtypes: int64(5), object(7)
memory usage: 5.0+ MB


In [7]:
df.head(2)

Unnamed: 0,business_id,name,categories,cool,date,funny,review_id,stars,text,useful,user_id,len_text
0,y-4xTZNKVm8mAZpiXMS5ZA,"Lauren Byrne, MD","urologists, doctors, health & medical",0,2018-03-03,0,TNNkSmMfshsD3G60jTNjDA,1,Please stay away from this place if you can! I...,2,xv2V2GO5IZYvtw4oW7gQ1w,2002
1,y-4xTZNKVm8mAZpiXMS5ZA,"Lauren Byrne, MD","urologists, doctors, health & medical",0,2015-11-29,0,v-iKdstPdCxJr8zV1ZMdrw,5,My husband has been a patient of Dr. Byrne for...,1,SjvWP7c9toeZoV_q62zhTA,877


## split review stars and text

In [8]:
data = df[['stars','text']]

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54756 entries, 0 to 54755
Data columns (total 2 columns):
stars    54756 non-null int64
text     54756 non-null object
dtypes: int64(1), object(1)
memory usage: 855.6+ KB


In [10]:
data.head()

Unnamed: 0,stars,text
0,1,Please stay away from this place if you can! I...
1,5,My husband has been a patient of Dr. Byrne for...
2,4,Dr. Byrne is a great doctor! She has great bed...
3,3,I'm raising my review as Dr Bryne's has been m...
4,1,I wish I could give 0 stars. Worst office I've...


In [12]:
# %%time
# data['cleaned'] = data['text'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [None]:
data.info()

In [None]:
data.head()

In [11]:
data = data.ix[np.where((data.stars==1)|(data.stars==5))]
data.head()

Unnamed: 0,stars,text
0,1,Please stay away from this place if you can! I...
1,5,My husband has been a patient of Dr. Byrne for...
4,1,I wish I could give 0 stars. Worst office I've...
5,1,I went to the emergency room because i was hav...
6,5,Dr. Byrne is an excellent doctor with all the ...


In [13]:
data.stars.replace(1,0,inplace=True)
data.stars.replace(5,1,inplace=True)

In [14]:
data.head()

Unnamed: 0,stars,text
0,0,Please stay away from this place if you can! I...
1,1,My husband has been a patient of Dr. Byrne for...
4,0,I wish I could give 0 stars. Worst office I've...
5,0,I went to the emergency room because i was hav...
6,1,Dr. Byrne is an excellent doctor with all the ...


In [None]:
## Problems
- names get cut of
- 

In [None]:
### stars

In [15]:
data.stars.value_counts()

1    25502
0    20129
Name: stars, dtype: int64

### Split positive and negtative
- total positive 31,186
- total negative : 22451
- total meh : 10,853

drop 2-4

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45631 entries, 0 to 54755
Data columns (total 2 columns):
stars    45631 non-null int64
text     45631 non-null object
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [17]:
data.stars.value_counts()

1    25502
0    20129
Name: stars, dtype: int64

In [18]:
data.head()

Unnamed: 0,stars,text
0,0,Please stay away from this place if you can! I...
1,1,My husband has been a patient of Dr. Byrne for...
4,0,I wish I could give 0 stars. Worst office I've...
5,0,I went to the emergency room because i was hav...
6,1,Dr. Byrne is an excellent doctor with all the ...


## Demo test

In [19]:
review = data['text'].values.astype(str)
sentiments = data['stars'].values

In [20]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [21]:
%%time
pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True)),
                     ('chi',  SelectKBest(chi2, k=10000)),
                     ('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 211 µs


In [22]:
%%time
model = pipeline.fit(X_train, y_train)

CPU times: user 12.6 s, sys: 1.46 s, total: 14.1 s
Wall time: 12.8 s


In [23]:
%%time
vectorizer = model.named_steps['vect']
chi = model.named_steps['chi']
clf = model.named_steps['clf']

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 26.5 µs


In [24]:
%%time
feature_names = vectorizer.get_feature_names()
feature_names = [feature_names[i] for i in chi.get_support(indices=True)]
feature_names = np.asarray(feature_names)

CPU times: user 1.66 s, sys: 12 ms, total: 1.67 s
Wall time: 1.66 s


In [25]:
target_names =['']

In [26]:
print("top 10 keywords per class:")

for i, label in enumerate(target_names):
    top10 = np.argsort(clf.coef_[i])[-10:]
    print("%s: %s" % (label, " ".join(feature_names[top10])))

top 10 keywords per class:
: excellent pleased wonderful love thank grateful takes time great amazing best


In [27]:
%%time
print("accuracy score: " + str(model.score(X_test, y_test)))

accuracy score: 0.9688835323764654
CPU times: user 1.7 s, sys: 0 ns, total: 1.7 s
Wall time: 1.7 s


## Multinomial NB
-

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [29]:
 data['stars'].values

array([0, 1, 0, ..., 0, 0, 1])

In [30]:
review = data['text'].values.astype(str)
sentiments = data['stars'].values

In [31]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [32]:
vectorizer = CountVectorizer(lowercase=True)
# Compute the vocabulary only on the training data
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [33]:
# Transform the text list to a matrix form
X_train_vectorized = vectorizer.transform(X_train)

In [34]:
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Vectorize the test data
X_test_vectorized = vectorizer.transform(X_test)

# Check our classifier performance
score = classifier.score(X_test_vectorized, y_test)

print("Accuracy=", score)

Accuracy= 0.9562835542894708


# Results: out of box

- all stars: `Accuracy= 0.8001240502403474`
- 1 & 5 starts: `Accuracy= 0.9561894108873975`

In [35]:
X_train[0]

"My 20 minute wait turned in to 2 hours and just before I was called back I saw a small note saying the provider was not a medical doctor, but a Physician's Assistant. At one point I asked if they had a room where I could go lie down (so I wouldn't fall down). I was told no, they had to keep rooms open for emergencies. So wait, my dizzy and feel like passing out isn't emergent enough? Sheesh. A 2-hour wait with a mere 4 patients in the waiting room is ridiculous. If I hadn't felt so yucky I would have left. Unless they make some changes, I recommend finding another urgent care."

In [36]:
vectorizer.get_feature_names()

['00',
 '000',
 '000000',
 '00204',
 '007',
 '00a',
 '00am',
 '00p',
 '00pm',
 '01',
 '014',
 '016',
 '019',
 '01am',
 '01mg',
 '01pm',
 '02',
 '0200',
 '02am',
 '02pm',
 '03',
 '0300',
 '03am',
 '03jun18',
 '03pm',
 '04',
 '0404',
 '0430',
 '045',
 '047',
 '049',
 '04am',
 '04pm',
 '05',
 '0530',
 '05am',
 '05p',
 '05pm',
 '06',
 '06pm',
 '07',
 '0700',
 '0709',
 '0710',
 '0720',
 '0723',
 '0727',
 '0730',
 '0745',
 '075',
 '0755',
 '07a',
 '07am',
 '07pm',
 '08',
 '080',
 '0800',
 '0805',
 '0825',
 '0845',
 '088',
 '08am',
 '08pm',
 '09',
 '090',
 '0900',
 '0926',
 '0945',
 '0952',
 '0985773006',
 '09am',
 '09pm',
 '0_o',
 '0a3a',
 '0n',
 '0ne',
 '0star',
 '10',
 '100',
 '1000',
 '10000',
 '1000000',
 '100000000000',
 '1000k',
 '1000s',
 '1000x',
 '1001',
 '1005',
 '1008',
 '100days',
 '100k',
 '100lb',
 '100pm',
 '100por',
 '100something',
 '100th',
 '100x',
 '100xs',
 '101',
 '1010',
 '1012',
 '1014',
 '1015',
 '10159531',
 '1015ish',
 '102',
 '1020',
 '1021',
 '1022',
 '1025am',
 

## Diffrent classifier: Logistic

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
len(review)

45631

In [39]:
len(sentiments)

45631

In [40]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [41]:
vectorizer = CountVectorizer(lowercase=True)
# Compute the vocabulary only on the training data
vectorizer.fit(X_train)

# Transform the text list to a matrix form
X_train_vectorized = vectorizer.transform(X_train)

classifier = LogisticRegression()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Vectorize the test data
X_test_vectorized = vectorizer.transform(X_test)

# Check our classifier performance
score = classifier.score(X_test_vectorized, y_test)
print("Accuracy=", score)

Accuracy= 0.9701983126985866


## Logistic results
`Accuracy= 0.9737136465324385
`

## Use Ngrams Instead of Words


In [44]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [46]:
%%time
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1,3))
# Compute the vocabulary only on the training data
vectorizer.fit(X_train)

# Transform the text list to a matrix form
X_train_vectorized = vectorizer.transform(X_train)

classifier = LogisticRegression()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Vectorize the test data
X_test_vectorized = vectorizer.transform(X_test)

# Check our classifier performance
score = classifier.score(X_test_vectorized, y_test)
print("Accuracy=", score)

Accuracy= 0.9768817793360359
CPU times: user 1min 26s, sys: 856 ms, total: 1min 27s
Wall time: 1min 27s


### Results
- `Accuracy= 0.9793064876957495
`

## pipeline
- Result `Accuracy= 0.9777218493661447
`

In [47]:
from sklearn.pipeline import Pipeline

In [48]:
# Split the data for training and for testing and shuffle it
X_train, X_test, y_train, y_test = train_test_split(review, sentiments,
test_size=0.2, shuffle=True)

In [49]:
%%time
# Put everything in a Pipeline
pipeline = Pipeline([
    ('vectorizer', 
     CountVectorizer(lowercase=True,ngram_range=(1, 3)))
    ,('classifier', LogisticRegression())])

pipeline.fit(X_train, y_train)
# Check our classifier performance

score = pipeline.score(X_test, y_test)

print("Accuracy=", score)

Accuracy= 0.9778678645776269
CPU times: user 1min 13s, sys: 712 ms, total: 1min 13s
Wall time: 1min 13s


## Cross validation score

In [50]:
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

In [51]:
%%time
# Put everything in a Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(
        lowercase=True,
        ngram_range=(1, 3))),
    ('classifier', LogisticRegression())])

CPU times: user 124 ms, sys: 16 ms, total: 140 ms
Wall time: 142 ms


In [52]:
review, sentiments = shuffle(review, sentiments)

In [53]:
%%time
print("MeanAccuracy=", cross_val_score(pipeline, review, sentiments, cv=5).mean())

MeanAccuracy= 0.9768577868048183
CPU times: user 6min 2s, sys: 5.17 s, total: 6min 8s
Wall time: 6min 8s


## Result:
```
MeanAccuracy= 0.9773477193058582
CPU times: user 10min 21s, sys: 44.1 s, total: 11min 5s
Wall time: 8min 45s

```

## Grid search

In [54]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [55]:
# shuffle data
review, sentiments = shuffle(review, sentiments)

In [56]:
%%time
# Put everything in a Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(
        lowercase=True,
        ngram_range=(1, 3))),
    ('classifier', LogisticRegression())])


classifier = GridSearchCV(pipeline, {
    # try out different ngram ranges
    'vectorizer__ngram_range': ((1, 2), (2, 3), (1, 3)),
    # check if setting all non zero counts to 1 makes a difference
    'vectorizer__binary': (True, False),},
                          n_jobs=-1, 
                          verbose=True,
                          error_score=0.0, 
                          cv=5)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 637 µs


In [57]:
%%time
# Compute the vocabulary and train the classifier
classifier.fit(review, sentiments)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


Process ForkPoolWorker-13:
Process ForkPoolWorker-14:
Process ForkPoolWorker-7:
Process ForkPoolWorker-6:
Process ForkPoolWorker-8:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Process ForkPoolWorker-15:
Process ForkPoolWorker-2:
Process ForkPoolWorker-5:
Process ForkPoolWorker-9:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6

KeyboardInterrupt: 

In [58]:
print("Best Accuracy: ", classifier.best_score_)
print("Best Parameters: ", classifier.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'