In [82]:
import graphlab

In [83]:
products = graphlab.SFrame('amazon_baby.gl')

In [84]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

In [85]:
products['word_count'] = graphlab.text_analytics.count_words(products['review'])

In [86]:
graphlab.canvas.set_target('ipynb')

In [87]:
def special_count(w):
    def awesome_count(d):
        if w in d:
            return d[w]
        else:
            return 0
    return awesome_count
    

In [88]:
for w in selected_words:
    products[w] = products['word_count'].apply(special_count(w))

In [89]:
sum(products['love'])

42065

In [90]:
sum(products['awesome'])    

2090

In [91]:
sum(products['great'])

45206

In [92]:
sum(products['fantastic'])

932

In [93]:
sum(products['amazing'])

1363

In [94]:
sum(products['horrible'])

734

In [95]:
sum(products['bad'])

3724

In [96]:
sum(products['terrible'])

748

In [97]:
sum(products['awful'])

383

In [98]:
sum(products['wow'])

144

In [99]:
sum(products['hate'])

1220

In [100]:
###great is the most popular answer

In [101]:
products = products[products['rating']!=3]

In [102]:
products['sentiment'] = products['rating'] >= 4

In [104]:
train, test = products.random_split(.8, seed=0)

In [105]:
selected_words_model = graphlab.logistic_classifier.create(train, 
                                                           target = 'sentiment', 
                                                           features = selected_words, 
                                                           validation_set = test)

PROGRESS: Logistic regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 133448
PROGRESS: Number of classes           : 2
PROGRESS: Number of feature columns   : 11
PROGRESS: Number of unpacked features : 11
PROGRESS: Number of coefficients    : 12
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+-------------------+---------------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-accuracy | Validation-accuracy |
PROGRESS: +-----------+----------+--------------+-------------------+---------------------+
PROGRESS: | 1         | 2        | 1.178102     | 0.844299          | 0.842842            |
PROGRESS: | 2         | 3        | 1.292191     | 0.844186          | 0.842842            |
PROGRESS: | 3         | 4        | 1.404507     | 0.844276          | 0.843142            |
PROGRESS: | 4         | 5        |

In [106]:
##examine the weights the learned classifier assigned to each of the 11 words in selected_words and gain intuition as 
##to what the ML algorithm did for your data using these features

In [108]:
selected_words_model['coefficients']

name,index,class,value
(intercept),,1,1.36728315229
awesome,,1,1.05800888878
great,,1,0.883937894898
fantastic,,1,0.891303090304
amazing,,1,0.892802422508
love,,1,1.39989834302
horrible,,1,-1.99651800559
bad,,1,-0.985827369929
terrible,,1,-2.09049998487
awful,,1,-1.76469955631


In [110]:
selected_words_model['coefficients'].sort('value')

name,index,class,value
terrible,,1,-2.09049998487
horrible,,1,-1.99651800559
awful,,1,-1.76469955631
hate,,1,-1.40916406276
bad,,1,-0.985827369929
wow,,1,-0.0541450123333
great,,1,0.883937894898
fantastic,,1,0.891303090304
amazing,,1,0.892802422508
awesome,,1,1.05800888878


In [111]:
###awesome is the most positive, terrible is the most negative. 

In [115]:
selected_words_model.evaluate(test, metric = 'roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 1001
 
 Data:
 +------------------+----------------+------------------+-------+------+
 |    threshold     |      fpr       |       tpr        |   p   |  n   |
 +------------------+----------------+------------------+-------+------+
 |       0.0        |      0.0       | 3.5734705546e-05 | 27984 | 5297 |
 | 0.0010000000475  |      1.0       |  0.999964265294  | 27984 | 5297 |
 | 0.00200000009499 | 0.999811213895 |  0.999964265294  | 27984 | 5297 |
 | 0.00300000002608 | 0.999811213895 |  0.999964265294  | 27984 | 5297 |
 | 0.00400000018999 | 0.999622427789 |  0.999964265294  | 27984 | 5297 |
 | 0.00499999988824 | 0.999622427789 |  0.999964265294  | 27984 | 5297 |
 | 0.00600000005215 | 0.999433641684 |  0.999928530589  | 27984 | 5297 |
 | 0.00700000021607 | 0.999433641684 |  0.999928530589  | 27984 | 5297 |
 | 0.00800000037998 | 0.999433641684 |  0.999928530589  | 27984 | 5297 |
 | 0.00899999961257 

In [113]:
##selected words: 84314
##sentiment: 91625
##accuracy majority class: .84
##

In [116]:
selected_words_model.show(view='Evaluation')

In [117]:
diaper_champ_reviews = products[products['name']=='Baby Trend Diaper Champ']

In [124]:
diaper_champ_reviews['predicted_sentiment'] = selected_words_model.predict(diaper_champ_reviews,
                                                                           output_type='probability')

In [125]:
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending = False)

In [129]:
diaper_champ_reviews[diaper_champ_reviews['review'] == 'Baby Luke can turn a clean diaper to a dirty diaper in 3 seconds flat. The diaper champ turns the smelly diaper into "what diaper smell" in less time than that. I hesitated and wondered what I REALLY needed for the nursery. This is one of the best purchases we made. The champ, the baby bjorn, fluerville diaper bag, and graco pack and play bassinet all vie for the best baby purchase.Great product, easy to use, economical, effective, absolutly fabulous.UpdateI knew that I loved the champ, and useing the diaper genie at a friend\'s house REALLY reinforced that!! There is no comparison, the chanp is easy and smell free, the genie was difficult to use one handed (which is absolutly vital if you have a little one on a changing pad) and there was a deffinite odor eminating from the genieplus we found that the quick tie garbage bags where the ties are integrated into the bag work really well because there isn\'t any added bulk around the sealing edge of the champ.'].show('predicted_sentiment')



In [130]:
print selected_words

['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']


In [132]:
diaper_champ_reviews[0]['review']

'I LOVE LOVE LOVE this product! It is SO much easier to use than the Diaper Genie, (you need a PHD in poopy to figure out how to use the darn thing!) and it even takes the same bags as my kitchen trash can, shich is super convenient, and cost efficient as I can buy them in bulk.The only reason for not rating it a 5 star was that I did have one small problem with it. The foam gasket in the barrell which keeps the poopy smell inside the unit ripped somehow, and it got VERY stinky. HOWEVER, I contacted the manufacturer though their website, and received an email back the same day stating that this was unusual, and that replacement gaskets were on their way to me. They arrived inside of a week and after replacing, it works great again! (They even sent me extras should it happen again)I HIGHLY reccomend this diaper pail over ANY competitors, you will not be sorry!'