# Week 3 - Quiz

In [1]:
import turicreate as tc

## Prepare data

In [2]:
data = tc.SFrame('./amazon-baby.sframe/')

### Ignore reviews having rating = 3

In [3]:
data = data[data['rating'] != 3]

### Add 'sentiment' column (positive or negative)

In [4]:
data['sentiment'] = data['rating'] >= 4

### Add 'word_count' column

In [5]:
data['word_count'] = tc.text_analytics.count_words(data['review'])

### Add a column for each selected word

In [6]:
selected_words = [
    'awesome', 
    'great', 
    'fantastic', 
    'amazing', 
    'love', 
    'horrible',
    'bad',
    'terrible', 
    'awful',
    'wow', 
    'hate'
]

In [7]:
for word in selected_words:
    data[word] = data['word_count'].apply(lambda c : c.get(word, 0))

### Display data

In [8]:
data

name,review,rating,sentiment,word_count,awesome
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,1,"{'recommend': 1.0, 'disappointed': 1.0, ...",0.0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,1,"{'quilt': 1.0, 'the': 1.0, 'than': 1.0, 'fu ...",0.0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,1,"{'tool': 1.0, 'clever': 1.0, 'binky': 2.0, ...",0.0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,1,"{'rock': 1.0, 'many': 1.0, 'headaches': 1.0, ...",0.0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,1,"{'thumb': 1.0, 'or': 1.0, 'break': 1.0, 'trying': ...",0.0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,1,"{'for': 1.0, 'barnes': 1.0, 'at': 1.0, 'is': ...",0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,1,"{'right': 1.0, 'because': 1.0, 'questions': 1.0, ...",0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,1,"{'like': 1.0, 'and': 1.0, 'changes': 1.0, 'the': ...",0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,1,"{'in': 1.0, 'pages': 1.0, 'out': 1.0, 'run': 1.0, ...",0.0
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4.0,1,"{'tracker': 1.0, 'now': 1.0, 'its': 1.0, 'sti ...",0.0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0.0,0.0,0.0,1.0,0,0,0.0,0,0,0
0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.0,0.0,0.0,2.0,0,0,0.0,0,0,0
1.0,0.0,0.0,1.0,0,0,0.0,0,0,0
1.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.0,1.0,0.0,0.0,0,0,0.0,0,0,0
0.0,0.0,0.0,0.0,0,0,0.0,0,0,0
0.0,0.0,0.0,2.0,0,0,0.0,0,0,0


# Important words summary

In [9]:
important_words_sum = {}

for word in selected_words:
    important_words_sum[word] = data[word].sum()
    
sorted([(value,key) for (key,value) in important_words_sum.items()])

[(425, 'wow'),
 (687, 'awful'),
 (1107, 'hate'),
 (1110, 'horrible'),
 (1146.0, 'terrible'),
 (1664.0, 'fantastic'),
 (2628.0, 'amazing'),
 (3892.0, 'awesome'),
 (4183, 'bad'),
 (41994.0, 'love'),
 (55791.0, 'great')]

## Split training and test data

In [10]:
training_data, test_data = data.random_split(.8, seed=0)

## Model based on all words of the reviews

In [11]:
sentiment_model = tc.logistic_classifier.create(training_data, target='sentiment', features=['word_count'], validation_set=test_data)

In [12]:
sentiment_model.evaluate(test_data)

{'accuracy': 0.9176975738650012,
 'auc': 0.9342357833151299,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  1397 |
 |      1       |        0        |  1344 |
 |      0       |        0        |  3931 |
 |      1       |        1        | 26632 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.951057941255245,
 'log_loss': 0.33047871872412254,
 'precision': 0.9501587641371436,
 'recall': 0.9519588218472976,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+--------------------+--------------------+-------+------+
 | threshold |        fpr         |        tpr         |   p   |  n   |
 +-----------+--------------------+--------------------+-------+------+
 |    0.

## Selected words model

In [13]:
selected_words_model = tc.logistic_classifier.create(training_data, target='sentiment', features=selected_words, validation_set=test_data)

### Coefficients

In [14]:
selected_words_model.coefficients.sort('value', ascending=False).print_rows(num_rows=12)

+-------------+-------+-------+-----------------------+----------------------+
|     name    | index | class |         value         |        stderr        |
+-------------+-------+-------+-----------------------+----------------------+
|     love    |  None |   1   |   1.3592688669225153  | 0.028068300152099435 |
| (intercept) |  None |   1   |   1.3365913848877558  | 0.008929969787656753 |
|   awesome   |  None |   1   |   1.133534666034145   | 0.08399643983187526  |
|   amazing   |  None |   1   |   1.1000933113660283  | 0.09954776260465983  |
|  fantastic  |  None |   1   |   0.8858047568814295  | 0.11167591293399656  |
|    great    |  None |   1   |   0.8630655001196618  | 0.018955052444377323 |
|     wow     |  None |   1   | -0.009538236067678897 |  0.1604641122471166  |
|     bad     |  None |   1   |  -0.9914778800650565  | 0.03848428664699063  |
|     hate    |  None |   1   |  -1.3484407222463124  | 0.07715698604297333  |
|    awful    |  None |   1   |  -2.0529082040313513

### Evaluate

In [15]:
selected_words_model.evaluate(test_data)

{'accuracy': 0.8463848186404036,
 'auc': 0.6936022046674926,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      1       |        0        |  159  |
 |      0       |        0        |  371  |
 |      0       |        1        |  4957 |
 |      1       |        1        | 27817 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9157860082304526,
 'log_loss': 0.3962265467087378,
 'precision': 0.8487520595594068,
 'recall': 0.9943165570488991,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+--------------------+-----+-------+------+
 | threshold |        fpr         | tpr |   p   |  n   |
 +-----------+--------------------+-----+-------+------+
 |    0.0    |        1.0         | 1.0 | 27976 | 532

## Majority class classifier

In [16]:
positives = test_data[test_data['sentiment'] == 1].num_rows()
negatives = test_data[test_data['sentiment'] == 0].num_rows()
total = test_data.num_rows()

print(f'positives: {positives}')
print(f'negatives: {negatives}')
print(f'total: {total}')

max(positives, negatives) / total

positives: 27976
negatives: 5328
total: 33304


0.8400192169108815

## Evaluate reviews of a specific product

In [17]:
diaper_champ_reviews = data[data['name'] == 'Baby Trend Diaper Champ']
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')
diaper_champ_reviews['selected_words_predicted_sentiment'] = selected_words_model.predict(diaper_champ_reviews, output_type='probability')

In [18]:
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)
diaper_champ_reviews['name', 'review', 'rating', 'predicted_sentiment', 'selected_words_predicted_sentiment']

name,review,rating,predicted_sentiment,selected_words_predicted_ sentiment ...
Baby Trend Diaper Champ,I read a review below that can explain exactly ...,4.0,0.999999999989594,0.7919288370624453
Baby Trend Diaper Champ,I have never written a review for Amazon but I ...,5.0,0.9999999999868132,0.936781924479937
Baby Trend Diaper Champ,I originally put this item on my baby registry ...,5.0,0.9999999999465672,0.7919288370624453
Baby Trend Diaper Champ,Baby Luke can turn a clean diaper to a dirty ...,5.0,0.9999999999302822,0.9002186948093641
Baby Trend Diaper Champ,Diaper Champ or Diaper Genie? That was my ...,5.0,0.9999999999174132,0.9002186948093641
Baby Trend Diaper Champ,I am one of those super- critical shoppers who ...,5.0,0.9999999998430964,0.936781924479937
Baby Trend Diaper Champ,I LOOOVE this diaper pail! Its the easies ...,5.0,0.9999999997360196,0.936781924479937
Baby Trend Diaper Champ,"As a first time mother, I wanted to get the best ...",5.0,0.9999999995664316,0.936781924479937
Baby Trend Diaper Champ,I see that there are complaints of stinkiness ...,5.0,0.9999999985015902,0.7919288370624453
Baby Trend Diaper Champ,I have a 10 year old daughter and an 8 month ...,5.0,0.999999998056851,0.9829620803909052


### Evaluate prediction of the most positive review

In [19]:
diaper_champ_reviews[0]['review']

"I read a review below that can explain exactly what we experienced. We've had it for 16 months and it has worked wonderful for us. No smells, change it out once a week, easy to clean. Then a diaper snagged this foam material in the head part, so I pulled the rest of the foam out. Big mistake!!! Now it can no loner retain the stinkiness and we're looking for a replacement. Be careful of overloading and never take out that foam piece that is cushioned between pieces. I have figured out that it is key to keeping the stink out."