In [1]:
import turicreate as tc
from turicreate import text_analytics

### Read in product review data

In [2]:
# Load the product review data from the local 'amazon_baby.sframe' directory.
reviews = tc.SFrame('./amazon_baby.sframe')

In [3]:
# Number of reviews in the full dataset, before any filtering.
reviews.num_rows()

183531

In [4]:
# Keep only the reviews whose product name is exactly 'Vulli Sophie the Giraffe Teether'.
# Note: this rebinds `reviews`, so re-running the earlier row-count cell afterwards
# reports the filtered size, not the full dataset size.
reviews = reviews[reviews['name'] == 'Vulli Sophie the Giraffe Teether']

### Build a Word-Count vector for each of the reviews
You must throw away every word not in the `selected_words` array.

In [5]:
def filter_words(words, selected_words=None):
    """Restrict a word-count dictionary to a fixed set of words.

    Args:
        words: dict mapping each word to its count.
        selected_words: optional iterable of the words to keep. When omitted,
            the eleven sentiment words used throughout this notebook are used.

    Returns:
        A new dict with exactly one entry per selected word, in the order the
        words are listed. Words that do not occur in `words` get a count of 0.
    """
    if selected_words is None:
        selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
    # A single pass over the selected words covers both the zero default and
    # the copy of existing counts.
    return {w: words.get(w, 0) for w in selected_words}
            

In [6]:
def single_count(w_dict, w):
    """Return the count stored for word `w` in `w_dict`, or 0 if `w` is not a key."""
    return w_dict[w] if w in w_dict else 0

In [7]:
# Build a bag-of-words column: for each review text, count_words produces
# a dictionary of unique word:count pairs.
reviews['word-count'] = text_analytics.count_words(reviews['review']) 

In [8]:
# Sentiment-bearing words; each one gets its own integer count column in `reviews`.
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
for word in selected_words:
    # Bind the current word as a default argument. A plain closure looks `word`
    # up when the lambda runs rather than when it is created, so it would pick
    # up a later loop value if the apply were ever evaluated after the loop
    # variable has moved on.
    reviews[word] = reviews['word-count'].apply(
        lambda counts, word=word: single_count(counts, word), dtype=int)

In [9]:
# Preview the first rows, including the newly added per-word count columns.
reviews.head()

name,review,rating,word-count,awesome,great,fantastic
Vulli Sophie the Giraffe Teether ...,He likes chewing on all the parts especially the ...,5.0,"{'purchase': 1.0, 'teething': 1.0, ...",0,1,0
Vulli Sophie the Giraffe Teether ...,My son loves this toy and fits great in the diaper ...,5.0,"{'a': 1.0, 'is': 1.0, 'when': 1.0, 'him': 1.0, ...",0,1,0
Vulli Sophie the Giraffe Teether ...,There really should be a large warning on the ...,1.0,"{'made': 1.0, 'of': 1.0, 'packaging': 1.0, 'no': ...",0,0,0
Vulli Sophie the Giraffe Teether ...,All the moms in my moms' group got Sophie for ...,5.0,"{'another': 1.0, 'out': 1.0, 'run': 1.0, 'lost': ...",0,0,0
Vulli Sophie the Giraffe Teether ...,I was a little skeptical on whether Sophie was ...,5.0,"{'disappointed': 1.0, 'will': 1.0, 'take': ...",0,0,0
Vulli Sophie the Giraffe Teether ...,I have been reading about Sophie and was going ...,5.0,"{'late': 1.0, 'perfect': 1.0, 'pack': 1.0, 'on ...",0,0,0
Vulli Sophie the Giraffe Teether ...,My neice loves her sophie and has spent hours ...,5.0,"{'delight': 1.0, 'in': 1.0, 'other': 1.0, ...",0,0,0
Vulli Sophie the Giraffe Teether ...,What a friendly face! And those mesmerizing ...,5.0,"{'inside': 1.0, 'water': 1.0, 'don': 1.0, 'up': ...",0,0,0
Vulli Sophie the Giraffe Teether ...,We got this just for my son to chew on instea ...,5.0,"{'its': 1.0, 'fine': 1.0, 'is': 1.0, 'which': 1.0, ...",0,0,0
Vulli Sophie the Giraffe Teether ...,"My baby seems to like this toy, but I could ...",3.0,"{'off': 1.0, 'have': 2.0, 'of': 1.0, 'some': 1.0, ...",0,0,0

amazing,love,horrible,bad,terrible,awful,wow,hate
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0
0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0


In [10]:
# Total occurrences of each selected word over all remaining reviews,
# printed in selected_words order and collected in `sums`.
sums = {}
for word in selected_words:
    sums[word] = reviews[word].sum()
    print(sums[word])

5
176
6
6
173
4
20
5
1
6
3


In [11]:
# Show the per-word totals; 'great' has the largest total, so it is the most common word.
sums  # great is the most common word

{'awesome': 5,
 'great': 176,
 'fantastic': 6,
 'amazing': 6,
 'love': 173,
 'horrible': 4,
 'bad': 20,
 'terrible': 5,
 'awful': 1,
 'wow': 6,
 'hate': 3}

In [12]:
# Replace each review's full word-count dictionary with one restricted to the
# selected words (filter_words fills in 0 for selected words that are absent).
reviews['word-count'] = reviews['word-count'].apply(filter_words)

## Label our Data
Ignore reviews rated 3.0 stars.
Reviews rated 4 stars or higher are labeled "good" (`sentiment` = 1);
reviews rated below 3 stars are labeled "bad" (`sentiment` = 0).

In [13]:
# Drop the 3-star reviews so only the reviews to be labeled remain.
reviews = reviews[reviews['rating'] != 3.0]

In [14]:
# Label column: true (1) when the rating is at least 4 stars, false (0) otherwise.
reviews['sentiment'] = reviews['rating'] >= 4.0

## Build a Sentiment Analysis Model with selected words

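Now that we have a "word-count" column with a dictionary containing the counts of only the selected words, we can create a logistic regression classifier. Note that the classifier below is trained on the individual per-word count columns (one column per entry in `selected_words`), not on the "word-count" dictionary column itself.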

In [15]:
# The 'sentiment' label column was already added in the labeling step
# (reviews['rating'] >= 4.0), so nothing further is needed here.

In [16]:
# Randomly split the labeled reviews into training and test sets using a 0.8
# split fraction; the fixed seed keeps the split repeatable across runs.
train_data, test_data = reviews.random_split(0.8, seed=0)

In [17]:
# Train a logistic regression classifier that predicts 'sentiment' from the
# per-word count columns named in selected_words.
# NOTE(review): the progress output reports that 5 percent of the training data
# is set aside automatically as a validation set; passing validation_set=None
# disables that. Confirm whether the automatic split is what is wanted here.
selected_model = tc.logistic_classifier.create(train_data, features=selected_words, target='sentiment')

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [18]:
# Print the fitted coefficients sorted by value, most negative first.
selected_model.coefficients.sort('value').print_rows(num_rows=12) # awesome is highest, awful is lowest

+-------------+-------+-------+---------------------+--------------------+
|     name    | index | class |        value        |       stderr       |
+-------------+-------+-------+---------------------+--------------------+
|    awful    |  None |   1   |  -8.719754368905177 | 45.74741483767056  |
|   terrible  |  None |   1   |  -2.433720621697301 | 1.3974384540781501 |
|   horrible  |  None |   1   | -1.9132652365601521 |  1.35001400879482  |
|   amazing   |  None |   1   | -1.3790201386599592 | 1.1996305146350452 |
|  fantastic  |  None |   1   | -0.9546350569683221 | 1.142211443810633  |
|     wow     |  None |   1   | -0.8686736108945361 | 1.1782295797471187 |
|     bad     |  None |   1   | -0.6973262626159622 | 0.5895183342908636 |
|     love    |  None |   1   |  0.9565351856930924 | 0.4340345568971649 |
|    great    |  None |   1   |  1.022325183314849  | 0.4458031316134507 |
| (intercept) |  None |   1   |  1.7719308634312678 | 0.1536106463910901 |
|     hate    |  None |  

In [19]:
# Evaluate the classifier on the held-out test set; the result includes
# accuracy, AUC, the confusion matrix, F1, log loss, precision, recall and the ROC curve.
selected_model.evaluate(test_data)

{'accuracy': 0.8533333333333334,
 'auc': 0.6835937500000001,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 2
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |   22  |
 |      1       |        1        |  128  |
 +--------------+-----------------+-------+
 [2 rows x 3 columns],
 'f1_score': 0.9208633093525179,
 'log_loss': 0.3849835730345825,
 'precision': 0.8533333333333334,
 'recall': 1.0,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 1001
 
 Data:
 +-----------+-----+-----+-----+----+
 | threshold | fpr | tpr |  p  | n  |
 +-----------+-----+-----+-----+----+
 |    0.0    | 1.0 | 1.0 | 128 | 22 |
 |   0.001   | 1.0 | 1.0 | 128 | 22 |
 |   0.002   | 1.0 | 1.0 | 128 | 22 |
 |   0.003   | 1.0 | 1.0 | 128 | 22 |
 |   0.004   | 1.0 | 1.0 | 128 | 22 |
 |   0.005   | 1.0 | 1.0 

In [20]:
# Count how many labeled reviews fall into each sentiment class.
reviews.groupby('sentiment', {'count': tc.aggregate.COUNT()})

sentiment,count
0,93
1,630


In [21]:
# Majority-class baseline: the accuracy obtained by always predicting the more
# common sentiment label. The class counts are derived from the data rather
# than copied by hand from the groupby output, so the value stays correct if
# the product filter or the labeling rule changes.
num_reviews = reviews.num_rows()
num_positive = reviews['sentiment'].sum()
majority_accuracy = max(num_positive, num_reviews - num_positive) / float(num_reviews)

In [22]:
# Show the majority-class baseline accuracy for comparison with the classifier.
majority_accuracy

0.8713692946058091

## Analyze Baby Trend Diaper Champ reviews

In [23]:
# Reload the full review dataset to analyze a second product.
reviews2 = tc.SFrame('./amazon_baby.sframe')

In [24]:
# Keep only the reviews whose product name is exactly 'Baby Trend Diaper Champ'.
reviews2 = reviews2[reviews2['name'] == 'Baby Trend Diaper Champ']

In [25]:
# Number of Baby Trend Diaper Champ reviews, before dropping 3-star ratings.
reviews2.num_rows()

333

In [26]:
# Drop the 3-star reviews, the same rule applied to the giraffe teether reviews.
reviews2 = reviews2[reviews2['rating'] != 3.0]

In [27]:
# Label column: true (1) when the rating is at least 4 stars, false (0) otherwise.
reviews2['sentiment'] = reviews2['rating'] >= 4.0