In [1]:
import numpy as np
import sframe as sf

In [2]:
products = sf.SFrame('../data/amazon_baby.gl/')

[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1472352428.log


In [3]:
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)

In [4]:
products['review_clean'] = products['review'].apply(remove_punctuation)
products = products.fillna('review', '')
products = products[products['rating'] != 3]
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1)

In [5]:
train_data, test_data = products.random_split(.8, seed = 1)

# Building the Bag Of Words aka Word Count aka Count Vector

Bag of words is the distinct collection of words in the corpus or training data. For each document or row, it is count of each word from the corpus.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

# Convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'].to_numpy())
test_matrix = vectorizer.transform(test_data['review_clean'].to_numpy())

# Train Sentiment Classifier With Logistic Regression

## Learn the Logistic Regression Classifier

In [7]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression()

sentiment_model.fit(X = train_matrix, y = train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [140]:
coeff = sentiment_model.coef_
pos_wts = coeff[ coeff >= 0 ]
print "Ans: No of Weights >= 0 is", np.shape(pos_wts)[0]

Ans: No of Weights >= 0 is 86906


In [9]:
print "Shape of Coefficients", np.shape(coeff)
print "Shape of Training data", np.shape(train_matrix)


Shape of Coefficients (1, 121712)
Shape of Training data (133416, 121712)


### Making Predictions With Logistic Regression

In [10]:
sample_test_data = test_data[10:13]
print sample_test_data

+-------------------------------+-------------------------------+--------+
|              name             |             review            | rating |
+-------------------------------+-------------------------------+--------+
|   Our Baby Girl Memory Book   | Absolutely love it and all... |  5.0   |
| Wall Decor Removable Decal... | Would not purchase again o... |  2.0   |
| New Style Trailing Cherry ... | Was so excited to get this... |  1.0   |
+-------------------------------+-------------------------------+--------+
+-------------------------------+-----------+
|          review_clean         | sentiment |
+-------------------------------+-----------+
| Absolutely love it and all... |     1     |
| Would not purchase again o... |     -1    |
| Was so excited to get this... |     -1    |
+-------------------------------+-----------+
[3 rows x 5 columns]



In [11]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'].to_numpy())
scores = sentiment_model.decision_function(sample_test_matrix)
print scores

[  5.60224818  -3.17079879 -10.42488847]


In [12]:
def predictLabelsFromSframe(model, sf, colName):
    def predictLabels(scores):
        def f(a):
            if(a > 0):
                return 1
            else:
                return -1
        g = np.vectorize(f)    
        return g(scores)
    
    matrix = vectorizer.transform(sf[colName].to_numpy())
    return predictLabels( model.decision_function(matrix) )

predictLabelsFromSframe(sentiment_model, sample_test_data, 'review_clean')

array([ 1, -1, -1])

In [15]:
def predictProbabilityUsingScores(scores):
    denom = 1 + np.exp(-scores)
    return 1.0 / denom

predictProbabilityUsingScores(scores)

array([  9.96324003e-01,   4.02795251e-02,   2.96835318e-05])

In [28]:
def calculateScores(model, data):
    scores = data * model.coef_.T
    return  scores + model.intercept_

scores0 = calculateScores(sentiment_model, sample_test_matrix)
myProbabilities = predictProbabilityUsingScores(scores0).T
print "Probability Using Custom funtions:", myProbabilities
print "Probability Using Sklearn funtions:", sentiment_model.predict_proba(sample_test_matrix)[:, 1].T

def quiz12Ans(myProbabilities):
    firstProb = myProbabilities[0][0]
    secondProb = myProbabilities[0][1]
    thirdProb = myProbabilities[0][2]
    if firstProb < secondProb:
        if firstProb < thirdProb:
            return "first"
        else: 
            if thirdProb < secondProb:
                return "third"
            else: 
                return "second"
    else:
        if secondProb < thirdProb:
            return "second"
        else:
            return "third"
print "Ans to Quiz from Section 12:", quiz12Ans(myProbabilities)        

Probability Using Custom funtions: [[  9.96324003e-01   4.02795251e-02   2.96835318e-05]]
Probability Using Sklearn funtions: [  9.96324003e-01   4.02795251e-02   2.96835318e-05]
Ans to Quiz from Section 12: third


In [38]:
print "Type:", type(myProbabilities[0]), "Shape:", np.shape(myProbabilities[0])

Type: <type 'numpy.ndarray'> Shape: (3,)


### Finding the most positive and negative review

In [49]:
myProbabilities[0].astype(np.float)

array([  9.96324003e-01,   4.02795251e-02,   2.96835318e-05])

In [65]:
def associateProbabilities(model, sframe, data):
    probabilities = predictProbabilityUsingScores(calculateScores(model, data))
    # print "Shape of probabilies:", np.shape(probabilities)
    # print "Shape of SFrame: Dim[", sframe.num_rows(), ", ", sframe.num_cols(), "]"
    sframe['Probability'] = np.ravel(probabilities)
    return sframe

d = associateProbabilities(sentiment_model, test_data, test_matrix)

In [97]:
top30Names = sf.SFrame({'name': d.topk('Probability', k = 30)['name']})
top30Names.add_row_number()
bottom30Names = sf.SFrame({'name': d.topk('Probability', k = 30, reverse=True)['name']})
bottom30Names.add_row_number

<bound method SFrame.add_row_number of Columns:
	name	str

Rows: 30

Data:
+-------------------------------+
|              name             |
+-------------------------------+
| Fisher-Price Ocean Wonders... |
| Levana Safe N'See Digital ... |
| Safety 1st Exchangeable Ti... |
| Adiri BPA Free Natural Nur... |
| VTech Communications Safe ... |
| The First Years True Choic... |
| Safety 1st High-Def Digita... |
| Cloth Diaper Sprayer--styl... |
| Philips AVENT Newborn Star... |
| Motorola Digital Video Bab... |
+-------------------------------+
[30 rows x 1 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.>

### Quiz Q. 13

In [104]:
top30Names.add_row_number().print_rows(num_rows=30, max_column_width=60)

+----+--------------------------------------------------------------+
| id |                             name                             |
+----+--------------------------------------------------------------+
| 0  |    Infantino Wrap and Tie Baby Carrier, Black Blueberries    |
| 1  |   Buttons Cloth Diaper Cover - One Size - 8 Color Options    |
| 2  |        Britax Decathlon Convertible Car Seat, Tiffany        |
| 3  |          P'Kolino Silly Soft Seating in Tias, Green          |
| 4  |        Mamas &amp; Papas 2014 Urbo2 Stroller - Black         |
| 5  |    Evenflo X Sport Plus Convenience Stroller - Christina     |
| 6  |   Baby Jogger City Mini GT Single Stroller, Shadow/Orange    |
| 7  |         Evenflo 6 Pack Classic Glass Bottle, 4-Ounce         |
| 8  |     Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L      |
| 9  |          Diono RadianRXT Convertible Car Seat, Plum          |
| 10 |       Baby Einstein Around The World Discovery Center        |
| 11 | Freemie Hands

### Quiz Q. 14

In [106]:
bottom30Names.add_row_number().print_rows(num_rows=30, max_column_width=60)

+----+-------------------------------------------------------------+
| id |                             name                            |
+----+-------------------------------------------------------------+
| 0  |         Fisher-Price Ocean Wonders Aquarium Bouncer         |
| 1  | Levana Safe N'See Digital Video Baby Monitor with Talk-t... |
| 2  |        Safety 1st Exchangeable Tip 3 in 1 Thermometer       |
| 3  | Adiri BPA Free Natural Nurser Ultimate Bottle Stage 1 Wh... |
| 4  | VTech Communications Safe &amp; Sounds Full Color Video ... |
| 5  | The First Years True Choice P400 Premium Digital Monitor... |
| 6  |             Safety 1st High-Def Digital Monitor             |
| 7  |            Cloth Diaper Sprayer--styles may vary            |
| 8  |              Philips AVENT Newborn Starter Set              |
| 9  | Motorola Digital Video Baby Monitor with Room Temperatur... |
| 10 |            Ellaroo Mei Tai Baby Carrier - Hershey           |
| 11 |         Cosco Alpha Omega E

### Quiz Q. 15 - Finding Accuracy

The accuracy of the trained classifier can be given by:-

$$accuracy = \frac{no-of-correctly-classified-examples}{total-examples}$$

In [132]:
def get_classification_accuracy(model, sf, xColName, yColName):
    test_matrix = vectorizer.transform(sf['review_clean'].to_numpy())
    predictions = model.predict(test_matrix)
    errors = predictions == sf[yColName]
    return predictions, float(errors.sum())/len(predictions)

In [141]:
predictions, accuracy = get_classification_accuracy(sentiment_model, test_data, 'review_clean', 'sentiment')
print "Accuracy:", accuracy

Accuracy: 0.932265418766


## Classifier With Fewer Words

In [135]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [136]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])

In [137]:
simple_model = LogisticRegression()
simple_model.fit(X = train_matrix_word_subset, y = train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [148]:
simple_model_coef_table = sf.SFrame({'word':significant_words,
                                         'coefficient':simple_model.coef_.flatten()})
sentiment_model_coef_table = sf.SFrame({'word':significant_words,
                                         'coefficient':sentiment_model.coef_.flatten()})

RuntimeError: All column should be of the same length

In [144]:
simple_model_coef_table.print_rows(20)

+-----------------+--------------+
|   coefficient   |     word     |
+-----------------+--------------+
|  1.36368975931  |     love     |
|  0.943999590572 |    great     |
|  1.19253827349  |     easy     |
|  0.085512779463 |     old      |
|  0.520185762718 |    little    |
|  1.50981247669  |   perfect    |
|  1.67307389259  |    loves     |
|  0.503760457767 |     well     |
|  0.190908572065 |     able     |
| 0.0588546711524 |     car      |
|  -1.65157634496 |    broke     |
| -0.209562864534 |     less     |
| -0.511379631799 |     even     |
|  -2.03369861394 |    waste     |
|  -2.3482982195  | disappointed |
| -0.621168773641 |     work     |
| -0.320556236735 |   product    |
| -0.898030737715 |    money     |
| -0.362166742274 |    would     |
|  -2.10933109032 |    return    |
+-----------------+--------------+
[20 rows x 2 columns]



In [145]:
pos_ones = simple_model_coef_table[simple_model_coef_table['coefficient'] >= 0]

In [149]:
pos_ones.print_rows(20)

+-----------------+---------+
|   coefficient   |   word  |
+-----------------+---------+
|  1.36368975931  |   love  |
|  0.943999590572 |  great  |
|  1.19253827349  |   easy  |
|  0.085512779463 |   old   |
|  0.520185762718 |  little |
|  1.50981247669  | perfect |
|  1.67307389259  |  loves  |
|  0.503760457767 |   well  |
|  0.190908572065 |   able  |
| 0.0588546711524 |   car   |
+-----------------+---------+
[10 rows x 2 columns]



In [154]:
train_data.head(5)

name,review,rating,review_clean,sentiment
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,it came early and was not disappointed i love ...,1
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,Very soft and comfortable and warmer than it ...,1
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,This is a product well worth the purchase I ...,1
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,All of my kids have cried nonstop when I tried to ...,1
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,When the Binky Fairy came to our house we didnt ...,1
