In [21]:
import pandas as pd
import numpy as np

# Load the Amazon baby-product reviews and discard rows with missing values.
reviews_raw = pd.read_csv('amazon_baby.csv').dropna()

# Neutral 3-star reviews carry no clear sentiment, so they are filtered out.
is_polar = reviews_raw.rating != 3

# Renumber the surviving rows from 0 and keep only the first 10,000 of them.
products = reviews_raw[is_polar].reset_index(drop=True).iloc[:10000]

def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``.
    All other characters (letters, digits, whitespace, non-ASCII text) are
    kept in their original order.

    The previous ``text.translate(None, string.punctuation)`` form only works
    on Python 2 byte strings; it raises ``TypeError`` on Python 3 ``str`` and
    on Python 2 ``unicode``. Filtering character by character behaves the same
    on all of them.
    """
    import string
    # A frozenset gives O(1) membership tests while scanning the text.
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)
# Punctuation-free copy of every review text, used as input to the vectorizers.
products['review_clean'] = products.review.apply(remove_punctuation)

# Binary label: ratings above 3 become +1, the rest become -1.
products['sentiment'] = products.rating.apply(lambda x: +1 if x>3 else -1)

import random

# Fixed seed so the 80/20 split (and every score derived from it) is the same
# on each "Restart & Run All". A private Random instance is used so the global
# random state is left untouched.
SPLIT_SEED = 0

products_idx = range(len(products))
# Floor division: the sample size must be an int (plain `/` yields a float on
# Python 3 and makes random.sample fail).
test_idx = random.Random(SPLIT_SEED).sample(products_idx, len(products) // 5)
# Set lookup keeps the train/test partition linear instead of quadratic.
test_idx_lookup = set(test_idx)
train_idx = [i for i in products_idx if i not in test_idx_lookup]

test_data = products.loc[test_idx].reset_index(drop=True)

# Drop incomplete rows *before* renumbering so the training index stays
# contiguous even if a row is removed here.
train_data = products.loc[train_idx].dropna().reset_index(drop=True)

# Feature engineering and models

## sentiment_model based on CountVectorizer

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features. The explicit token pattern also keeps one-character
# tokens, which scikit-learn's default pattern would discard.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')

# The vocabulary is learned from the training reviews only and then reused
# unchanged to encode the test reviews.
train_reviews = train_data['review_clean']
train_matrix = vectorizer.fit_transform(train_reviews)
test_matrix = vectorizer.transform(test_data['review_clean'])

In [63]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with library-default settings, trained on the
# full bag-of-words matrix against the +1/-1 sentiment labels.
sentiment_model = LogisticRegression()
sentiment_model.fit(X=train_matrix, y=train_data.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
sentiment_model.score(train_matrix,train_data['sentiment'])
# Training-set (in-sample) accuracy as reported by scikit-learn's `score`;
# the in-sample error E_in is 1 minus this value.

0.995

In [72]:
np.round(sentiment_model.score(test_matrix,test_data['sentiment']),2)
# Held-out (out-of-sample) accuracy rounded to 2 decimals;
# the out-of-sample error E_out is 1 minus this value.

0.89000000000000001

## simple_model based on significant_words

In [74]:
# Hand-picked vocabulary of 20 words used by the reduced model.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

# With a fixed vocabulary the vectorizer counts only these words.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data.review_clean)
test_matrix_word_subset = vectorizer_word_subset.transform(test_data.review_clean)

In [81]:
# Same classifier type as before, but trained only on the word-subset counts.
simple_model = LogisticRegression()
simple_model.fit(X=train_matrix_word_subset, y=train_data.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [85]:
# Pair each significant word with its learned weight. `coef_` is flattened to
# one dimension so it lines up with the word list.
simple_model_coef_table = pd.DataFrame(
    {
        'word': significant_words,
        'coefficient': simple_model.coef_.ravel(),
    }
)
simple_model_coef_table.head()

Unnamed: 0,coefficient,word
0,1.437587,love
1,0.741325,great
2,1.133432,easy
3,0.175802,old
4,0.487788,little


In [87]:
np.round(simple_model.score(train_matrix_word_subset,train_data['sentiment']),2)
# Training-set (in-sample) accuracy rounded to 2 decimals;
# the in-sample error E_in is 1 minus this value.

0.81999999999999995

In [92]:
np.round(simple_model.score(test_matrix_word_subset,test_data['sentiment']),2)
# Held-out (out-of-sample) accuracy rounded to 2 decimals;
# the out-of-sample error E_out is 1 minus this value.

0.82999999999999996

## tfidf_model based on TF-IDF

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features built from the same cleaned reviews.
# NOTE(review): this rebinds `vectorizer`, which until now held the
# CountVectorizer; after this cell the count-based vectorizer is only
# recoverable by re-running its own cell.
vectorizer = TfidfVectorizer(min_df=1)

# Fit on the training reviews only, then encode the test reviews with the
# fitted vocabulary and IDF weights.
train_reviews_tfidf = train_data['review_clean']
train_matrix_tfidf = vectorizer.fit_transform(train_reviews_tfidf)
test_matrix_tfidf = vectorizer.transform(test_data['review_clean'])

In [114]:
# Logistic-regression classifier with default settings on the TF-IDF features.
tfidf_model = LogisticRegression()
tfidf_model.fit(X=train_matrix_tfidf, y=train_data.sentiment)

In [116]:
tfidf_model.score(train_matrix_tfidf,train_data['sentiment'])
# Training-set (in-sample) accuracy as reported by scikit-learn's `score`;
# the in-sample error E_in is 1 minus this value.

0.90500000000000003

In [117]:
tfidf_model.score(test_matrix_tfidf,test_data['sentiment'])
# Held-out (out-of-sample) accuracy as reported by scikit-learn's `score`;
# the out-of-sample error E_out is 1 minus this value.

0.878

In [118]:
# (training reviews, vocabulary size) of the TF-IDF feature matrix.
train_matrix_tfidf.shape

(8000, 21654)

In [119]:
# (training reviews, vocabulary size) of the bag-of-words count matrix.
train_matrix.shape

(8000, 21687)

In [120]:
# (training reviews, number of significant words) of the word-subset matrix.
train_matrix_word_subset.shape

(8000, 20)