In [1]:
import re
import pickle
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Lemmatizer instance reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
# English stop words, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [3]:
# Load the raw reviews file and preview its first rows.
REVIEWS_CSV = '../data/Reviews.csv'
df = pd.read_csv(REVIEWS_CSV)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# Append the summary to the review body so both end up in the text that is
# preprocessed and classified below.
# In pandas, str + NaN is NaN, so a missing Summary (or Text) would otherwise
# erase the whole row's text; treat a missing part as an empty string instead.
df['Text'] = df['Text'].fillna('') + ' ' + df['Summary'].fillna('')

In [5]:
# Remove every column that is not used below; the deletions run in the listed
# order and modify df in place.
UNUSED_COLUMNS = (
    'Id',
    'ProfileName',
    'Summary',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Time',
    'ProductId',
    'UserId',
)
for unused in UNUSED_COLUMNS:
    del df[unused]

In [6]:
# Confirm that only Score and Text remain after the column cleanup.
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [7]:
# Binary label derived from the star rating: Score <= 3 -> 0, Score > 3 -> 1.
df.loc[df['Score'] <= 3, 'ReviewSentiment'] = 0
df.loc[df['Score'] > 3, 'ReviewSentiment'] = 1

# Cast the label column to int. A row whose Score matched neither mask
# (i.e. a missing Score) would still hold NaN here and make this cast raise.
df['ReviewSentiment'] = df['ReviewSentiment'].astype(int)

# Replace missing text with "" so preprocess() always receives a str.
# The result is assigned back on purpose: df['Text'].fillna(..., inplace=True)
# operates on a temporary Series, emits a FutureWarning on pandas 2.x and does
# not update df at all under pandas Copy-on-Write.
df['Text'] = df['Text'].fillna("")

In [8]:
def preprocess(s):
    """Normalise one review into a space-separated string of lemmatized tokens.

    Steps, in order: remove simple HTML tags, replace http/https URLs with the
    token ``url``, lowercase, turn every character other than ``a-z`` and space
    into a space, drop English stop words, and lemmatize the remaining words.

    Args:
        s: Raw review text (str).

    Returns:
        The surviving tokens joined by single spaces; "" if none survive.
    """
    # Remove html tags.
    # Raw string: '\S' inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The pattern itself is unchanged.
    # NOTE(review): this pattern cannot match tags that contain whitespace
    # ('<br />', '<a href="...">'), and because \S+ is greedy '<b>word</b>' is
    # removed together with 'word'. Changing it alters the produced tokens;
    # presumably the pickled vectorizer was fitted with this exact
    # preprocessing, so confirm and change both sides together.
    s = re.sub(r'<\S+>', '', s)
    # Replace urls with token (one pattern covers both http: and https:).
    s = re.sub(r'https?:\S+', 'url', s)

    s = s.lower()
    # Remove any other special characters
    s = re.sub(r'[^a-z ]', ' ', s)

    # Remove stop words and lemmatize the words
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in s.split()
        if word not in stop_words
    )

In [9]:
# Clean every review text element-wise and store the result in a new column.
df['PreprocessText'] = df['Text'].map(preprocess)

In [10]:
# Spot-check the ReviewSentiment and PreprocessText columns added above.
df.head()

Unnamed: 0,Score,Text,ReviewSentiment,PreprocessText
0,5,I have bought several of the Vitality canned d...,1,bought several vitality canned dog food produc...
1,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrived labeled jumbo salted peanut pe...
2,4,This is a confection that has been around a fe...,1,confection around century light pillowy citrus...
3,2,If you are looking for the secret ingredient i...,0,looking secret ingredient robitussin believe f...
4,5,Great taffy at a great price. There was a wid...,1,great taffy great price wide assortment yummy ...


In [11]:
# Load the pickled vectorizer (presumably a fitted TfidfVectorizer - confirm
# against the notebook that produced it) and transform the cleaned reviews.
# SECURITY: pickle.load can execute arbitrary code; only load model files that
# come from a trusted source.
# The with-block closes the file handle, which pickle.load(open(...)) would
# leave open until it is garbage collected.
with open('model/tfidfv', 'rb') as tfidfv_file:
    tfidfv = pickle.load(tfidfv_file)
fv = tfidfv.transform(df['PreprocessText'])

In [12]:
# Shape of the transformed matrix; presumably (number of reviews, number of
# features known to the loaded vectorizer).
fv.shape

(568454, 64878)

In [13]:
# Load the pickled classifier (presumably a fitted LogisticRegression -
# confirm) and predict a label for every row of the feature matrix.
# SECURITY: pickle.load can execute arbitrary code; only load model files that
# come from a trusted source.
# The with-block closes the file handle, which pickle.load(open(...)) would
# leave open until it is garbage collected.
with open('model/lr_tfidf', 'rb') as lr_tfidf_file:
    lr_tfidf = pickle.load(lr_tfidf_file)
y_pred = lr_tfidf.predict(fv)

In [14]:
# Print the first MAX_EXAMPLES reviews whose predicted label differs from the
# label derived from Score, then print how many were shown.
MAX_EXAMPLES = 20

# y_pred holds one prediction per row of df, in row order, so compare by
# position. Looking values up with df[col][i] is label-based and only matches
# the enumerate() position while df keeps its default 0..n-1 index.
# Assumes y_pred is a NumPy array, as returned by scikit-learn's predict().
actual = df['ReviewSentiment'].to_numpy()
wrong_pos = (y_pred != actual).nonzero()[0][:MAX_EXAMPLES]
wrong = df.iloc[wrong_pos]

for predicted, label, score, text in zip(
        y_pred[wrong_pos], wrong['ReviewSentiment'], wrong['Score'], wrong['Text']):
    print('\nActual:', label)
    print('Predicted:', predicted)
    print('Score:', score)
    print('Text:', text)
print(len(wrong_pos))


Actual: 0
Predicted: 1
Score: 2
Text: I love eating them and they are good for watching TV and looking at movies! It is not too sweet. I like to transfer them to a zip lock baggie so they stay fresh so I can take my time eating them. poor taste

Actual: 1
Predicted: 0
Score: 5
Text: Product received is as advertised.<br /><br /><a href="http://www.amazon.com/gp/product/B001GVISJM">Twizzlers, Strawberry, 16-Ounce Bags (Pack of 6)</a> Twizzlers - Strawberry

Actual: 1
Predicted: 0
Score: 5
Text: What else do you need to know? Oatmeal, instant (make it with a half cup of low-fat milk and add raisins;nuke for 90 seconds). More expensive than Kroger store brand oatmeal and maybe a little tastier or better texture or something. It's still just oatmeal. Mmm, convenient! it's oatmeal

Actual: 1
Predicted: 0
Score: 4
Text: I really like the Maple and Brown Sugar flavor. The regular is fine with brown sugar added. The Apples and Cinnamon flavor is OK. This is a very quick, easy and satisfying b