In [2]:
import re
import pickle
import pandas as pd
import numpy as np
from sklearn.utils import resample
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 

In [5]:
df = pd.read_csv('../data/Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
df['Text'] = df['Text'] + ' ' + df['Summary']

In [7]:
# Delete unused columns
del df['Id']
del df['ProfileName']
del df['Summary']
del df['HelpfulnessNumerator']
del df['HelpfulnessDenominator']
del df['Time']
del df['ProductId']
del df['UserId']

In [8]:
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [9]:
df.loc[df['Score'] <= 3, 'ReviewSentiment'] = 0
df.loc[df['Score'] > 3, 'ReviewSentiment'] = 1

df['ReviewSentiment'] = df['ReviewSentiment'].astype(int)
#convert na to ""
df['Text'].fillna("", inplace=True)

In [10]:
def preprocess(s):
    # Remove html tags
    s = re.sub('<\S+>', '', s)
    # Replace urls with token
    s = re.sub(r'http:\S+', 'url', s)
    s = re.sub(r'https:\S+', 'url', s)
    
    s = s.lower()
    # Remove any other special characters
    s = re.sub(r'[^a-z ]', ' ', s)
    
    words = s.split()
    result = []
    
    # Remove stop words and lemmatize the words
    for word in words:
        if word in stop_words:
            continue
        word = lemmatizer.lemmatize(word)
        result.append(word)
    return ' '.join(result)

In [11]:
df['PreprocessText'] = df['Text'].apply(preprocess)
df.head()

Unnamed: 0,Score,Text,ReviewSentiment,PreprocessText
0,5,I have bought several of the Vitality canned d...,1,bought several vitality canned dog food produc...
1,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrived labeled jumbo salted peanut pe...
2,4,This is a confection that has been around a fe...,1,confection around century light pillowy citrus...
3,2,If you are looking for the secret ingredient i...,0,looking secret ingredient robitussin believe f...
4,5,Great taffy at a great price. There was a wid...,1,great taffy great price wide assortment yummy ...


In [12]:
negative = df[df['ReviewSentiment']==0]
positive = df[df['ReviewSentiment']==1]
print('Number of negative samples:', len(negative))
print('Number of positive samples:', len(positive))

Number of negative samples: 124677
Number of positive samples: 443777


In [13]:
positive_downsampled = resample(positive, replace=True, # sample with replacement
                                n_samples=len(negative), # match number in minority class
                                random_state=1)
print('Number of positive downsampled:', len(positive_downsampled))

downsampled = pd.concat([negative, positive_downsampled])
print('Total Number of rows after downsampling:', len(downsampled))

Number of positive downsampled: 124677
Total Number of rows after downsampling: 249354


In [14]:
x_train, x_test, y_train, y_test = train_test_split(downsampled['PreprocessText'], downsampled['ReviewSentiment'], test_size=0.2, random_state=1, stratify=downsampled['ReviewSentiment'])
print('Number of train samples:', len(x_train))
print('Number of test samples:', len(x_test))

Number of train samples: 199483
Number of test samples: 49871


In [15]:
# Delete unused objects
del stop_words
del lemmatizer
del df
del downsampled
del negative
del positive

In [16]:
# Majority vote classifier
class MajorityVote:
    def __init__(self, classifiers):
        # These classifiers are already 'fit' with the training data
        self.classifiers = classifiers
    
    def predict(self, X):
        predictions = np.asarray([clf.predict(X[i])
                                      for i, clf in enumerate(self.classifiers)]).T
        
        maj_vote = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x)),
                                      axis=1,
                                      arr=predictions)
        return maj_vote
    
    def score(self, X, y):
        pred_y = self.predict(X)
        correct = 0
        for i in range(len(y)):
            if pred_y[i] == y[i]:
                correct += 1
        acc = correct/len(y)
        return acc       
 

## Count Vectorizer + Logistic Regression

In [17]:
# Get the count vectors
cv = CountVectorizer()
cv_train = cv.fit_transform(x_train)
cv_test = cv.transform(x_test)

print('Shape of train count vector:', cv_train.shape)
print('Shape of test count vector:', cv_test.shape)

Shape of train count vector: (199483, 64878)
Shape of test count vector: (49871, 64878)


In [None]:
# Logistic Regression with CountVectorizer
lr_cv = pickle.load(open('model/lr_grid_model', 'rb'))
lr_cv.score(cv_test, y_test)

0.887409516552706

## TF-IDF + Logistic Regression

In [None]:
tfidfv = TfidfVectorizer()
tfidf_train = tfidfv.fit_transform(x_train)
tfidf_test = tfidfv.transform(x_test)

print('Shape of train count vector:', tfidf_train.shape)
print('Shape of test count vector:', tfidf_test.shape)

Shape of train count vector: (199483, 64878)
Shape of test count vector: (49871, 64878)


In [None]:
lr_tfidf = pickle.load(open('model/lr_tfidf', 'rb'))
print(lr_tfidf.score(tfidf_test, y_test))

0.8889936034970223


## Count Vectorizer + Naive Bayes

In [18]:
nb_cv = pickle.load(open('model/nb', 'rb'))
nb_cv.score(cv_test, y_test)

0.8582743478173688

## Glove + Logistic Regression

In [None]:
embeddings_index = {}
with open('glove/glove.6B.300d.txt', encoding="utf8") as f:
    line = f.readline()
    while line:
        values = line.split()
        word = values[0]
        try:
           coefs = np.asarray(values[1:], dtype='float32')
           embeddings_index[word] = coefs
        except ValueError:
           pass
        line = f.readline()


In [None]:
xtrain_glove = [doc2vec(x) for x in tqdm(x_train)]
xtest_glove = [doc2vec(x) for x in tqdm(x_test)]

In [None]:
glv_lr = pickle.load(open('model/lr_glove_grid_model', 'rb'))
print('Loaded GridCV Model on test data:', glv_lr.score(xtest_glove, y_test.tolist()))

## Ensemble
1. CV + LR
1. TFIDF + LR
1. CV + NB

In [38]:
classifiers = []
X = []

classifiers.append(lr_cv)
X.append(cv_test)

classifiers.append(lr_tfidf)
X.append(tfidf_test)

classifiers.append(nb_cv)
X.append(cv_test)

mv = MajorityVote(classifiers)
mv_pred = mv.predict(X)

acc = mv.score(X, y_test.to_list())
print('Accuracy on test data: %.2f' %(acc))

Accuracy on test data: 0.89
