In [1]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Open DataFrames
df = pd.read_csv('df_10.csv')

# Show first 5 rows
df.head()

Unnamed: 0.1,Unnamed: 0,index,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,...,Total_Votes,Helpful_Percentage,Review_Length,Sentence_Length,Word_Length,Helpful_Rating,Helpful_Votes_Scaled,Helpful_Percentage_Book_Score,Badge,Helpful?
0,14,34,000100039X,"[81, 92]",5.0,This is one of the first (literary) books I re...,"06 24, 2000",A1NPNGWBVD9AK3,"Bruce Kendall ""BEK""",Simple Wisdom,...,92,0.880435,1542,15.0,4.258865,0.921901,0.993132,0.587119,4,1
1,18,43,000100039X,"[8, 10]",5.0,The Prophet is Kahlil Gibran's best known work...,"10 27, 2009",A3IS4WGMFR4X65,"Colin Lewis ""WisdomNote""",The most loving book ever written,...,10,0.8,2294,22.705882,4.883289,0.816553,0.851511,0.545942,4,1
2,19,46,000100039X,"[8, 10]",5.0,Gibran Khalil Gibran was born in 1883 in what ...,"01 10, 2006",AWLFVCT9128JV,"Dave_42 ""Dave_42""",The Lessons Of Life,...,10,0.8,712,19.428571,4.103704,0.816553,0.851511,0.545942,4,1
3,35,75,000100039X,"[10, 12]",5.0,"Certainly the words are of Kahlil Gibran, but ...","11 5, 2001",A1SP45I55GQIIE,harendra desai,divine wisdom in earthly words.,...,12,0.833333,700,21.0,4.675,0.851313,0.889774,0.547093,4,1
4,36,76,000100039X,"[0, 13]",2.0,"I evidently misread the writeup, I thought it ...","05 3, 2009",A2ZB1G1KUE6OS6,Harley,Unexpected purchase,...,13,0.0,178,11.666667,3.885714,0.010826,0.0,0.541325,1,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 787981 entries, 0 to 787980
Data columns (total 22 columns):
Unnamed: 0                       787981 non-null int64
index                            787981 non-null int64
asin                             787981 non-null object
helpful                          787981 non-null object
overall                          787981 non-null float64
reviewText                       787981 non-null object
reviewTime                       787981 non-null object
reviewerID                       787981 non-null object
reviewerName                     786144 non-null object
summary                          787978 non-null object
unixReviewTime                   787981 non-null int64
Helpful_Votes                    787981 non-null int64
Total_Votes                      787981 non-null int64
Helpful_Percentage               787981 non-null float64
Review_Length                    787981 non-null int64
Sentence_Length                  787981 non-null floa

In [4]:
df['Helpful'] = df['Helpful?']

In [5]:
corpus = df['reviewText']

import nltk
import re
nltk.download('stopwords')

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(corpus)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/coreyjwade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def make_xy(df, vectorizer):
    vectorizer = vectorizer
    X = vectorizer.fit_transform(df.reviewText)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = df.Helpful
    return X, y

def make_xy_norm(df, vectorizer):
    vectorizer = vectorizer
    X = vectorizer.fit_transform(norm_corpus)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = df.Helpful
    return X, y

In [7]:
vectorizers = [CountVectorizer(), CountVectorizer(ngram_range=(1,2)), CountVectorizer(ngram_range=(1,3)), TfidfVectorizer(), TfidfVectorizer(ngram_range=(1, 2)), TfidfVectorizer(ngram_range=(1, 3))]

In [8]:
xy_options = [make_xy, make_xy_norm]

In [20]:
# keras_regression_test requires "from sklearn.model_selection import train_test_split"

def keras_classification_test(X, y, numbers=[128,64], activation='relu', optimizer='adam', loss='binary_crossentropy'):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    # Save the number of columns in predictors: n_cols
    n_cols = X_train.shape[1]
    input_shape = (n_cols,)

    # Set up the model: model
    model = Sequential()
    
    # Add the first layer
    model.add(Dense(numbers[0], activation=activation, input_shape=(n_cols,)))
    
    # Add addition layers
    for i in range(len(numbers)-1):
        model.add(Dense(numbers[i+1], activation=activation))

    # Add the output layer
    model.add(Dense(1, activation='softmax'))

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Define early_stopping_monitor
    early_stopping_monitor = EarlyStopping(patience=2)

    # Fit the model
    model.fit(X_train, y_train, validation_split=0.3, epochs=30, callbacks=[early_stopping_monitor])
    
    # Get score for predictions
    score = model.evaluate(X_test, y_test)
    
    # Print score
    print(score)

In [10]:
df_original = df.copy

In [11]:
df = df.sample(frac=0.1)

In [12]:
len(df)

78798

In [19]:
len(df[df['Helpful']==0])/len(df)

0.27840808142338636

In [13]:
X, y = make_xy(df, CountVectorizer())

In [17]:
keras_classification_test(X, y, numbers=[128,64])

Train on 41368 samples, validate on 17730 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
[0.7913176821694157, 0.7397969543026184]


In [21]:
X, y = make_xy(df, CountVectorizer(1,2))

In [22]:
keras_classification_test(X, y, numbers=[100,50])

Train on 41368 samples, validate on 17730 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
[4.487336421133903, 0.7185279188059308]


In [23]:
X, y = make_xy(df, TfidfVectorizer())

In [25]:
keras_classification_test(X, y, numbers=[128,64])

Train on 41368 samples, validate on 17730 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
[4.39669951375971, 0.7242131979695432]


In [None]:
X, y = make_xy(df, CountVectorizer())
keras_classification_test(X, y, numbers=[100, 100, 100])

Train on 41368 samples, validate on 17730 samples
Epoch 1/30
   32/41368 [..............................] - ETA: 42:23 - loss: 2.4910 - acc: 0.8438