# Baseline Food Review Classification Model using Naive Bayes

In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
# Read from Dave's file directory
df = pd.read_csv('~/.kaggle/datasets/snap/amazon-fine-food-reviews/Reviews.csv', encoding='utf8')
df_imdb = pd.read_csv('../data_prep/imdb_1_5.csv', encoding='utf8')
df_yelp = pd.read_csv('../data_prep/yelp_1_5.csv', encoding='utf8')
df_beer = pd.read_csv('../data_prep/rate_beer_binary_medium.csv', encoding='utf8')

In [5]:
df = df.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

For development, just use 10 rows for simplicity and speed

In [6]:
print(df.shape)

(568454, 2)


In [7]:
def convertTo1or5(score):
    if score == 2:
        return 1
    elif score == 4:
        return 5
    else:
        return score

# Reduce training dataset to 1s and 5s
df_binary = df[(df.Score == 1) | (df.Score == 2) | (df.Score == 4) | (df.Score == 5)]
df_binary.reset_index(inplace=True)

df_binary['Score'] = df_binary['Score'].apply(lambda score: convertTo1or5(score))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Tokenize the reviews

In [8]:
df_binary['Tokens'] = df_binary['Text'].apply(lambda text: word_tokenize(text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Create train / test split

In [9]:
from sklearn.model_selection import train_test_split
X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(df_binary['Text'], df_binary['Score'], test_size=0.33)

Convert to vectors for input to BernoulliNB

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_tokens)
X_test = vectorizer.transform(X_test_tokens)

Create Bernoulli Naive Bayes model

In [11]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

In [12]:
nb = BernoulliNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set: {:.02%}".format(accuracy))

Accuracy on test set: 87.43%


In [13]:
# The below code will use the Amazon trained Naive Bayes classifer and predict IMDB reviews

# Convert text into tokens
df_imdb['Tokens'] = df_imdb['Text'].apply(lambda text: word_tokenize(text))

# Ensure train and test matrices are in same vector shapes for BernoulliNB model
X_imdb_train_tokens, X_imdb_test_tokens, y_imdb_train, y_imdb_test = train_test_split(df_imdb['Text'], df_imdb['Score'], test_size=.99)
imdb_test = vectorizer.transform(X_imdb_test_tokens)

# Calculate predictions of test data
y_imdb_pred = nb.predict(imdb_test)
imdb_accuracy = accuracy_score(y_imdb_test,y_imdb_pred)
print("IMDB test size: ", y_imdb_pred.shape)
print("Accuracy on IMDB test set: {:.02%}".format(imdb_accuracy))

IMDB test size:  (1980,)
Accuracy on IMDB test set: 50.61%


In [14]:
# The below code will use the Amazon trained Naive Bayes classifer and predict yelp reviews

# Convert text into tokens
df_yelp['Tokens'] = df_yelp['Text'].apply(lambda text: word_tokenize(text))

# Ensure train and test matrices are in same vector shapes for BernoulliNB model
X_yelp_train_tokens, X_yelp_test_tokens, y_yelp_train, y_yelp_test = train_test_split(df_yelp['Text'], df_yelp['Score'], test_size=.99)
yelp_test = vectorizer.transform(X_yelp_test_tokens)

# Calculate predictions of test data
y_yelp_pred = nb.predict(yelp_test)
yelp_accuracy = accuracy_score(y_yelp_test,y_yelp_pred)
print("Yelp test size: ", y_yelp_pred.shape)
print("Accuracy on yelp test set: {:.02%}".format(yelp_accuracy))

Yelp test size:  (1425,)
Accuracy on yelp test set: 58.11%


In [15]:
# The below code will use the Amazon trained Naive Bayes classifer and predict RateBeer reviews

# Convert text into tokens
df_beer['Tokens'] = df_beer['Text'].apply(lambda text: word_tokenize(text))

# Ensure train and test matrices are in same vector shapes for BernoulliNB model
X_beer_train_tokens, X_beer_test_tokens, y_beer_train, y_beer_test = train_test_split(df_beer['Text'], df_beer['Score'], test_size=.99)
beer_test = vectorizer.transform(X_beer_test_tokens)

# Calculate predictions of test data
y_beer_pred = nb.predict(beer_test)
beer_accuracy = accuracy_score(y_beer_test,y_beer_pred)
print("RateBeer test size: ", y_beer_pred.shape)
print("Accuracy on RateBeer test set: {:.02%}".format(beer_accuracy))

RateBeer test size:  (70714,)
Accuracy on RateBeer test set: 58.97%
