# Amazon Review Polarity
## Text Classification Using Machine Learning Algorithms

In [28]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/72/0c/173ac467d0a53e33e41b521e4ceba74a8ac7c7873d7b857a8fbdca88302d/bayesian-optimization-1.0.1.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.0.1-cp36-none-any.whl size=10032 sha256=4a190440d725c2d580a038fd0bc260b9f4cf0f50e890bc529671f804c969205f
  Stored in directory: /root/.cache/pip/wheels/1d/0d/3b/6b9d4477a34b3905f246ff4e7acf6aafd4cc9b77d473629b77
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.0.1


In [34]:
import os
import re
import tqdm
import string
import unicodedata
import collections
import numpy as np
import scipy as sp
import pandas as pd

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes
from sklearn import metrics

from bayes_opt import BayesianOptimization

# NLTK corpora needed by the preprocessing helpers: the stop-word list used by
# tokenize(), and WordNet, which WordNetLemmatizer looks up when lemmatizing.
nltk.download("stopwords")
nltk.download("wordnet")
STOPWORDS = set(stopwords.words("english"))
# Random seed intended for the stochastic steps of the notebook.
SEED = 78

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Download the archive with TLS certificate verification left enabled, then
# unpack it (the archive extracts into ./amazon_review_polarity_csv/).
!wget \
    'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz' \
    -O './amazon_review_polarity_csv.tgz'

!tar -xzvf './amazon_review_polarity_csv.tgz'

--2020-01-19 23:39:02--  https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.186.157
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.186.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 688339454 (656M) [application/x-tar]
Saving to: ‘./amazon_review_polarity_csv.tgz’


2020-01-19 23:39:15 (52.6 MB/s) - ‘./amazon_review_polarity_csv.tgz’ saved [688339454/688339454]

amazon_review_polarity_csv/
amazon_review_polarity_csv/train.csv
amazon_review_polarity_csv/readme.txt
amazon_review_polarity_csv/test.csv


In [4]:
%%time
train = pd.read_csv('amazon_review_polarity_csv/train.csv', header=None)
test = pd.read_csv('amazon_review_polarity_csv/test.csv', header=None)
print(f'Train shape: {train.shape} - Test shape: {test.shape}')

Train shape: (3600000, 3) - Test shape: (400000, 3)
CPU times: user 16.6 s, sys: 1.27 s, total: 17.9 s
Wall time: 17.9 s


In [5]:
# The CSVs carry no header row, so give both splits the same column names.
train.columns = test.columns = ['label', 'review_title', 'review_text']
train.head()

Unnamed: 0,label,review_title,review_text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [38]:
# Class balance of each split: train first, then test.
print(train['label'].value_counts(), test['label'].value_counts(), sep='\n')

2    1800000
1    1800000
Name: label, dtype: int64
2    200000
1    200000
Name: label, dtype: int64


## Data Preprocessing

In [0]:
# Shared NLTK components; the helper functions below take them as default arguments.
tokenizer, stemmer, lemmatizer = (
    nltk.tokenize.TreebankWordTokenizer(),
    nltk.stem.PorterStemmer(),
    nltk.stem.WordNetLemmatizer(),
)

def clean_text(text):
    """Normalize a raw review string for tokenization.

    The text is lower-cased, every character other than ``a-z``, ``0-9``,
    apostrophe and hyphen is replaced by a space, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # The digit range must be 0-9: a range of 1-9 would turn "100" into "1".
    text = re.sub(r"[^a-z0-9'-]", r' ', text)
    text = re.sub(r'\s+', r' ', text)
    # Strip last: replaced punctuation at either end becomes a space that
    # stripping before the substitutions would leave behind.
    return text.strip()

def tokenize(text, tokenizer=tokenizer, stopwords=STOPWORDS):
    """Split `text` with `tokenizer.tokenize` and drop tokens found in `stopwords`.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing a ``tokenize(text)`` method.
        stopwords: Container of tokens to discard.

    Returns:
        List of the remaining tokens, in their original order.
    """
    kept = []
    for token in tokenizer.tokenize(text):
        if token in stopwords:
            continue
        kept.append(token)
    return kept

def stem(tokens, stemmer=stemmer):
    """Return a list with `stemmer.stem` applied to every token, in order."""
    return list(map(stemmer.stem, tokens))

def lemmatize(tokens, lemmatizer=lemmatizer):
    """Return a list with `lemmatizer.lemmatize` applied to every token, in order."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

## Feature Extraction

In [0]:
class Vocabulary:
    """Bidirectional word <-> index mapping with per-word occurrence counts."""

    def __init__(self):
        # word -> integer index, assigned in order of first appearance
        self.word2index = {}
        # integer index -> word (inverse of word2index)
        self.index2word = {}
        # word -> number of times it has been added
        self.word2count = {}
        # number of distinct words; also the next free index
        self.num_words = 0

    def add_word(self, word):
        """Register one occurrence of `word`, assigning a new index if unseen."""
        if word in self.word2count:
            self.word2count[word] += 1
        else:
            self.word2index[word] = self.num_words
            self.index2word[self.num_words] = word
            self.word2count[word] = 1
            self.num_words += 1

    def add_sentence(self, sentence):
        """Add every whitespace-separated word of `sentence`."""
        for word in sentence.split():
            self.add_word(word)

    def trim(self, min_count):
        '''Remove words below a certain count threshold.

        Surviving words are re-indexed from 0 in their original order and keep
        their occurrence counts, so `trim` can be called again with a higher
        threshold. Trimming an empty vocabulary is a no-op.
        '''
        kept_counts = {word: count
                       for word, count in self.word2count.items()
                       if count >= min_count}

        total = len(self.word2index)
        # Guard the ratio: an empty vocabulary would otherwise divide by zero.
        kept_pct = len(kept_counts) / total * 100 if total else 0.0
        print(f'Keeping {len(kept_counts)} over {total} '
                f'=> {kept_pct:.2f}%')

        self.word2index = {}
        self.index2word = {}
        self.word2count = {}
        self.num_words = 0

        for word, count in kept_counts.items():
            self.add_word(word)
            # add_word registers a single occurrence; restore the real count.
            self.word2count[word] = count

In [0]:
def get_bag_of_words(tokens, vocabulary: Vocabulary):
    """Count how often each vocabulary word occurs in `tokens`.

    Args:
        tokens: Iterable of tokens.
        vocabulary: Provides `num_words` and the `word2index` mapping.

    Returns:
        A NumPy array of length `vocabulary.num_words` whose entry at a word's
        index is that word's count; tokens absent from the vocabulary are ignored.
    """
    counts = np.zeros(vocabulary.num_words)
    index_of = vocabulary.word2index
    for token in tokens:
        position = index_of.get(token)
        if position is not None:
            counts[position] += 1
    return counts

def get_sparse_matrix(bows):
    """Convert each bag-of-words vector to a CSR row and stack the rows vertically."""
    rows = []
    for bow in bows:
        rows.append(sp.sparse.csr_matrix(bow))
    return sp.sparse.vstack(rows)

def train_tf_idf(train_data, tokenizer=tokenizer,
                 min_df=5, max_df=0.9, ngram_range=(1, 3)):
    """Fit a TF-IDF vectorizer on the training documents.

    Args:
        train_data: Iterable of raw text documents to fit on.
        tokenizer: Either a callable mapping a string to a list of tokens, or
            an object exposing such a callable as ``.tokenize`` (e.g. an NLTK
            tokenizer instance). ``None`` selects the vectorizer's default.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``.
        max_df: Forwarded to ``TfidfVectorizer(max_df=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        The fitted ``TfidfVectorizer``.

    Raises:
        TypeError: If `tokenizer` is neither callable nor has a callable
            ``tokenize`` attribute.
    """
    # TfidfVectorizer calls `tokenizer(doc)`, so it needs a callable. NLTK
    # tokenizer objects are not callable themselves; use their bound method.
    tokenize_fn = tokenizer
    if tokenizer is not None and not callable(tokenizer):
        tokenize_fn = getattr(tokenizer, 'tokenize', None)
        if not callable(tokenize_fn):
            raise TypeError('tokenizer must be callable or expose a '
                            'callable `tokenize` method')

    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_fn,
                                       min_df=min_df, max_df=max_df,
                                       ngram_range=ngram_range)
    tfidf_vectorizer.fit(train_data)
    return tfidf_vectorizer

def get_tf_idf(vectorizer, train_data, test_data):
    """Transform both splits with an already-fitted vectorizer.

    Returns:
        A ``(train_features, test_features)`` tuple, each produced by
        ``vectorizer.transform``.
    """
    train_features = vectorizer.transform(train_data)
    test_features = vectorizer.transform(test_data)
    return train_features, test_features

## Modeling

In [0]:
def bayesian_optimization(dataset, function, parameters, n_iterations=25):
    """Search `parameters` for the point that maximizes `function`.

    Args:
        dataset: Kept for interface compatibility; not used by this function
            (the objective is expected to carry its own data).
        function: Objective passed to ``BayesianOptimization`` to maximize.
        parameters: Search bounds passed to ``BayesianOptimization``.
        n_iterations: Value forwarded to ``maximize(n_iter=...)``.

    Returns:
        ``optimizer.max``, the best result found; `train_model` reads its
        ``'params'`` entry.
    """
    gp_params = {'alpha': 1e-4}

    # Seed the optimizer with the notebook-wide SEED so reruns are repeatable.
    optimizer = BayesianOptimization(function, parameters, random_state=SEED)
    # Configure the Gaussian process explicitly rather than through
    # maximize(**gp_params), which newer bayes_opt releases no longer accept.
    optimizer.set_gp_params(**gp_params)
    optimizer.maximize(n_iter=n_iterations)

    return optimizer.max

def train_model(data, function, parameters, clf):
    """Tune hyper-parameters with Bayesian optimization, then fit a model.

    Args:
        data: Sequence unpacked as ``(X_tr, y_tr, X_te, y_te)``; only the
            training pair is used for fitting here.
        function: Objective forwarded to `bayesian_optimization`.
        parameters: Search bounds forwarded to `bayesian_optimization`.
        clf: Callable (e.g. an estimator class) invoked as ``clf(**params)``
            with the best parameters found.

    Returns:
        The model returned by ``clf(**params)`` after ``fit(X_tr, y_tr)``.

    Raises:
        AssertionError: If `clf` is not callable.
    """
    # Validate up front so a bad `clf` fails before the expensive search runs.
    assert callable(clf), 'Error! clf must be a callable!'

    X_tr, y_tr, _X_te, _y_te = data
    best_solution = bayesian_optimization(data, function, parameters)
    params = best_solution['params']

    model = clf(**params)
    model.fit(X_tr, y_tr)

    return model

def optimize(cv_splits):
    """Build a cross-validated accuracy objective.

    The returned inner function instantiates ``clf(**params)`` and returns the
    mean of ``cross_val_score`` with ``cv=cv_splits``, ``scoring='accuracy'``
    and ``n_jobs=-1``.

    NOTE(review): `X_tr`, `y_tr` and `parameters` are not defined inside this
    function; they are resolved from the surrounding (global) scope at call
    time and raise NameError if absent. Confirm where they are meant to come
    from.

    Returns:
        A ``(function, parameters)`` tuple.
    """
    def function(clf, params):
        estimator = clf(**params)
        fold_scores = model_selection.cross_val_score(estimator, X_tr, y_tr,
                                                      cv=cv_splits,
                                                      scoring='accuracy',
                                                      n_jobs=-1)
        return fold_scores.mean()
    return function, parameters

In [33]:
# Sanity check: an estimator class such as svm.SVC is callable, which is what
# train_model asserts about its `clf` argument.
callable(svm.SVC)

True