# Capstone Project: Amazon Review Classification (Part 3)
Author: **Steven Lee**

# Transformers and Pretrained Models

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Libraries" data-toc-modified-id="Import-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Libraries</a></span></li><li><span><a href="#Prepare-Data" data-toc-modified-id="Prepare-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare Data</a></span></li><li><span><a href="#Sentence-Clustering" data-toc-modified-id="Sentence-Clustering-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence Clustering</a></span></li><li><span><a href="#Zero-Shot-Classification" data-toc-modified-id="Zero-Shot-Classification-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Zero-Shot Classification</a></span></li></ul></div>

## Import Libraries

In [1]:
import pandas as pd
from random import sample

# Set pandas display options.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

## Prepare Data

In [2]:
# Load the LDA review data from CSV into a DataFrame.
reviews = pd.read_csv(filepath_or_buffer="../data/reviews_lda.csv")

In [5]:
# Replace missing brand entries with the literal string "None".
reviews['brand'] = reviews['brand'].fillna("None")

## Sentence Clustering

In [4]:
# Sentence-embedding model used below to encode the review texts.
embedder = SentenceTransformer(model_name_or_path='paraphrase-distilroberta-base-v1')

# Small example corpus, kept disabled for reference:
# corpus = ['A man is eating food.',
#           'A man is eating a piece of bread.',
#           'A man is eating pasta.',
#           'The girl is carrying a baby.',
#           'The baby is carried by the woman',
#           'A man is riding a horse.',
#           'A man is riding a white horse on an enclosed ground.',
#           'A monkey is playing drums.',
#           'Someone in a gorilla costume is playing a set of drums.',
#           'A cheetah is running behind its prey.',
#           'A cheetah chases prey on across a field.'
#           ]

# corpus_embeddings = embedder.encode(corpus)

In [8]:
# Use the first 500 review texts (by position) as the corpus, then embed them.
review_texts = reviews['reviewText']
corpus = review_texts.iloc[:500]
corpus_embeddings = embedder.encode(corpus)

In [9]:
# Preview the first five embedding vectors.
corpus_embeddings[:5]

array([[ 0.1338466 ,  0.37409598,  0.03262435, ..., -0.21958704,
         0.25208724,  0.25093827],
       [-0.04750726,  0.48556578, -0.2864431 , ...,  0.06699755,
         0.19338712,  0.11747508],
       [ 0.20254762,  0.7905678 ,  0.04845462, ...,  0.5033512 ,
         0.40976092, -0.09966847],
       [-0.18335253,  0.08183303, -0.23407994, ..., -0.07139245,
         0.2032198 , -0.2108928 ],
       [-0.04755043,  0.18304166, -0.04208577, ..., -0.13454206,
         0.00271911, -0.15800172]], dtype=float32)

In [12]:
# Perform k-means clustering on the sentence embeddings.
num_clusters = 28
# Fix random_state so the centroid initialisation -- and therefore the cluster
# ids inspected in later cells -- is the same on every run. n_init is pinned
# explicitly because its default differs across scikit-learn releases.
clustering_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
clustering_model.fit(corpus_embeddings)
# One integer cluster id per embedded sentence, in corpus order.
cluster_assignment = clustering_model.labels_

In [13]:
# Preview the cluster ids assigned to the first five sentences.
cluster_assignment[:5]

array([10, 21, 10, 12,  2])

In [14]:
# for sentence_id, cluster_id in enumerate(cluster_assignment):
#     if sentence_id == 10: break
#     print(cluster_id, sentence_id)

In [15]:
# Group the review texts by their assigned cluster id:
# clustered_sentences[k] collects every text whose label is k.
# Texts and labels are paired positionally with zip() instead of looking each
# text up via corpus[sentence_id]; on a pandas Series that lookup is
# label-based and only works while the index happens to be 0..n-1.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence, cluster_id in zip(corpus, cluster_assignment):
    clustered_sentences[cluster_id].append(sentence)

In [16]:
# Inspect the texts grouped under cluster index 8.
clustered_sentences[8]

['This little device keeps my marriage running. When I want to read and she wants to sleep it is the perfect tool. Shines bright in a small area and easy on batteries.',
 'my wife uses every night to read in bed! very bright but concentrated so not to bother me! however, the adapter sux; wire keeps breaking',
 'Second one purchased. Perfect for low lighting in the bedroom at night. Wife kept stealing it so I had to order her one as well!',
 'Perfect. My wife can now read through the night and I can sleep soundly.',
 "Perfect reading light for my daughter's bed. Adjustable arm stays where you put it, and it has two levels of brightness.",
 "Great reading light that I attached to my son's bed. The adjustable arm adjusts easily and stays where you put it. It also has two levels of brightness.",
 'Practical, good amount of light without bothering my sleeping wife. Easy to use and seems well-constructed.',
 "I bought this for my wife.  She already had a book light but it wasn't very bright.

In [29]:
# Print each cluster under a 1-based heading, followed by a blank separator line.
for cluster_number, members in enumerate(clustered_sentences, start=1):
    print("Cluster ", cluster_number)
    print(members)
    print("")

Cluster  1
['Super great clip for the low price', 'good cheap replacement', 'Perfect replacement and good price.', 'perfect replacements for my KitchenAid drop in microwave/oven combo', 'Works well and cheap fix for a broken lamp', 'Nice replacement bulb, at a super low price']

Cluster  2
["It's a bulb and lasts pretty good. Halogens are hot", "Works great, lots cheaper than the bulbs I was buying at Lowe's.", "It's a bulb...", 'A little smaller than the original bulb, but it does the job just fine.', 'I got this pack of bulbs for my halogen lamp. I think I have enough to last a lifetime now, but the bulbs are great!', 'love these bulbs and the price', '$10 for 10 bulbs, compared to HD, $5 for ONE bulb? No brainer. Perfect replacement for my undercounter lights.', 'These bulbs were larger in size than the original bulbs but they work very nicely.', 'great deal on good bulbs', "They're light bulbs...what can I say. They work.", '$1.10/bulb for a $4 bulb: Awesome! Accurate size descript

## Zero-Shot Classification

In [17]:
from transformers import pipeline

# Zero-shot classification pipeline built on the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartModel: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification m

The warning is telling us that some entries in the checkpoint were not used when the model was initialized; in the output shown above these are only `model.encoder.version` and `model.decoder.version`. The message itself is generic: its wording covers the case where a pretrained model's head is replaced and must be fine-tuned, which is not what happens here. `facebook/bart-large-mnli` is loaded as a sequence-classification model and is used as-is for zero-shot inference, so no fine-tuning is performed in this notebook and the warning can be safely ignored.

In [20]:
# Review snippet to classify.
# (Alternative example kept for reference: "one day I will see the world"
#  with candidate labels ['travel', 'cooking', 'dancing'].)
sequence_to_classify = "Keeps the mist of your wood trim and on you. Bendable too."

# Candidate labels the classifier ranks for the snippet.
candidate_labels = [
    'fit', 'price', 'quality', 'design', 'feature', 'beauty', 'portable',
    'bright', 'convenient', 'useful', 'effective', 'easy', 'setting', 'size',
    'warranty', 'color', 'sound', 'display', 'install', 'efficient', 'durable',
    'elegant', 'style', 'material', 'flexible', 'texture', 'sturdy', 'lasting',
]

# Recorded result for the alternative example noted above:
# {'labels': ['travel', 'dancing', 'cooking'],
#  'scores': [0.9938651323318481, 0.0032737774308770895, 0.002861034357920289],
#  'sequence': 'one day I will see the world'}

# The returned dict (sequence, ranked labels, scores) is the cell's output.
classifier(sequence_to_classify, candidate_labels)

{'sequence': 'Keeps the mist of your wood trim and on you. Bendable too.',
 'labels': ['useful',
  'effective',
  'convenient',
  'durable',
  'efficient',
  'lasting',
  'feature',
  'sturdy',
  'flexible',
  'fit',
  'quality',
  'sound',
  'portable',
  'style',
  'setting',
  'design',
  'display',
  'elegant',
  'size',
  'beauty',
  'material',
  'texture',
  'bright',
  'install',
  'warranty',
  'color',
  'price',
  'easy'],
 'scores': [0.14417190849781036,
  0.13556835055351257,
  0.09085024148225784,
  0.05804915726184845,
  0.0545324869453907,
  0.05263342335820198,
  0.04637948423624039,
  0.04562646523118019,
  0.045600395649671555,
  0.04559176415205002,
  0.04240129142999649,
  0.03137057274580002,
  0.030001936480402946,
  0.029940489679574966,
  0.02437846176326275,
  0.018749482929706573,
  0.017612097784876823,
  0.01677836664021015,
  0.012511210516095161,
  0.01239504013210535,
  0.011936400085687637,
  0.008958389051258564,
  0.005489616189152002,
  0.00542015768

In [6]:
# candidate_labels = ['travel', 'cooking', 'dancing', 'exploration']
#classifier(sequence_to_classify, candidate_labels, multi_class=True)
# {'labels': ['travel', 'exploration', 'dancing', 'cooking'],
# 'scores': [0.9945111274719238,
#  0.9383890628814697,
#  0.0057061901316046715,
#  0.0018193122232332826],
# 'sequence': 'one day I will see the world'}