# Capstone Project: Amazon Review Classification (Part 4)
Author: **Steven Lee**

In [1]:
# Standard library.
from random import sample

# Third-party.
import pandas as pd
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import HTML, display
from transformers import pipeline

# Set pandas display options so wide tables and long review text are not cut off.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 150)

In [2]:
# Read in the LDA data: the demo reviews and the keyword table.
reviews = pd.read_csv("../data/reviews_demo.csv")
keywords = pd.read_csv("../data/keywords_lda.csv")

# Number of review rows; used as the upper bound when a review is sampled.
total_rev = len(reviews)

In [3]:
# Round every "Txx-WT" column (T00-WT .. T29-WT) to 4 decimals for display.
# NOTE(review): 30 is presumably the number of LDA topics - confirm against keywords_lda.csv.
for weight_col in ("T{:02d}-WT".format(topic_no) for topic_no in range(30)):
    keywords[weight_col] = keywords[weight_col].round(4)

In [4]:
# Initialise pipeline transformers.
# Sentiment analysis: no checkpoint is named, so the library picks the model.
sa_pipeline = pipeline(task="sentiment-analysis")

# Two zero-shot classifiers, compared side by side further down.
# NOTE(review): "bert-base-cased" is a base checkpoint. The load-time warning
# captured in this cell's output says some BertForSequenceClassification
# weights were not initialized from it, so its classification head is
# presumably untrained and its scores should not be trusted - confirm, and
# consider an NLI-fine-tuned checkpoint instead.
bert_cl_pl = pipeline(task="zero-shot-classification", model="bert-base-cased")
fb_cl_pl = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

## Sentiment Analysis

In [5]:
# Randomly select a review: `sample` returns a one-element list holding the row label.
rev_idx = sample(range(total_rev), k=1)

# Pull the fields used by the rest of the notebook out of that row.
review_fields = ['dmnt_topic', 'reviewText', 'document', 'overall']
topic, rev_txt, rev_doc, rating = reviews.loc[rev_idx[0], review_fields]

In [6]:
# Show the sampled review text together with its rating.
print("Review Text:\n")
print(rev_txt, "\n")
print(f"Review Rating (max. 5): {rating}\n")

# truncation=True clips reviews that exceed the model's maximum input length.
# Without it, an over-long review makes the pipeline raise instead of
# returning a sentiment label.
output = sa_pipeline(rev_txt, truncation=True)
output

Review Text:

Very pleased with this door knob.  "Juno" is better quality than the "Tylo", both from Qwikset.  For only a couple dollars more.  Highly recommend the "Juno" model series.  Smart-key works as advertised, easy-peasy.  Finally get my front door and back to use same key ... after too many years. 

Review Rating (max. 5): 4.0



[{'label': 'POSITIVE', 'score': 0.9948467016220093}]

## Zero-Shot Classification

In [7]:
# Names of the keyword (-KW) and weight (-WT) columns for the sampled review's topic.
cols = ["T{:02d}-KW".format(topic), "T{:02d}-WT".format(topic)]

print("LDA Model Label-Score Pairs:")
keywords[cols].T

LDA Model Label-Score Pairs:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
T20-KW,door,lock,open,easy,close,knob,front,great,work,install
T20-WT,0.0888,0.049,0.0315,0.0187,0.0154,0.0142,0.0127,0.0121,0.0109,0.0101


In [8]:
# The keywords of the review's topic are used as the zero-shot candidate labels.
candidate_labels = list(keywords[cols[0]])

# Score the review document against those labels with each classifier.
results_bert = bert_cl_pl(rev_doc, candidate_labels=candidate_labels)
results_fb = fb_cl_pl(rev_doc, candidate_labels=candidate_labels)

In [9]:
def print_results(results, top_n=10):
    """Pair each label with its score rounded to 4 decimals.

    Despite the name, nothing is printed: the pairs are returned so the
    caller can tabulate them.

    Parameters
    ----------
    results : mapping
        Must provide 'labels' and 'scores' sequences whose items correspond
        by position.
    top_n : int or None, optional
        Maximum number of pairs to return. Defaults to 10, the count that
        was previously hard-coded. None returns every pair.

    Returns
    -------
    list of tuple
        (label, rounded_score) pairs, in the order given by `results`.
    """
    labels = results['labels']
    scores = results['scores']

    # Slicing plus zip stops at the shorter input, so a result with fewer
    # than `top_n` labels yields what is available instead of raising
    # IndexError.
    return [
        (label, round(score, 4))
        for label, score in zip(labels[:top_n], scores[:top_n])
    ]

# Tabulate the BERT label/score pairs; transposed so labels and scores each form one row.
output = pd.DataFrame(data=print_results(results_bert))
print("Bert Model Label-Score Pairs:")
output.transpose()

Bert Model Label-Score Pairs:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,great,close,install,open,lock,easy,work,front,door,knob
1,0.1099,0.1076,0.1075,0.107,0.1053,0.1043,0.1011,0.0925,0.0906,0.0741


In [10]:
# Tabulate the BART label/score pairs; transposed so labels and scores each form one row.
output = pd.DataFrame(data=print_results(results_fb))
print("Bart Model Label-Score Pairs:")
output.transpose()

Bart Model Label-Score Pairs:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,great,knob,front,door,easy,work,open,close,install,lock
1,0.2195,0.2078,0.1528,0.1423,0.1329,0.0713,0.0329,0.0241,0.0125,0.0039


## LDA Visualization

In [11]:
# Load the saved LDA visualisation page explicitly by filename. A bare string
# is only treated as a path when that file exists; otherwise it is rendered as
# literal HTML text, so a wrong path would fail silently.
display(HTML(filename="../ldavis/ldavis_bigrams_30.html"))