In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F

In [2]:
tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
model = AutoModel.from_pretrained('deepset/sentence_bert')

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=385.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=438006864.0), HTML(value='')))




## Example from the HF Zero-Shot blog

In [3]:
sentence = 'Who are you voting for in 2020?'
labels = ['business', 'art & culture', 'politics']

# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                     return_tensors='pt',
                                     pad_to_max_length=True)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
output = model(input_ids, attention_mask=attention_mask)[0]
sentence_rep = output[:1].mean(dim=1)
label_reps = output[1:].mean(dim=1)

# now find the labels with the highest cosine similarities to
# the sentence
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')

label: politics 	 similarity: 0.2156151533126831
label: business 	 similarity: 0.004524126183241606
label: art & culture 	 similarity: -0.027396924793720245




## Let's try it on Amazon Products

In [6]:
df = pd.read_csv("../data/filtered_amazon_co-ecommerce_sample.csv")

In [29]:
def model_input(text, labels):
    inputs = tokenizer.batch_encode_plus([text] + labels,
                                         return_tensors='pt',
                                         padding='longest')
    return inputs

In [8]:
def model_output(inputs):
    output = model(**inputs)[0]
    sentence_rep = output[:1].mean(dim=1)
    label_reps = output[1:].mean(dim=1)
    
    return sentence_rep, label_reps

In [53]:
def closest_label(sentence_representation, label_representations):
    similarities = F.cosine_similarity(sentence_representation, label_representations)
    closest = similarities.argsort(descending=True)
    return similarities, closest

In [22]:
labels = list(df['category'].unique())
labels

['Hobbies',
 'Characters & Brands',
 'Fancy Dress',
 'Arts & Crafts',
 'Games',
 'Figures & Playsets',
 'Sports Toys & Outdoor',
 'Die-Cast & Toy Vehicles',
 'Baby & Toddler Toys',
 'Party Supplies',
 'Dolls & Accessories',
 'Puppets & Puppet Theatres',
 'Jigsaws & Puzzles']

In [76]:
idx = 100
example = df.iloc[idx]

inputs = model_input(example['product_name'], labels)

In [77]:
sentence_rep, label_reps = model_output(inputs)

In [78]:
similarities, closest = closest_label(sentence_rep, label_reps) 

closest

tensor([ 7, 12,  5, 11,  3, 10,  6,  0,  1,  8,  4,  2,  9])

In [79]:
for ind in closest:
    print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')
print()
print(f"True Label: {example['category']}")

label: Die-Cast & Toy Vehicles 	 similarity: 0.4015389084815979
label: Jigsaws & Puzzles 	 similarity: 0.32925093173980713
label: Figures & Playsets 	 similarity: 0.2397889792919159
label: Puppets & Puppet Theatres 	 similarity: 0.2038014829158783
label: Arts & Crafts 	 similarity: 0.17721787095069885
label: Dolls & Accessories 	 similarity: 0.16500996053218842
label: Sports Toys & Outdoor 	 similarity: 0.16155120730400085
label: Hobbies 	 similarity: 0.12504489719867706
label: Characters & Brands 	 similarity: 0.10231133550405502
label: Baby & Toddler Toys 	 similarity: 0.08563225716352463
label: Games 	 similarity: 0.06645378470420837
label: Fancy Dress 	 similarity: 0.06069502234458923
label: Party Supplies 	 similarity: 0.016845915466547012

True Label: Hobbies


In [80]:
example['product_name']

'Piko 36165 35mm Metal Wheels (2 Axles)'

In [81]:
example['description']

'Suitable for the following scale(s): G Scale'

In [82]:
example['product_information']

'Technical Details Manufacturer recommended age:3 years and up Item model numberPiko 36165 Scale1::22.5 Track Width/GaugeG \xa0\xa0 Additional Information ASINB00ACH5PC4 Best Sellers Rank 994,387 in Toys & Games (See top 100) #1418 in\xa0Toys & Games > Model Trains & Railway Sets > Rail Vehicles > Wagons #1835 in\xa0Toys & Games > Model Trains & Railway Sets > Rail Vehicles > Trains Delivery Destinations:Visit the Delivery Destinations Help page to see where this item can be delivered. Date First Available29 May 2013 \xa0\xa0 Feedback \xa0Would you like to update product info or give feedback on images?'