#### Google drive


In [3]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np

In [5]:
# Location of the serialized dataset on the mounted Drive.
path = '/content/drive/MyDrive/final_project_itc/data.npy'
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects, which can
# execute arbitrary code. Only load this file if it comes from a trusted source.
data = np.load(path, allow_pickle = True)

In [6]:
# Wrap the raw array in a DataFrame, naming the ten fields in storage order.
df = pd.DataFrame(
    data=data,
    columns=[
        'Id',
        'ProductId',
        'UserId',
        'ProfileName',
        'HelpfulnessNumerator',
        'HelpfulnessDenominator',
        'Score',
        'Time',
        'Summary',
        'Text',
    ],
)

In [7]:
# Keep only the product id, the review text and the star rating.
df = df[['ProductId', 'Text', 'Score']]

#### Zero-shot classification

In [8]:
import torch
from transformers import pipeline

# Zero-shot classifier backed by the BART-large MNLI checkpoint.
classifier = pipeline(task="zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      # Use GPU 0 when CUDA is present; otherwise -1 (CPU) so the
                      # cell still runs on a runtime without a GPU.
                      device=0 if torch.cuda.is_available() else -1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
# Work on the first 100 review texts only, as a plain Python list.
all_reviews = df['Text'].head(100).tolist()

In [9]:
# Product aspects offered to the zero-shot classifier as candidate topics.
candidate_labels = [
    'flavor',
    'taste',
    'texture',
    'price',
    'expiration date',
    'quality',
    'size',
    'delivery',
    'packaging',
]

In [10]:
# Sentence pattern passed to the classifier; `{}` is filled with each candidate label.
# NOTE(review): the wording is ungrammatical ("like on"). It is part of the model
# input, so rewording it will change the scores, and the same sentence is
# hard-coded again in later cells - change them together if at all.
hypothesis_template = "What the customer like on the product is the {}."

##### Single-topic prediction

In [11]:
# Classify every review against all candidate labels using the "like" template.
# NOTE(review): the saved output shows this run ended in KeyboardInterrupt, so the
# call is presumably slow - confirm, and consider caching the result.
single_topic_prediction = classifier(all_reviews, candidate_labels, hypothesis_template=hypothesis_template)

KeyboardInterrupt: 

In [None]:
single_topic_prediction = pd.DataFrame(single_topic_prediction)

In [None]:
# Element 0 of each row's `labels` / `scores` list is taken as the predicted topic
# and its score (the lists appear to be ordered best-first).
single_topic_prediction['predicted_topic'] = [ranked[0] for ranked in single_topic_prediction['labels']]
single_topic_prediction['predicted_topic_score'] = [ranked[0] for ranked in single_topic_prediction['scores']]
single_topic_prediction.head()

In [11]:
from transformers import BartForSequenceClassification, BartTokenizer

# Load the tokenizer and sequence-classification model directly so raw logits can
# be inspected. This is the same checkpoint name the `classifier` pipeline uses.
# NOTE(review): the model is never moved to a GPU in this notebook - confirm that
# running it on the default device is intended.
model_name = "facebook/bart-large-mnli"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForSequenceClassification.from_pretrained(model_name)


In [53]:
# Re-score one prediction by hand: the review at position 26 is the premise and
# its predicted topic is inserted into the hypothesis sentence.
premise = single_topic_prediction.sequence.iloc[26]
# Same wording as `hypothesis_template`, but without the trailing period.
hypothesis = f'What the customer like on the product is the {single_topic_prediction.predicted_topic.iloc[26]}'
# Encode premise and hypothesis together as one sequence pair of PyTorch tensors.
tokens = tokenizer(premise, hypothesis, return_tensors="pt")
outputs = model(**tokens)
# Show which fields the model output exposes.
outputs.keys()

odict_keys(['logits', 'past_key_values', 'encoder_last_hidden_state'])

In [54]:
logits = outputs.logits
logits.shape

torch.Size([1, 3])

In [55]:
# Keep logit columns 0 and 2 and drop column 1.
# NOTE(review): presumably 0 = contradiction and 2 = entailment for this
# checkpoint - confirm against model.config.id2label.
entail_contradiction_logits = logits[:,[0,2]]
entail_contradiction_logits

tensor([[-1.4193, -1.5887]], grad_fn=<IndexBackward0>)

In [56]:
probs = entail_contradiction_logits.softmax(dim=1)
probs

tensor([[0.5422, 0.4578]], grad_fn=<SoftmaxBackward0>)

In [50]:
premise

"My cats have been happily eating Felidae Platinum for more than two years. I just got a new bag and the shape of the food is different. They tried the new food when I first put it in their bowls and now the bowls sit full and the kitties will not touch the food. I've noticed similar reviews related to formula changes in the past. Unfortunately, I now need to find a new food that my cats will eat."

In [51]:
single_topic_prediction.predicted_topic.iloc[12]

'packaging'

In [14]:
# Same single-topic classification, but with a "does not like" hypothesis.
hypothesis_template_neg = "What the customer does not like on the product is the {}."
single_topic_prediction_neg = pd.DataFrame(
    classifier(all_reviews, candidate_labels, hypothesis_template=hypothesis_template_neg)
)
# Element 0 of each list is taken as the predicted topic and its score.
single_topic_prediction_neg['predicted_topic'] = [ranked[0] for ranked in single_topic_prediction_neg['labels']]
single_topic_prediction_neg['predicted_topic_score'] = [ranked[0] for ranked in single_topic_prediction_neg['scores']]
single_topic_prediction_neg.head()

Unnamed: 0,sequence,labels,scores,predicted_topic,predicted_topic_score
0,I have bought several of the Vitality canned d...,"[texture, taste, quality, flavor, packaging, s...","[0.1624506711959839, 0.13631795346736908, 0.12...",texture,0.162451
1,Product arrived labeled as Jumbo Salted Peanut...,"[delivery, size, packaging, taste, flavor, qua...","[0.32973819971084595, 0.26082807779312134, 0.1...",delivery,0.329738
2,This is a confection that has been around a fe...,"[texture, size, quality, delivery, price, pack...","[0.2105044722557068, 0.1402858942747116, 0.122...",texture,0.210504
3,If you are looking for the secret ingredient i...,"[taste, flavor, texture, quality, packaging, s...","[0.19049909710884094, 0.17524242401123047, 0.1...",taste,0.190499
4,Great taffy at a great price. There was a wid...,"[texture, taste, quality, flavor, packaging, s...","[0.1412028968334198, 0.13406331837177277, 0.13...",texture,0.141203


In [38]:
np.where(df.Score == 1)

(array([     1,     12,     26, ..., 568431, 568432, 568433]),)

In [16]:
single_topic_prediction.predicted_topic.iloc[6]

'texture'

In [52]:
single_topic_prediction['sequence'].iloc[26]

'The candy is just red , No flavor . Just  plan and chewy .  I would never buy them again'

In [18]:
single_topic_prediction.predicted_topic.value_counts()

predicted_topic
taste        31
flavor       22
quality      17
delivery      9
size          8
texture       6
price         4
packaging     3
Name: count, dtype: int64

In [19]:
single_topic_prediction['sequence'].iloc[4]

'Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.'

In [20]:
np.where(single_topic_prediction.predicted_topic == 'price')

(array([36, 60, 90, 99]),)

##### Multi-topic prediction

In [16]:
# Classify again with multi_label=True and collect the results in a DataFrame
# (one row per review, with parallel `labels` / `scores` lists).
multi_topic_prediction = pd.DataFrame(
    classifier(
        all_reviews,
        candidate_labels,
        hypothesis_template=hypothesis_template,
        multi_label=True,
    )
)

multi_topic_prediction.head()

Unnamed: 0,sequence,labels,scores
0,I have bought several of the Vitality canned d...,"[quality, flavor, packaging, texture, taste, p...","[0.9917555451393127, 0.8814021348953247, 0.802..."
1,Product arrived labeled as Jumbo Salted Peanut...,"[packaging, size, quality, flavor, taste, text...","[0.7082586288452148, 0.667267918586731, 0.6548..."
2,This is a confection that has been around a fe...,"[texture, flavor, taste, quality, size, delive...","[0.970458447933197, 0.9575488567352295, 0.9385..."
3,If you are looking for the secret ingredient i...,"[taste, flavor, quality, delivery, packaging, ...","[0.9148136377334595, 0.8775236010551453, 0.485..."
4,Great taffy at a great price. There was a wid...,"[delivery, price, taste, flavor, quality, pack...","[0.9934879541397095, 0.9791789054870605, 0.908..."


In [22]:
multi_topic_prediction.sequence.iloc[4]

'Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.'

In [17]:
# Minimum score for a label to be kept as a topic of a review.
threshold = 0.6

# Expand the parallel `labels` / `scores` lists into one row per (review, label)
# pair. Exploding both columns together keeps them aligned, and ignore_index
# gives the same fresh 0..N-1 index the previous reset_index() produced.
# explode() leaves `scores` as object dtype, so cast it back to float.
multi_topic_prediction = (
    multi_topic_prediction
    .explode(['labels', 'scores'], ignore_index=True)
    .astype({'scores': float})
)
# Keep only the pairs whose score reaches the threshold.
multi_topic_prediction = multi_topic_prediction[multi_topic_prediction['scores'] >= threshold]
multi_topic_prediction.head()

Unnamed: 0,sequence,labels,scores
0,I have bought several of the Vitality canned d...,quality,0.991756
1,I have bought several of the Vitality canned d...,flavor,0.881402
2,I have bought several of the Vitality canned d...,packaging,0.802387
3,I have bought several of the Vitality canned d...,texture,0.764957
9,Product arrived labeled as Jumbo Salted Peanut...,packaging,0.708259


In [58]:
# Hand-check one exploded row: the review at position 26 is the premise and that
# row's label is inserted into the hypothesis sentence.
premise = multi_topic_prediction.sequence.iloc[26]
# Same wording as `hypothesis_template`, but without the trailing period.
hypothesis = f'What the customer like on the product is the {multi_topic_prediction.labels.iloc[26]}'
# Encode premise and hypothesis together as one sequence pair of PyTorch tensors.
tokens = tokenizer(premise, hypothesis, return_tensors="pt")
outputs = model(**tokens)
# Show which fields the model output exposes.
outputs.keys()

odict_keys(['logits', 'past_key_values', 'encoder_last_hidden_state'])

In [59]:
# Take logit columns 0 and 2 (column 1 is dropped) and normalise the pair with a
# softmax so the two values sum to 1.
# NOTE(review): presumably 0 = contradiction and 2 = entailment for this
# checkpoint - confirm against model.config.id2label.
logits = outputs.logits
entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)

In [60]:
probs

tensor([[0.1340, 0.8660]], grad_fn=<SoftmaxBackward0>)

In [74]:
type(probs[0][0].item())

float

In [75]:
np.argmax((probs[0][0].item(),probs[0][1].item()))

1

In [62]:
premise

"This saltwater taffy had great flavors and was very soft and chewy.  Each candy was individually wrapped well.  None of the candies were stuck together, which did happen in the expensive version, Fralinger's.  Would highly recommend this candy!  I served it at a beach-themed party and everyone loved it!"

In [22]:
import torch
from tqdm import tqdm
def get_sentiment(df):
  """Score each (review, topic) row with the NLI model and return 0/1 per row.

  For every row the review text is the premise and the sentence
  "What the customer like on the product is the <topic>" is the hypothesis.
  Only logit columns 0 and 2 of the model output are compared; column 1 is
  ignored. NOTE(review): presumably 0 = contradiction and 2 = entailment for
  this checkpoint, so a result of 1 means "liked" - confirm against
  model.config.id2label.

  Args:
    df: DataFrame with a `sequence` column (review text) and a `labels`
      column (one topic per row).

  Returns:
    A list with one int per row of `df`: 0 if logit column 0 is larger,
    1 if logit column 2 is larger. Empty list for an empty frame.
  """
  # Nothing to score: skip the progress bar and the model entirely.
  if len(df) == 0:
    return []

  all_sentiments = []
  # Inference only - disable autograd so no graph is built for each forward pass.
  with torch.no_grad():
    for premise, topic in tqdm(zip(df['sequence'], df['labels']), total=len(df)):
      hypothesis = f'What the customer like on the product is the {topic}'
      # Truncate only the review (first segment) if the pair exceeds the model's
      # maximum input length, so the hypothesis is always kept intact.
      tokens = tokenizer(premise, hypothesis, return_tensors="pt", truncation="only_first")
      # Keep inputs on the same device as the model (no-op when both are on CPU).
      tokens = tokens.to(model.device)
      outputs = model(**tokens)
      entail_contradiction_logits = outputs.logits[:,[0,2]]
      # Softmax is monotonic, so the argmax of the logits equals the argmax of
      # the probabilities.
      argmax_sentiment = torch.argmax(entail_contradiction_logits, dim = 1).item()
      all_sentiments.append(argmax_sentiment)
  return all_sentiments

In [27]:
all_sents = get_sentiment(multi_topic_prediction)

100%|██████████| 302/302 [06:56<00:00,  1.38s/it]


In [30]:
np.unique(all_sents, return_counts = True)

(array([0, 1]), array([ 20, 282]))

In [32]:
multi_topic_prediction['sentiment'] = all_sents

In [59]:
# Per topic, the percentage of rows with each sentiment value (rows = topic,
# columns = sentiment 0/1), rounded to two decimals; missing combinations are 0.
counts = (
    multi_topic_prediction
    .groupby('labels')['sentiment']
    .value_counts(normalize=True)
    .mul(100)
    .unstack(fill_value=0)
    .round(2)
)

In [64]:
# Percentages of sentiment 0 and sentiment 1 for the 'delivery' topic.
counts.loc['delivery', 0], counts.loc['delivery', 1]

(16.0, 84.0)