In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import re

#### Mount Google Drive


In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Location of the .npy file that holds the raw rows.
path = '/content/drive/MyDrive/Colab Notebooks/ITC/Final Project/data.npy'

# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary Python
# objects stored in the file, so only load files from a trusted source.
data = np.load(
    path,
    allow_pickle=True,
)

In [4]:
# Column names assigned, in order, to the columns of the loaded array.
RAW_COLUMNS = [
    'Id',
    'ProductId',
    'UserId',
    'ProfileName',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Score',
    'Time',
    'Summary',
    'Text',
]
df = pd.DataFrame(data, columns=RAW_COLUMNS)

#### Keep only three columns: ProductId, Text and Score

In [5]:
# Narrow the frame to the product id, the review text and the score.
KEPT_COLUMNS = ['ProductId', 'Text', 'Score']
df = df.loc[:, KEPT_COLUMNS]

In [6]:
# Strip every "<...>" span (HTML-style tags) out of the review text.
TAG_PATTERN = r'<[^>]*>'
df['Text'] = df['Text'].str.replace(TAG_PATTERN, '', regex=True)

#### Split the first 100k samples into train, validation and test sets, grouped by ProductId

In [7]:
from sklearn.model_selection import GroupShuffleSplit

# Only the first 100,000 rows take part in the split.
sample_df = df[:100000]

# Grouping by ProductId keeps all rows of one product on the same side of the
# split; test_size=.40 is the share held out for the later val/test split.
splitter_temp = GroupShuffleSplit(n_splits=1, test_size=.40, random_state=42)
split_temp = splitter_temp.split(sample_df, groups=sample_df['ProductId'])
train_inds, temp_inds = next(split_temp)

# The returned indices are positional, hence .iloc.
train, temp = df.iloc[train_inds], df.iloc[temp_inds]

In [8]:
# Divide the held-out rows in half, again keeping each ProductId on one side.
splitter_val = GroupShuffleSplit(n_splits=1, test_size=.50, random_state=42)
split_val = splitter_val.split(temp, groups=temp['ProductId'])
val_inds, test_inds = next(split_val)

# Indices are positions within `temp`, so select with .iloc.
val, test = temp.iloc[val_inds], temp.iloc[test_inds]

In [9]:
# For each partition: features are every column except 'Score',
# and the target is the 'Score' column itself.
X_train, y_train = train.drop(columns='Score'), train['Score']
X_val, y_val = val.drop(columns='Score'), val['Score']
X_test, y_test = test.drop(columns='Score'), test['Score']

# Zero-Shot Classification

In [10]:
from transformers import pipeline

In [11]:
def get_topic_scores(data, num_texts, candidate_labels=None, classifier=None):
  """Print the zero-shot label scores for the first `num_texts` texts.

  Args:
    data: object exposing a sliceable `Text` attribute (e.g. a DataFrame with
      a 'Text' column); only `data.Text[:num_texts]` is read.
    num_texts: how many texts, counted from the start, to classify.
    candidate_labels: labels handed to the classifier. Defaults to
      ["flavor", "taste", "price", "size"].
    classifier: callable invoked as `classifier(text, candidate_labels=...)`
      that returns a mapping with 'labels' and 'scores'. When omitted, a
      "facebook/bart-large-mnli" zero-shot pipeline is created. Pass an
      existing pipeline to avoid loading the model again on every call.

  Returns:
    None. The texts and their per-label scores are written to stdout.
  """
  if candidate_labels is None:
    candidate_labels = ["flavor", "taste", "price", "size"]
  if classifier is None:
    # Name the task explicitly rather than relying on it being inferred
    # from the model id.
    classifier = pipeline("zero-shot-classification",
                          model="facebook/bart-large-mnli")

  # Slice once so classification and printing iterate over the same texts.
  texts = data.Text[:num_texts]

  # Classify everything first so the progress bar is not interleaved with
  # the printed report.
  results = [classifier(text, candidate_labels=candidate_labels)
             for text in tqdm(texts)]

  for text, result in zip(texts, results):
    print()
    print(f"Text: {text}")
    print(" Labels and Scores:")
    for label, score in zip(result['labels'], result['scores']):
      print(f" Label: {label}, Score: {score:.2f}")


def get_text_topic(data, num_texts, candidate_labels=None, classifier=None):
  """Returns list of tuples with text and winning topic.

  Args:
    data: object exposing a sliceable `Text` attribute (e.g. a DataFrame with
      a 'Text' column); only `data.Text[:num_texts]` is read.
    num_texts: how many texts, counted from the start, to classify.
    candidate_labels: labels handed to the classifier. Defaults to
      ["flavor", "taste", "price", "size"].
    classifier: callable invoked as `classifier(text, candidate_labels=...)`
      that returns a mapping with 'labels' and 'scores'. When omitted, a
      "facebook/bart-large-mnli" zero-shot pipeline is created. Pass an
      existing pipeline to avoid loading the model again on every call.

  Returns:
    A list of `(text, label)` tuples in input order, where `label` is the
    candidate with the highest score (the first one on ties).
  """
  if candidate_labels is None:
    candidate_labels = ["flavor", "taste", "price", "size"]
  if classifier is None:
    # Name the task explicitly rather than relying on it being inferred
    # from the model id.
    classifier = pipeline("zero-shot-classification",
                          model="facebook/bart-large-mnli")

  topics = []
  for text in tqdm(data.Text[:num_texts]):
    result = classifier(text, candidate_labels=candidate_labels)

    # Choose the top score explicitly instead of assuming the classifier
    # returns its labels in any particular order.
    best_label, _ = max(zip(result['labels'], result['scores']),
                        key=lambda pair: pair[1])
    topics.append((text, best_label))
  return topics

In [12]:
# Print the per-label scores for the first 15 training texts.
get_topic_scores(data=X_train, num_texts=15)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

100%|██████████| 15/15 [00:22<00:00,  1.50s/it]


Text: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
 Labels and Scores:
 Label: flavor, Score: 0.51
 Label: taste, Score: 0.26
 Label: size, Score: 0.17
 Label: price, Score: 0.05

Text: This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.
 Labels and Scores:
 Label: flavor, Score: 0.46
 Label: taste, Score: 0.27
 Label: s




In [13]:
# Winning topic for each of the first 10 training texts, one per line.
result = get_text_topic(data=X_train, num_texts=10)
labels = [topic for (_text, topic) in result]
for label in labels:
  print(label)

100%|██████████| 10/10 [00:12<00:00,  1.25s/it]

flavor
flavor
flavor
price
flavor
flavor
flavor
size
size
size



