In [1]:
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd
import torch
from tqdm import tqdm
import os

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] ="1"

In [2]:
!head test_descriptions.csv

description
"Knit midi dress with a V-neckline, straps and matching lace detail.<br/><br/>HEIGHT OF MODEL: 177 CM. / 69.6″"
"Loose-fitting dress with a round neckline, long sleeves, pleat details and a buttoned opening at the back.<br/><br/>HEIGHT OF MODEL: 177 CM / 69.6″"
Nautical cap with peak.<br/><br/>This item must be returned with the original cardboard packaging intact.
Nautical cap with peak. Adjustable inner strap detail.
Nautical cap with side button detail.<br/><br/>This item must be returned with the original cardboard packaging intact.
"Faded short sleeve T-shirt with a round neckline and a front print.<br/><br/>Due to the dyeing process, the print on each T-shirt is unique and may differ from what is shown in the photo.<br/><br/>HEIGHT OF MODEL: 177 cm. / 69.6″"
"Coat with a round collar and long sleeves. Featuring front welt pockets, faux suede interior and button fastening on the front. <br/><br/>HEIGHT OF MODEL: 177 cm. / 69.6″"
Ripped T-shirt. Round neck and s

In [3]:
zara_data = pd.read_csv("zara_total_1101.csv")

In [4]:
train = pd.read_csv("train.csv")

In [5]:
zara_data = zara_data[~zara_data["name"].isin(train["name"])]

In [6]:
zara_data.shape

(192, 2)

In [3]:
with open("./data/val.source", "r") as f:
    val_source = f.readlines()

In [4]:
with open("./data/val.target", "r") as f:
    val_target = f.readlines()

In [5]:
len(val_source)

1824

In [6]:
len(val_target)

1824

In [5]:
val_source[-1]

'Knit top with a round neckline, long sleeves and puff shoulders. Featuring matching fabric detail.\n'

In [6]:
val_target[-1]

'CONTRAST KNIT TOP\n'

In [7]:
val_target[0]

'STRIPED SHIRT\n'

In [8]:
val_source[0]

'Collared shirt featuring long sleeves, a button-up front and a striped print.\n'

In [9]:
def remove_(source, target):
    news, newt = [], []
    for i in range(min(len(source), len(target))):
        s = source[i].lower()
        t = target[i].lower()
        s_sp = s.split(" ")
        t_sp = t.split(" ")
        if any([t_ not in s_sp for t_ in t_sp]):
            news.append(s)
            newt.append(t)
    return news, newt

In [10]:
source, target = remove_(val_source, val_target)

In [11]:
len(source), len(target)

(3359, 3359)

In [5]:
model = BartForConditionalGeneration.from_pretrained("./model5/checkpoint-6000")

In [6]:
tokenizer = BartTokenizer.from_pretrained("./model5/checkpoint-6000")

In [9]:
encoded = tokenizer.encode(val_source[0], return_tensors="pt")

In [10]:
model.to("cuda:0")

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): FusedLayerNorm(torch.Size([

In [12]:
encoded = encoded.to("cuda:0")

In [13]:
with torch.no_grad():
    gen = model.generate(encoded)

In [14]:
gen

tensor([[    2,     0,     0,     0, 30549,  3808,  1691,  4729, 17831,  1691,
          4584, 40835,     2]], device='cuda:0')

In [15]:
val_source[0]

'Collared shirt featuring long sleeves, a button-up front and a striped print.\n'

In [16]:
for g in gen:
    print(tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False))

STRIPED PRINTED SHIRT


In [7]:
def get_predictions(model, tokenizer, texts, numseqs=50, device="cuda:1"):
    model.to(device)
    predictions = []
    for text in tqdm(texts):
        preds = []
        text_encoded = tokenizer.encode(text, return_tensors="pt")
        #text_encoded = text_encoded.to(device)
        with torch.no_grad():
            gen = model.generate(text_encoded.to(device), num_return_sequences=numseqs, num_beams=numseqs)
        decoded = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                  for g in gen]
        for d in decoded:
            if d not in preds:
                preds.append(d)
        decoded = preds[:10]
        predictions.append(decoded)
    return predictions

In [10]:
#CUDA_LAUNCH_BLOCKING=1

In [11]:
predictions = get_predictions(model, tokenizer, val_source, numseqs=30, device="cuda")

100%|██████████| 1825/1825 [18:43<00:00,  1.62it/s]


In [None]:
#predictions2 = get_predictions(model, tokenizer, zara_data["description"].tolist(), numseqs=30, device="cuda")

In [54]:
torch.cuda.is_available() 

True

In [1]:
"Hola hola".strip()

'Hola hola'

In [12]:
props = []
for pp in predictions:
    pp_ = []
    for p in pp:
        pp_.append(p.lower())
    props.append(pp_)

In [13]:
def dcg(predictions, targets):
    """Computes Discounted Cumulative Gain for a list of proposals and expected targets"""
    targets = [t.replace("\n", "").lower() for t in targets]
    score = 0
    for pred, target in zip(predictions, targets):
        if target in pred:
            idx = pred.index(target)
            score += 1 / np.log2(idx+2) 
    return score / len(predictions) * 100

In [14]:
import numpy as np

In [15]:
dcg(props, val_target)

13.607801664472335

In [18]:
dcg(props, val_target)

14.728968496439354

In [18]:
dcg(props, zara_data["name"].tolist())

185.6126272867843

In [19]:
dcg(props, val_target)

12020.993688411749

In [26]:
dcg(props, target)

13.820023270240652

In [20]:
predictions[0]

['STRIPED PRINTED SHIRT',
 'STRIPED OXFORD SHIRT',
 'STRIPED CHECK SHIRT',
 'STRIPED POPLIN SHIRT',
 'STRIPED OXFORD SHIRT',
 'STRIPED DENIM SHIRT',
 'STRIPED OXFORD SHIRT',
 'STRIPED STRIPED SHIRT',
 'STRIPED PRINT SHIRT',
 'STRIPED SHIRT WITH BUTTONS']

In [21]:
target[0]

'striped shirt\n'