In [30]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
import csv


In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
model_dir = "ranker_2001"

In [4]:
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [6]:
def _read_textfile(file):
    with open(file, "r") as f:
        return f.readlines()

In [7]:
submission = _read_textfile("submission_eolo.csv")

In [33]:
# Load every row of the submission CSV as a list of string fields.
# newline="" leaves line-ending handling to the csv module (as its
# documentation requires), so quoted fields that contain line breaks are
# parsed intact instead of being split or altered by universal newlines.
with open("submission_eolo.csv", "r", newline="") as f:
    submission = list(csv.reader(f))

In [35]:
# Drop the first row (presumably the CSV header — TODO confirm).
# Not idempotent: re-running this cell removes one more row each time.
submission = submission[1:]

In [36]:
#submission = [subm.split(",") for subm in submission]

In [11]:
len(submission)

1441

In [12]:
len(submission[0])

10

In [37]:
# True only when every row holds exactly 10 entries.
all(len(row) == 10 for row in submission)

True

In [38]:
#[subm for subm in submission if len(subm) != 10]

In [15]:
input_ids = tokenizer.encode(submission[0][0], return_tensors="pt")

In [16]:
input_ids

tensor([[  101, 10722, 10954, 22219,  9919,  1708,   157,  2069,  2271,   102]])

In [22]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines where CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [24]:
# Inference only: disable autograd so no computation graph is built and kept
# alive alongside the output.
with torch.no_grad():
    modelret = model(input_ids.to(device))

In [26]:
dir(modelret)

['__annotations__',
 '__class__',
 '__contains__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__post_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'attentions',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'logits',
 'loss',
 'move_to_end',
 'pop',
 'popitem',
 'setdefault',
 'to_tuple',
 'update',
 'values']

In [33]:
# Copy the logits to host memory as a NumPy array and read row 0, column 1,
# i.e. the second logit of the single encoded input.
modelret.logits.detach().cpu().numpy()[0][1]

-1.5583711

In [39]:
test_descriptions = pd.read_csv("test_descriptions.csv")

In [40]:
def clean_text(text):
    """Strip markup and the trailing model-height blurb from a description.

    Every ``<br/>`` tag and newline character is removed (replaced by the
    empty string). Then, for each marker phrase in turn, the text is cut just
    before the first occurrence of that phrase. Matching is case-sensitive
    and limited to the exact spellings listed below.
    """
    cleaned = text.replace("<br/>", "").replace("\n", "")
    markers = ("HEIGHT OF MODEL", "height of model", "model height", "MODEL HEIGHT")
    for marker in markers:
        # partition() returns the whole string as the head when the marker is
        # absent, so no membership test is needed.
        cleaned = cleaned.partition(marker)[0]
    return cleaned

In [41]:
test_descriptions["description"] = test_descriptions["description"].apply(clean_text)

In [42]:
def create_text_tuples(description, possible_names):
    """Pair ``description`` with each entry of ``possible_names``.

    Returns a list of ``(description, name)`` tuples in the same order as
    ``possible_names``; an empty iterable yields an empty list.
    """
    pairs = []
    for candidate in possible_names:
        pairs.append((description, candidate))
    return pairs

In [42]:
# Encode the first description paired with each entry of the first submission
# row as one padded, truncated batch of PyTorch tensors.
input_ids = tokenizer.batch_encode_plus(
    create_text_tuples(test_descriptions["description"].iloc[0], submission[0]),
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [45]:
inputs = input_ids.input_ids

In [43]:
def to_device(d, device):
    """Move every value of the mapping ``d`` onto ``device``, in place.

    ``Tensor.to`` does not modify the tensor it is called on; it returns a
    tensor on the target device. The result is therefore stored back under
    the same key — discarding it would leave ``d`` unchanged.

    Args:
        d: dict-like object supporting key iteration and item assignment,
            whose values expose a ``.to(device)`` method.
        device: target device, passed through to each value's ``.to``.
    """
    # Snapshot the keys so the mapping is not iterated while being written.
    for k in list(d):
        d[k] = d[k].to(device)

In [54]:
to_device(input_ids, device)

In [55]:
# Move each encoded tensor to `device` as part of the call (``Tensor.to``
# returns a new tensor rather than moving in place), and disable autograd
# since this forward pass is inference only.
with torch.no_grad():
    modelret = model(**{k: v.to(device) for k, v in input_ids.items()})

In [56]:
modelret_logits1 = modelret.logits[:, 1]

In [51]:
modelret_logits1

tensor([-0.8698, -0.7265, -0.7176, -0.8638, -1.1724, -0.6543, -0.6523, -0.8555,
        -0.8558, -1.1087], device='cuda:0', grad_fn=<SelectBackward>)

In [57]:
modelret_logits1

tensor([-1.2840, -0.3544, -0.5937, -1.1419, -1.1238, -1.1979, -0.0451, -0.7340,
        -0.4619, -0.7884], device='cuda:0', grad_fn=<SelectBackward>)

In [61]:
# Map each name of the first submission row to its column-1 logit.
dic_subm = dict(zip(submission[0], modelret_logits1.detach().cpu().numpy()))

In [23]:
# Names ordered from highest to lowest score; sorted() already returns a list.
sorted_dic = sorted(dic_subm, key=dic_subm.get, reverse=True)

NameError: name 'dic_subm' is not defined

In [63]:
sorted_dic

['LACE DRESS TRF',
 'STRAPPY KNIT DRESS',
 'LONG KNIT DRESS',
 'LONG KNIT DRESS TRF',
 'STRAPPY KNIT DRESS TRF\n',
 'KNIT DRESS WITH LACE',
 'LACE DRESS',
 'LACE DRESS WITH STRAPS TRF',
 'KNIT DRESS WITH LACE TRIM',
 'LACE-UP DRESS TRF']

In [44]:
def sort_submission_by_score(submission, model, descriptions, device=None):
    """Re-rank each row of candidate names by the model's score.

    Each description is paired with every candidate name of the matching row
    in ``submission``. The pairs are encoded with the notebook-level
    ``tokenizer`` and scored by ``model``; candidates are ordered by the logit
    in column 1 of the model output, highest first. Candidates with equal
    scores keep their original relative order.

    Args:
        submission: iterable of rows, each a sequence of candidate names.
        model: model whose output exposes ``.logits`` with at least two
            columns. It is moved to ``device`` when one is given.
        descriptions: iterable of description strings aligned with
            ``submission`` (iteration stops at the shorter of the two).
        device: torch device to run on. ``None`` means "use the device the
            model's parameters are already on".

    Returns:
        A list with one list per row: that row's names, newlines removed,
        sorted by descending score. Duplicate names within a row are kept, so
        every output row has the same length as its input row.
    """
    if device is None:
        # Run wherever the model already lives instead of passing None on.
        device = next(model.parameters()).device
    else:
        model.to(device)

    # Score in eval mode (no dropout) and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    new_submissions = []
    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for description, subm in tqdm(zip(descriptions, submission)):
                if not subm:
                    # Nothing to rank; keep the row aligned with its input.
                    new_submissions.append([])
                    continue
                tuples = create_text_tuples(description, subm)
                inputs = tokenizer.batch_encode_plus(
                    tuples, return_tensors="pt", truncation=True, padding=True
                )
                output = model(**{k: v.to(device) for k, v in inputs.items()})
                logits_positive = output.logits[:, 1].cpu().numpy()
                # Sort positions rather than a name->score dict: a dict would
                # collapse duplicate names and shorten the row.
                order = sorted(
                    range(len(subm)),
                    key=lambda i: logits_positive[i],
                    reverse=True,
                )
                new_submissions.append([subm[i].replace("\n", "") for i in order])
    finally:
        model.train(was_training)
    return new_submissions

In [45]:
new_submission = sort_submission_by_score(submission, model, test_descriptions["description"].tolist(), device=torch.device("cuda"))

1441it [00:58, 24.47it/s]


In [46]:
#new_submission[:10]

In [47]:
#submission[9]

In [27]:
import csv

In [28]:
# newline="" lets the csv writer control line endings itself (required by the
# csv module docs; without it, extra blank lines appear on Windows).
# NOTE(review): the header row has a single column while each data row has
# several, and this file is written again by the final df.to_csv cell —
# confirm this intermediate write is still wanted.
with open("new_submission_ranker.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name"])
    writer.writerows(new_submission)

In [23]:
import pandas as pd

In [54]:
# Build one column per rank position: pred_0 holds each row's top-ranked
# name, pred_1 the second, and so on up to pred_9.
di = {f"pred_{i}": [] for i in range(10)}
for names in new_submission:
    n_names = len(names)
    if n_names != 10:
        # Surface rows that do not have exactly 10 names.
        print(names)
        print(n_names)
    for rank, name in enumerate(names):
        di[f"pred_{rank}"].append(name)

In [55]:
df = pd.DataFrame(di)

In [56]:
df.to_csv("new_submission_ranker.csv", header=False, index=False)