## Data Generation

In [1]:
import os
import pandas as pd

In [2]:
input_file = r'../data/yahoo_top_products_click_popularity_20200201.txt'

In [3]:
df_train = pd.read_csv(input_file, sep=r'\t')

# temp descriptions
#df_train.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,Parent L1 Description,Parent L2 Description,MID,Merchant Name,OID,Title,Description,Manufacturer,Price,Min Offer Price 30 Day,Price Change,Click Popularity
0,Pet Supplies,More Pet Supplies,248942,Chewy.com,9151715234,Wisdom Panel Health Breed & Health Identificat...,Wisdom Panel Health Breed & Health Identificat...,Wisdom Panel,149.99,89.99,66.67,35415.048
1,Clothing & Accessories,Men's Clothing,313027,eBay PLA US,11114337071,Girl Scout Cookies 2019-20 New Cookies are in!...,Girl Scout Cookies 2019-20 New Cookies are in!...,Girl Scouts,48.0,48.0,0.0,12215.2644
2,Clothing & Accessories,Men's Clothing,134716,MensUSA.com,5651235052,Classic Long Royal Blue Fashion Zoot Suit,"""This Zoot Suit is as nice and unique as it ge...",mensusa,139.0,139.0,0.0,8317.775
3,'N/A','N/A',76071,Joe's New Balance Outlet,9756405838,New Balance Women's FuelCore NERGIZE Shoes Bla...,Slip on the FuelCore NERGIZE women's training ...,New Balance,38.99,38.99,0.0,6132.854
4,Clothing & Accessories,Handbags & Luggage,31851,Kohl's,5681651813,"Stone & Co. Irene Leather Hobo, Grey",Watch the product video here. Stone & Co. embo...,Stone & Co.,99.0,69.3,42.85,5251.5413


In [4]:
# Null Preprocessing
null_rows = (df_train['Title'].isnull()) \
            | (df_train['Parent L1 Description'].isnull()) \
            | (df_train['Parent L2 Description'].isnull()) \
            | (df_train['Price'].isnull()) \
            | (df_train['Click Popularity'].isnull())
df_train = df_train[~null_rows]

# MAX LENGTH substitution
#np.max(df_train['Title'].apply(lambda x : len(x)).to_list())
MAX_LENGTH = 50
df_train = df_train[~(df_train.Title.apply(lambda x : len(x)) > MAX_LENGTH)]

# Sampling
SAMPLE_FRAC = 0.1
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9487)

# Column Selections
df_train = df_train.reset_index()
df_train = df_train.loc[:, ['Title', 'Price', 'Click Popularity']]
df_train.head()

Unnamed: 0,Title,Price,Click Popularity
0,Seiko Blue Men's Silver-Tone Blue Dial Chronog...,119.99,0.1932
1,1999-2005 Volkswagen Jetta Clutch Kit - LUK 17...,201.43,0.3865
2,Madden Girl Bounce Platform Sneakers - Tan,34.5,0.1932
3,G.H. Lammerse Dahlia - 2 per package,19.99,0.1932
4,Popcornopolis 6-Cone Holiday Popcorn Variety Pack,23.08,0.1932


In [5]:
# idempotence
df_train.to_csv("../data/train.tsv", sep="\t", index=False)

print("sample number：", len(df_train))
df_train.head()

sample number： 38481


Unnamed: 0,Title,Price,Click Popularity
0,Seiko Blue Men's Silver-Tone Blue Dial Chronog...,119.99,0.1932
1,1999-2005 Volkswagen Jetta Clutch Kit - LUK 17...,201.43,0.3865
2,Madden Girl Bounce Platform Sneakers - Tan,34.5,0.1932
3,G.H. Lammerse Dahlia - 2 per package,19.99,0.1932
4,Popcornopolis 6-Cone Holiday Popcorn Variety Pack,23.08,0.1932


In [6]:
# Test Dataset Generation
## TODO

## BERT-format + Loader

In [7]:
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset

In [8]:
PRETRAINED_MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [9]:
class TargetTextDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode
        self.df = pd.read_csv("../data/" + mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer
        
    def __getitem__(self, idx):
        if self.mode == "test":
            text, price = self.df.iloc[idx, :2].values
            reg_score_tensor = None
        else:
            text, price, reg_score = self.df.iloc[idx, :].values
            reg_score_tensor = torch.tensor(reg_score)
            
        # BERT tokens + [SEP]
        word_pieces = ["[CLS]"]
        tokens = self.tokenizer.tokenize(text)
        word_pieces += tokens + ["[SEP]"]
        len_text = len(word_pieces)
        
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        segments_tensor = torch.tensor([0] * len_text, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, reg_score_tensor)
    
    def __len__(self):
        return self.len

In [10]:
trainset = TargetTextDataset("train", tokenizer=tokenizer)

In [11]:
def check_diff_dataset(dataset, index=0):
    # 選擇第一個樣本
    sample_idx = index

    # 將原始文本拿出做比較
    text_a, price_a, reg_score_a = trainset.df.iloc[sample_idx].values

    # 利用剛剛建立的 Dataset 取出轉換後的 id tensors
    tokens_tensor, segments_tensor, reg_score_tensor = trainset[sample_idx]

    # 將 tokens_tensor 還原成文本
    tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
    combined_text = " ".join(tokens)

    # 渲染前後差異，毫無反應就是個 print。可以直接看輸出結果
    print(f"""[原始文本]
    句子 1：{text_a}
    句子 2：{price_a}
    分類  ：{reg_score_a}

    --------------------

    [Dataset 回傳的 tensors]
    tokens_tensor  ：{tokens_tensor}

    segments_tensor：{segments_tensor}

    label_tensor   ：{reg_score_tensor}

    --------------------

    [還原 tokens_tensors]
    {combined_text}
    """)

In [12]:
check_diff_dataset(trainset)

[原始文本]
    句子 1：Seiko Blue Men's Silver-Tone Blue Dial Chronograph
    句子 2：119.99
    分類  ：0.1932

    --------------------

    [Dataset 回傳的 tensors]
    tokens_tensor  ：tensor([  101,  7367, 12676,  2630,  2273,  1005,  1055,  3165,  1011,  4309,
         2630, 13764, 10381,  4948,  8649, 24342,   102])

    segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

    label_tensor   ：0.1932000070810318

    --------------------

    [還原 tokens_tensors]
    [CLS] se ##iko blue men ' s silver - tone blue dial ch ##ron ##og ##raph [SEP]
    


In [13]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [14]:
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        reg_score = torch.stack([s[2] for s in samples])
    else:
        reg_score = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, reg_score


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 64
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [15]:
def check_dataloader(dataloader):
    data = next(iter(dataloader))

    tokens_tensors, segments_tensors, \
        masks_tensors, reg_score = data

    print(f"""
    tokens_tensors.shape   = {tokens_tensors.shape} 
    {tokens_tensors}
    ------------------------
    segments_tensors.shape = {segments_tensors.shape}
    {segments_tensors}
    ------------------------
    masks_tensors.shape    = {masks_tensors.shape}
    {masks_tensors}
    ------------------------
    label_ids.shape        = {reg_score.shape}
    {reg_score}
    """)

In [16]:
check_dataloader(trainloader)


    tokens_tensors.shape   = torch.Size([64, 28]) 
    tensor([[  101,  7367, 12676,  ...,     0,     0,     0],
        [  101,  2639,  1011,  ...,     0,     0,     0],
        [  101, 24890,  2611,  ...,     0,     0,     0],
        ...,
        [  101,  1056, 29602,  ...,     0,     0,     0],
        [  101,  1996,  1017,  ...,     0,     0,     0],
        [  101, 29450,  1058,  ...,     0,     0,     0]])
    ------------------------
    segments_tensors.shape = torch.Size([64, 28])
    tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
    ------------------------
    masks_tensors.shape    = torch.Size([64, 28])
    tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 

## Modeling
- Huggingface

In [17]:
# 載入一個可以做中文多分類任務的模型，n_class = 3
from transformers import BertForSequenceClassification

#PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 1

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

#clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=1, bias=True)


In [18]:
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "repetition_penalty": 1.0,
  "temperature": 1.0,
  "top_k": 50,
  "top_p": 1.0,
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,

In [25]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total_error = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors, reg_score_tensors = data
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors,
                            labels=reg_score_tensors)
            
            loss, logits = outputs[:2]
            #_, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += reg_score_tensors.size(0)
                total_error += float(loss.data) * total
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = logits.data
            else:
                predictions = torch.cat((predictions, logits.data))
            
            del tokens_tensors
            del segments_tensors
            del masks_tensors
            del reg_score_tensors
            torch.cuda.empty_cache()
    
    if compute_acc:
        omse = total_error / total
        return predictions, omse
    return predictions

In [None]:
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

In [19]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：109483009
線性分類器的參數量：769



In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

device: cuda:0


In [21]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 6  # 幸運數字
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()

        del outputs
        del tokens_tensors
        del segments_tensors
        del masks_tensors
        del labels
        
        torch.cuda.empty_cache()

        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    #_, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f' %
          (epoch + 1, running_loss))

[epoch 1] loss: 89431.751
[epoch 2] loss: 89178.224
[epoch 3] loss: 88589.665
[epoch 4] loss: 87287.502
[epoch 5] loss: 86109.136
[epoch 6] loss: 84800.436
Wall time: 28min 25s


In [22]:
# save model
torch.save(model.to("cpu"), "../model/bert_test_model")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [26]:
%%time
# 用分類模型預測測試集
predictions = get_predictions(model, trainloader)

Wall time: 19min 53s


In [39]:
result = predictions.numpy()[:64, ].ravel()

In [38]:
true_value = df_train['Click Popularity'][:64].to_numpy()

In [40]:
import numpy as np
from sklearn import metrics

sort_index = np.argsort(-1*true_value)
y = np.array([2] * 30 + [1] * 34)
scores = result[sort_index]

In [44]:
sort_index

array([50, 32, 48, 60, 56,  8, 55, 19, 26, 47, 57, 40, 15, 29, 17, 16, 31,
       35, 27, 12, 49, 21, 22, 34, 59,  1, 45, 41, 42, 43, 52, 61, 51, 46,
       58, 54, 53, 44, 39,  0, 37,  2,  3,  4,  5,  6,  7,  9, 10, 11, 13,
       14, 18, 20, 23, 24, 25, 28, 30, 62, 33, 36, 38, 63], dtype=int64)

In [45]:
result

array([ 0.5786721 ,  0.4731172 ,  1.2724991 ,  0.5953915 ,  0.98217916,
        2.859677  ,  0.77891695,  0.912122  ,  1.4449575 ,  0.6277391 ,
        0.47311452,  0.56747794,  0.07988021,  0.11860785,  0.64048314,
        0.45623112,  0.6713881 ,  0.6968294 ,  0.36715102,  0.6362337 ,
        0.34025955,  0.33606663,  0.7585087 ,  0.5409566 ,  0.58195615,
       12.652956  ,  0.48444462,  0.3234909 ,  1.0304863 , -0.28407213,
        0.73470926,  0.5768399 , 12.88768   ,  0.48148265,  0.4336914 ,
        0.22489761,  0.36202967,  0.78652614,  0.7793033 ,  0.30464888,
        0.7858947 ,  0.50743526,  0.30857506,  0.24206609,  0.56826574,
        0.4461549 ,  0.40718803,  1.3025625 ,  2.7594044 ,  0.43979415,
        9.14373   ,  0.8396159 ,  0.4253715 ,  0.92645556,  2.0673487 ,
        1.2378422 ,  0.38447294,  1.1953018 ,  0.7545968 ,  0.5629024 ,
        0.44944483,  2.0599785 ,  0.31129   ,  0.43424943], dtype=float32)

In [46]:
true_value

array([ 0.1932,  0.3865,  0.1932,  0.1932,  0.1932,  0.1932,  0.1932,
        0.1932,  2.3192,  0.1932,  0.1932,  0.1932,  0.3865,  0.1932,
        0.1932,  0.9663,  0.773 ,  0.773 ,  0.1932,  1.1596,  0.1932,
        0.3865,  0.3865,  0.1932,  0.1932,  0.1932,  1.1596,  0.3865,
        0.1932,  0.773 ,  0.1932,  0.773 ,  8.6971,  0.1932,  0.3865,
        0.5798,  0.1932,  0.1932,  0.1932,  0.1932,  0.9663,  0.1932,
        0.1932,  0.1932,  0.1932,  0.3865,  0.1932,  0.9663,  5.2182,
        0.3865, 13.5289,  0.1932,  0.1932,  0.1932,  0.1932,  2.3192,
        2.7057,  0.9663,  0.1932,  0.3865,  2.7057,  0.1932,  0.1932,
        0.1932])

In [41]:
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)

In [42]:
auc = (tpr -fpr + 1) / 2

In [43]:
print(auc)

[0.5        0.51666667 0.50196078 0.51862745 0.50392157 0.52058824
 0.49117647 0.5245098  0.50980392 0.54313725 0.45490196 0.47156863
 0.44215686 0.45882353 0.42941176 0.4627451  0.44803922 0.46470588
 0.40588235 0.42254902 0.39313725 0.40980392 0.39509804 0.42843137
 0.41372549 0.43039216 0.41568627 0.48235294 0.46764706 0.48431373
 0.45490196 0.47156863 0.42745098 0.46078431 0.44607843 0.4627451
 0.44803922 0.48137255 0.46666667 0.5       ]


In [None]:
import numpy as np
from sklearn import metrics
y = np.array([1, 1, 2, 2])
scores = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)

In [None]:
class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Examples::
        from transformers import BertTokenizer, BertForSequenceClassification
        import torch
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
        """

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)