# Fine-Tuning Bert for Price Regression

## Dependencies

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from transformers import BertModel, BertTokenizer

import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


## Load and Preprocess Data

In [2]:
# REPLACE WITH DATA PATH
PATH_TO_DATA = "../../mercari-data/train.tsv"
#PATH_TO_DATA = "../../data"

In [3]:
df = pd.read_csv(PATH_TO_DATA, sep="\t", index_col="train_id")

In [4]:
df

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
...,...,...,...,...,...,...,...
1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."
1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,0,Little mermaid handmade dress never worn size 2t
1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape."
1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...


In [5]:
# subset data to make it faster (DELETE IN LATER RUN)
df = df.head(25)

In [6]:
df.isna().sum()

name                 0
item_condition_id    0
category_name        0
brand_name           7
price                0
shipping             0
item_description     0
dtype: int64

In [7]:
condition_mapper = {1:"Poor", 2:"Okay", 3:"Good", 4:"Excellent", 5:"Like New"}
shipping_mapper = {0: "No Shipping", 1: "Includes Shipping"}

In [8]:
df["brand_name"] = df["brand_name"].fillna("No Brand")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["brand_name"] = df["brand_name"].fillna("No Brand")


In [9]:
df.loc[:,"item_condition_id"] = df["item_condition_id"].map(condition_mapper).astype(str)
df.loc[:,"shipping"] = df["shipping"].map(shipping_mapper).astype(str)

 'Okay' 'Poor' 'Okay' 'Poor' 'Good' 'Poor' 'Poor' 'Poor' 'Poor' 'Okay'
 'Good' 'Poor' 'Good' 'Poor' 'Okay']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,"item_condition_id"] = df["item_condition_id"].map(condition_mapper).astype(str)
 'No Shipping' 'No Shipping' 'No Shipping' 'Includes Shipping'
 'No Shipping' 'No Shipping' 'Includes Shipping' 'No Shipping'
 'No Shipping' 'Includes Shipping' 'No Shipping' 'Includes Shipping'
 'Includes Shipping' 'Includes Shipping' 'Includes Shipping' 'No Shipping'
 'Includes Shipping' 'No Shipping' 'No Shipping' 'Includes Shipping'
 'No Shipping']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:,"shipping"] = df["shipping"].map(shipping_mapper).astype(str)


In [10]:
# to be suitable for the bert model, we create a dataset with 
combined = "Item Name: " + df["name"] + \
            " Description: " + df["item_description"] + \
            " Condition: " + df["item_condition_id"] + \
            " Category: " + df["category_name"] + \
            " Brand " + df["brand_name"] + \
            " Shipping: " + df["shipping"]

In [11]:
data = pd.concat([combined, df["price"]], axis=1)
data.columns = ["description", "price"]

In [12]:
data

Unnamed: 0_level_0,description,price
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Item Name: MLB Cincinnati Reds T Shirt Size XL...,10.0
1,Item Name: Razer BlackWidow Chroma Keyboard De...,52.0
2,Item Name: AVA-VIV Blouse Description: Adorabl...,10.0
3,Item Name: Leather Horse Statues Description: ...,35.0
4,Item Name: 24K GOLD plated rose Description: C...,44.0
5,Item Name: Bundled items requested for Ruie De...,59.0
6,Item Name: Acacia pacific tides santorini top ...,64.0
7,Item Name: Girls cheer and tumbling bundle of ...,6.0
8,Item Name: Girls Nike Pro shorts Description: ...,19.0
9,Item Name: Porcelain clown doll checker pants ...,8.0


In [54]:
# inspect one item
print(f"Data: {data.loc[0]["description"]}")
print(f"Label: {data.loc[0]["price"]}")

Data: Item Name: MLB Cincinnati Reds T Shirt Size XL Description: No description yet Condition: Good Category: Men/Tops/T-shirts Brand No Brand Shipping: Includes Shipping
Label: 10.0


In [74]:
# check how long descriptions are
data["description"].str.split().str.len().mean()
# we can use a max token length of ~80 later

44.48

## Dataset and Regression Class

In [13]:
class RegressionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = float(self.labels[idx])

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].flatten(),
            "attention_mask": inputs["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.float)
        }


class BERTRegression(nn.Module):
    def __init__(self):
        super(BERTRegression, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits.squeeze(-1)

## Split Dataset and Instantiate Dataloaders

In [76]:
# Set data and targets
texts = data.description.values
labels = data.price.values

In [15]:
# instantiate
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_len = 80

In [77]:
# split data into training and test sets
_texts, test_texts, _labels, test_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)

# split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(_texts, _labels, test_size=0.111, random_state=42)

In [17]:
# Create datasets and data loaders
train_dataset = RegressionDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = RegressionDataset(val_texts, val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

## Instantiate Model and Set Training Parameters

In [18]:
# Instantiate the model, optimizer, and loss function
model = BERTRegression()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.MSELoss()

# Training loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BERTRegression(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

## Training Loop

In [19]:
for epoch in range(num_epochs):
    model.train()

    i = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        i += 1
        if i // 10 == 0:
            print(f"batch {i}")

    # Validation loop
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask)
            val_loss = criterion(outputs, labels)
            val_losses.append(val_loss.item())

    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {np.mean(val_losses)}")

batch 1
batch 2
batch 3
Epoch 1/3, Validation Loss: 318.81097412109375
batch 1
batch 2
batch 3
Epoch 2/3, Validation Loss: 301.62847900390625
batch 1
batch 2
batch 3
Epoch 3/3, Validation Loss: 292.0242919921875


## Test Loop

In [78]:
# Test the model
# test_texts = [...]  # List of test texts
# test_labels = [...]  # List of corresponding regression labels

test_dataset = RegressionDataset(test_texts, test_labels, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids, attention_mask)
        predictions.extend(outputs.cpu().numpy())

# Calculate and print mean squared error
mse = mean_squared_error(test_labels, predictions)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 126.84798567886423


## Save the Model

In [95]:
torch.save(model.state_dict(), 'bert_regression_model.pth')

## Gradio Application

In [125]:
import torch
from transformers import BertTokenizer
from regression_models import BERTRegression

max_len = 80

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load model architecture
bertregressor = BERTRegression()
bertregressor.load_state_dict(torch.load('bert_regression_model.pth', map_location=torch.device('cpu')))
bertregressor.eval()

def predict_price(name, item_condition, category, brand_name, shipping_included, item_description):
    print((name, item_condition, category, brand_name, shipping_included, item_description))
    # Preprocess Input
    if shipping_included:
        shipping_str = "Includes Shipping"
    else:
        shipping_str = "No Shipping"
        
    combined = "Item Name: " + name + \
            " Description: " + item_description + \
            " Condition: " + item_condition + \
            " Category: " + category + \
            " Brand " + brand_name + \
            " Shipping: " + shipping_str
    
    inputs = tokenizer.encode_plus(
        combined,
        None,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    with torch.no_grad():
        output = bertregressor(input_ids, attention_mask)
    
    return output.item()

    
demo = gr.Interface(
    
    fn = predict_price,
    
    inputs = [gr.Textbox(label="Item Name"), 
              gr.Dropdown(['Poor', 'Okay', 'Good', 'Excellent', 'Like New'], label="Item Condition", info="What condition is the item in?"),
              gr.Textbox(label="Category on Mercari"),
              gr.Textbox(label="Brand"),
              gr.Checkbox(label="Shipping Included"),
              gr.Textbox(label="Description")
             ],
    
    #outputs = gr.Textbox()
    outputs= gr.Number()
)


demo.launch()

Running on local URL:  http://127.0.0.1:7883

To create a public link, set `share=True` in `launch()`.




In [114]:
name="Razer BlackWidow Chroma Keyboard"
item_condition="Excellent"
category= "Electronics/Computers & Tablets/Components"
brand_name= "Razer"
shipping_included= True
item_description= "This keyboard is in great condition" 

predict_price(name, item_condition, category, brand_name, shipping_included, item_description)

('Razer BlackWidow Chroma Keyboard', 'Excellent', 'Electronics/Computers & Tablets/Components', 'Razer', True, 'This keyboard is in great condition')
Item Name: Razer BlackWidow Chroma Keyboard Description: This keyboard is in great condition Condition: Excellent Category: Electronics/Computers & Tablets/Components Brand Razer Shipping: Includes Shipping


'Item Name: Razer BlackWidow Chroma Keyboard Description: This keyboard is in great condition Condition: Excellent Category: Electronics/Computers & Tablets/Components Brand Razer Shipping: Includes Shipping'

In [69]:
x_in = tokenizer.encode("Expensive gucci sweatshirt", return_tensors="pt")

x_in = tokenizer.encode_plus(
            "puffer jacket with diamond padding",
            None,
            add_special_tokens=True,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
print(x_in)

{'input_ids': tensor([[  101, 23893,  2121,  6598,  2007,  6323, 11687,  4667,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [70]:
model(x_in["input_ids"], x_in["attention_mask"])

tensor([1.0163], grad_fn=<SqueezeBackward1>)

In [None]:
self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

In [46]:
for x in train_loader:
    d = x
    in_ = d["input_ids"][3:8]
    attn_msk = d["attention_mask"][3:8]
    break

In [47]:
d["attention_mask"][3:8]

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [48]:
model(in_, attn_msk) 

tensor([1.9241, 1.6717, 1.9484, 1.9869, 1.9255], grad_fn=<SqueezeBackward1>)