In [1]:
import os
import csv
import numpy as np
from dotenv import load_dotenv
from tqdm.notebook import tqdm

from huggingface_hub import login
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import CosineAnnealingLR

from pricer.evaluator import evaluate
from pricer.items import Item

In [2]:
# Static configuration

LITE_MODEL = False
EPOCHS = 30
BATCH_SIZE = 64
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-4
N_FEATURES = 5000
VAL_SPLIT = 0.1
PATIENCE = 5
RANDOM_SEED = 42

In [3]:
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x21b3c376190>

In [4]:
load_dotenv(override=True)
hf_token = os.environ["HF_TOKEN"]
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
username = "ed-donner"
dataset = f"{username}/items_lite" if LITE_MODEL else f"{username}/items_full"

In [6]:
# Load dataset

train, val, test = Item.from_hub(dataset)
print(f"Loaded {len(train):,} training items, {len(val):,} validation_items, {len(test):,} test items")

Loaded 800,000 training items, 10,000 validation_items, 10,000 test items


In [7]:
# Human baseline from previous testing

human_predictions = []
with open("human_in.csv", "w", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    for t in test[:100]:
       writer.writerow([t.summary, 0])

In [8]:
human_predictions = []
with open("human_out.csv", "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        human_predictions.append(float(row[1]))

In [9]:
def human_pricer(item):
    idx =test.index(item)
    return human_predictions[idx]

In [10]:
human = human_pricer(test[0])
actual = test[0].price
print(f"Human predicted {human} for an item that actually cost: {actual}")

Human predicted 120.0 for an item that actually cost: 219.0


In [11]:
evaluate(human_pricer, test, size=100)

  0%|          | 0/100 [00:00<?, ?it/s]

[91m$99 [91m$184 [92m$12 [92m$15 [92m$18 [92m$10 [91m$119 [91m$135 [92m$6 [91m$270 [91m$643 [91m$329 [92m$15 [92m$26 [92m$24 [92m$18 [92m$29 [92m$25 [92m$25 [93m$53 [92m$35 [91m$126 [92m$25 [91m$127 [91m$273 [91m$398 [92m$55 [92m$6 [91m$101 [93m$51 [92m$30 [92m$5 [92m$35 [92m$9 [92m$10 [91m$419 [92m$25 [92m$11 [91m$186 [92m$33 [91m$161 [93m$51 [92m$23 [91m$155 [91m$150 [92m$4 [92m$31 [92m$18 [91m$115 [91m$82 [92m$25 [91m$111 [91m$410 [93m$75 [93m$67 [92m$34 [92m$8 [92m$10 [91m$122 [92m$28 [91m$116 [92m$17 [92m$19 [93m$60 [91m$599 [93m$60 [91m$160 [91m$355 [93m$75 [92m$34 [92m$17 [92m$2 [93m$70 [93m$76 [93m$41 [92m$9 [91m$226 [92m$5 [92m$5 [92m$4 [92m$0 [92m$7 [92m$5 [93m$74 [92m$7 [92m$10 [93m$68 [93m$74 [92m$5 [92m$3 [92m$17 [93m$45 [92m$5 [92m$16 [92m$0 [91m$153 [92m$2 [91m$122 [91m$150 [91m$355 

In [12]:
# Log transform prices to normalize the skewed distribution

y_raw = np.array([float(item.price) for item in train])
y = np.log1p(y_raw)

In [13]:
print(f"Raw prices - Mean: {y_raw.mean():.2f}, Median: {np.median(y_raw):.2f}, Std: {y_raw.std():.2f}")
print(f"Log prices - Mean: {y.mean():.2f}, Median: {np.median(y):.2f}, Std: {y.std():.2f}")

Raw prices - Mean: 140.57, Median: 80.10, Std: 160.07
Log prices - Mean: 4.43, Median: 4.40, Std: 1.03


In [14]:
# Bag of words vectorization with hashing (memory efficient, no vocab storage needed)

documents = [item.summary for item in train]
vectorizer = HashingVectorizer(n_features=N_FEATURES, stop_words="english", binary=True)
X = vectorizer.fit_transform(documents)

In [15]:
print(f"Features matrix shape: {X.shape}")

Features matrix shape: (800000, 5000)


In [16]:
# NN architecture

class ImprovedPriceNueralNetwork(nn.Module):
    """
    3 hidden layer network with Batchnorm and Dropout.
    Wider first layer (256) to handle 5000 dimensional sparse input
    and then progressively narrow.
    """

    def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(

            # Layer 1: Compress 5000 sparse features into dense representation
            nn.Linear(input_size, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            # Layer 2: Refine representation
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),

            # Layer 3: Final hidden layer
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),

            # Output: Single price prediction (in log space)
            nn.Linear(64, 1),
        )

    def forward(self, x):
        return self.net(x)


In [17]:
# Convert to Pytorch tensors

X_tensor = torch.FloatTensor(X.toarray())
y_tensor = torch.FloatTensor(y).unsqueeze(1)

In [18]:
# Train / validation split (val 10% per VAL_SPLIT config value)

X_train, X_val, y_train, y_val = train_test_split(
    X_tensor, y_tensor, test_size=VAL_SPLIT, random_state=RANDOM_SEED
)

print(f"Training samples: {X_train.shape[0]:.2f}")
print(f"Validation samples: {X_val.shape[0]:.2f}")

Training samples: 720000.00
Validation samples: 80000.00


In [19]:
# Dataloader for mini-back training

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [20]:
# Initialize model

input_size = X_tensor.shape[1]
model = ImprovedPriceNueralNetwork(input_size)

trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Traininable parameters: {trainable_params:,}")

Traininable parameters: 1,322,369


In [21]:
# Measure loss using HuberLoss - ore robust to outliers than MSE

loss_function = nn.HuberLoss(delta=1.0)

In [22]:
# Adam optimizer with weight decay for layer 2 regularization

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [23]:
# Cosine annealing to decay learning rate smoothly to zero

scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

In [24]:
# Training loop with eary stopping

best_val_loss = float("inf")
patience_counter = 0
train_losses = []
val_losses = []

for epoch in range(EPOCHS):
    # Training phase
    model.train()
    epoch_losses = []

    for batch_X, batch_y in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{EPOCHS}"):
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = loss_function(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    avg_train_loss = np.mean(epoch_losses)
    train_losses.append(avg_train_loss)

    # Validation phase
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = loss_function(val_outputs, y_val).item()
    val_losses.append(val_loss)

    # Step the learning rate schedule
    scheduler.step()
    current_lr = scheduler.get_last_lr()[0]

    print(
        f"Epoch [{epoch + 1}/EPOCHS] "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Loss: {val_loss:.4f} | "
        f"LR: {current_lr}:.6f"
    )

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")
        patience_counter = 0
        print(f"New best model saved (val_loss: {val_loss}:.4f)")
    else:
        patience_counter += 1
        print(f"No improvement ({patience_counter}/{PATIENCE})")
        if patience_counter >= PATIENCE:
            print(f"\nEarly stopping triggered at epoch {epoch + 1}")
            break

Epoch 1/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [1/EPOCHS] Train Loss: 0.2807 | Val Loss: 0.2011 | LR: 0.0009972609476841367:.6f
New best model saved (val_loss: 0.20110392570495605:.4f)


Epoch 2/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [2/EPOCHS] Train Loss: 0.2219 | Val Loss: 0.1945 | LR: 0.0009890738003669028:.6f
New best model saved (val_loss: 0.19454076886177063:.4f)


Epoch 3/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [3/EPOCHS] Train Loss: 0.2127 | Val Loss: 0.1901 | LR: 0.0009755282581475768:.6f
New best model saved (val_loss: 0.19012172520160675:.4f)


Epoch 4/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [4/EPOCHS] Train Loss: 0.2092 | Val Loss: 0.1891 | LR: 0.0009567727288213003:.6f
New best model saved (val_loss: 0.1890687197446823:.4f)


Epoch 5/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [5/EPOCHS] Train Loss: 0.2071 | Val Loss: 0.1883 | LR: 0.0009330127018922195:.6f
New best model saved (val_loss: 0.18833616375923157:.4f)


Epoch 6/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [6/EPOCHS] Train Loss: 0.2055 | Val Loss: 0.1881 | LR: 0.0009045084971874739:.6f
New best model saved (val_loss: 0.18808963894844055:.4f)


Epoch 7/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [7/EPOCHS] Train Loss: 0.2042 | Val Loss: 0.1869 | LR: 0.0008715724127386972:.6f
New best model saved (val_loss: 0.18686728179454803:.4f)


Epoch 8/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [8/EPOCHS] Train Loss: 0.2032 | Val Loss: 0.1841 | LR: 0.0008345653031794292:.6f
New best model saved (val_loss: 0.18411953747272491:.4f)


Epoch 9/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [9/EPOCHS] Train Loss: 0.2016 | Val Loss: 0.1834 | LR: 0.0007938926261462366:.6f
New best model saved (val_loss: 0.18344877660274506:.4f)


Epoch 10/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [10/EPOCHS] Train Loss: 0.2000 | Val Loss: 0.1819 | LR: 0.00075:.6f
New best model saved (val_loss: 0.18188397586345673:.4f)


Epoch 11/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [11/EPOCHS] Train Loss: 0.1980 | Val Loss: 0.1813 | LR: 0.0007033683215379003:.6f
New best model saved (val_loss: 0.1813039928674698:.4f)


Epoch 12/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [12/EPOCHS] Train Loss: 0.1962 | Val Loss: 0.1798 | LR: 0.0006545084971874739:.6f
New best model saved (val_loss: 0.17977741360664368:.4f)


Epoch 13/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [13/EPOCHS] Train Loss: 0.1938 | Val Loss: 0.1765 | LR: 0.0006039558454088797:.6f
New best model saved (val_loss: 0.17649401724338531:.4f)


Epoch 14/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [14/EPOCHS] Train Loss: 0.1912 | Val Loss: 0.1755 | LR: 0.0005522642316338269:.6f
New best model saved (val_loss: 0.1754743456840515:.4f)


Epoch 15/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [15/EPOCHS] Train Loss: 0.1880 | Val Loss: 0.1746 | LR: 0.0005000000000000002:.6f
New best model saved (val_loss: 0.1745610535144806:.4f)


Epoch 16/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [16/EPOCHS] Train Loss: 0.1849 | Val Loss: 0.1723 | LR: 0.0004477357683661734:.6f
New best model saved (val_loss: 0.17231619358062744:.4f)


Epoch 17/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [17/EPOCHS] Train Loss: 0.1812 | Val Loss: 0.1704 | LR: 0.00039604415459112036:.6f
New best model saved (val_loss: 0.1704096496105194:.4f)


Epoch 18/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [18/EPOCHS] Train Loss: 0.1771 | Val Loss: 0.1679 | LR: 0.0003454915028125264:.6f
New best model saved (val_loss: 0.1679464727640152:.4f)


Epoch 19/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [19/EPOCHS] Train Loss: 0.1722 | Val Loss: 0.1662 | LR: 0.0002966316784621:.6f
New best model saved (val_loss: 0.16615024209022522:.4f)


Epoch 20/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [20/EPOCHS] Train Loss: 0.1670 | Val Loss: 0.1634 | LR: 0.00025000000000000017:.6f
New best model saved (val_loss: 0.1634192168712616:.4f)


Epoch 21/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [21/EPOCHS] Train Loss: 0.1605 | Val Loss: 0.1616 | LR: 0.00020610737385376354:.6f
New best model saved (val_loss: 0.16160084307193756:.4f)


Epoch 22/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [22/EPOCHS] Train Loss: 0.1547 | Val Loss: 0.1606 | LR: 0.00016543469682057108:.6f
New best model saved (val_loss: 0.1605968326330185:.4f)


Epoch 23/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [23/EPOCHS] Train Loss: 0.1478 | Val Loss: 0.1593 | LR: 0.00012842758726130298:.6f
New best model saved (val_loss: 0.15927919745445251:.4f)


Epoch 24/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [24/EPOCHS] Train Loss: 0.1411 | Val Loss: 0.1587 | LR: 9.549150281252637e-05:.6f
New best model saved (val_loss: 0.15869849920272827:.4f)


Epoch 25/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [25/EPOCHS] Train Loss: 0.1343 | Val Loss: 0.1576 | LR: 6.698729810778068e-05:.6f
New best model saved (val_loss: 0.15758903324604034:.4f)


Epoch 26/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [26/EPOCHS] Train Loss: 0.1274 | Val Loss: 0.1570 | LR: 4.322727117869953e-05:.6f
New best model saved (val_loss: 0.15698786079883575:.4f)


Epoch 27/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [27/EPOCHS] Train Loss: 0.1215 | Val Loss: 0.1572 | LR: 2.447174185242324e-05:.6f
No improvement (1/5)


Epoch 28/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [28/EPOCHS] Train Loss: 0.1170 | Val Loss: 0.1571 | LR: 1.092619963309716e-05:.6f
No improvement (2/5)


Epoch 29/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [29/EPOCHS] Train Loss: 0.1140 | Val Loss: 0.1569 | LR: 2.7390523158633003e-06:.6f
New best model saved (val_loss: 0.1568501740694046:.4f)


Epoch 30/30:   0%|          | 0/11250 [00:00<?, ?it/s]

Epoch [30/EPOCHS] Train Loss: 0.1122 | Val Loss: 0.1570 | LR: 0.0:.6f
No improvement (1/5)


In [26]:
# Load the best model weights

model.load_state_dict(torch.load("best_model.pt"))
print(f"\nLoaded best model with val_los: {best_val_loss:.4f}")


Loaded best model with val_los: 0.1569


In [27]:
# Evaluation

def neural_network(item):
    """Predict price for a single item.  Returns price in original dollar scale."""
    model.eval()
    with torch.no_grad():
        vector = vectorizer.transform([item.summary])
        vector = torch.FloatTensor(vector.toarray())
        log_prediction = model(vector)[0].item()
    # Reverse the log1p transform and clamp to non negative
    return max(0, np.expm1(log_prediction))

In [28]:
# test

sample = test[0]
predicted = neural_network(sample)
print(f"Predicted: ${predicted:.2f} | Actual: ${sample.price:.2f}")

Predicted: $91.70 | Actual: $219.00


In [29]:
evaluate(neural_network, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[91m$127 [93m$76 [92m$17 [93m$61 [91m$99 [91m$123 [93m$48 [92m$10 [92m$9 [93m$103 [93m$257 [93m$128 [92m$35 [91m$109 [92m$26 [92m$5 [92m$11 [92m$15 [92m$13 [92m$32 [93m$43 [92m$35 [93m$43 [91m$140 [93m$92 [91m$239 [91m$260 [92m$6 [93m$63 [93m$53 [92m$38 [92m$16 [92m$20 [93m$47 [92m$4 [93m$253 [92m$21 [92m$20 [91m$102 [92m$2 [91m$96 [92m$22 [92m$9 [92m$16 [91m$116 [92m$10 [92m$3 [92m$2 [91m$81 [92m$13 [92m$1 [91m$80 [91m$268 [93m$75 [92m$28 [93m$76 [92m$17 [93m$41 [92m$10 [92m$9 [91m$142 [93m$42 [92m$30 [92m$20 [93m$238 [92m$27 [92m$28 [91m$286 [92m$20 [91m$115 [92m$15 [92m$7 [92m$19 [93m$77 [92m$22 [92m$11 [91m$121 [92m$6 [92m$17 [92m$3 [93m$45 [92m$22 [93m$50 [93m$69 [92m$9 [91m$115 [92m$21 [93m$84 [93m$42 [93m$50 [92m$0 [92m$13 [92m$2 [92m$1 [92m$9 [91m$119 [92m$5 [92m$4 [93m$73 [93m$187 [92m$12 [92m$34 [92m$3 [92m$39 [92m$25 [92m$4 [92m$16 [91m$262 [92m$6 [91m$130 [92m$25 