In [4]:
from google.colab import drive

# Mount Google Drive so files stored there are reachable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
# Optionally cd to the folder where train.csv and test.csv live, for example:
# %cd /content/drive/MyDrive/ML_CHALLENGE_dataset



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Run this cell once
!pip install -q numpy pandas scikit-learn lightgbm catboost tqdm sentence-transformers timm torch torchvision pillow


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import os, re, gc
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
from PIL import Image

# Small helpers
def clean_text(s):
    """Normalize a raw text value into a single lowercase line.

    Missing values (as judged by ``pd.isna``) become an empty string. Any other
    value is converted with ``str``, has newlines replaced by spaces, is stripped
    at both ends, has every run of whitespace collapsed to one space, and is
    lowercased.
    """
    if pd.isna(s):
        return ""
    flattened = str(s).replace('\n', ' ').strip()
    collapsed = re.sub(r'\s+', ' ', flattened)
    return collapsed.lower()

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2))``.
    Where both values are zero the denominator would be zero; it is replaced by
    1.0 so those positions contribute an error of 0 instead of NaN.

    Args:
        y_true: ground-truth values (scalar or array-like, converted to float).
        y_pred: predicted values (scalar or array-like, converted to float).

    Returns:
        The SMAPE value as a NumPy float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # np.where builds a new array, so this also works for scalar (0-d) inputs,
    # where the sum above is a NumPy scalar that rejects masked assignment.
    denom = np.where(denom == 0, 1.0, denom)
    return 100 * np.mean(np.abs(y_pred - y_true) / denom)


In [7]:
# Single source of truth for where the dataset lives. Every path below is built
# from it: os.path.join discards all earlier components when a later component is
# an absolute path, so the folder must be set here rather than repeated per file.
DATASET_FOLDER = '/content/drive/MyDrive/ML_CHALLENGE_dataset'
TRAIN_CSV = os.path.join(DATASET_FOLDER, 'train.csv')
TEST_CSV  = os.path.join(DATASET_FOLDER, 'test.csv')
OUTPUT_CSV = os.path.join(DATASET_FOLDER, 'sample_test_out.csv')
IMAGE_FOLDER = os.path.join(DATASET_FOLDER, 'images')  # change if images in other folder

train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

print("Train rows:", len(train), "Test rows:", len(test))
train.head()

Train rows: 75000 Test rows: 75000


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [8]:
print("Train columns:", train.columns.tolist())
print("Test columns:", test.columns.tolist())
print("Price nulls in train:", train['price'].isna().sum())  # remove/handle if >0

# Name of the text column the rest of the notebook reads, and the alternative
# names accepted in its place (checked in this order).
TEXT_COL = 'catalog_content'
TEXT_COL_FALLBACKS = ['description', 'text', 'product_description', 'title']


def _ensure_text_column(df):
    """Return ``df`` with a ``TEXT_COL`` column when one can be derived.

    If ``TEXT_COL`` is already present the frame is returned unchanged.
    Otherwise the first column from ``TEXT_COL_FALLBACKS`` found in ``df`` is
    renamed to ``TEXT_COL`` (on a renamed copy). If none is found the frame is
    returned unchanged.
    """
    if TEXT_COL in df.columns:
        return df
    for candidate in TEXT_COL_FALLBACKS:
        if candidate in df.columns:
            return df.rename(columns={candidate: TEXT_COL})
    return df


# Apply the same fallback to both splits, so a test file that uses an alternative
# column name keeps its text instead of being filled with empty strings.
train = _ensure_text_column(train)
test = _ensure_text_column(test)

if TEXT_COL not in test.columns and TEXT_COL in train.columns:
    # last resort: create an empty column in test if nothing usable was found
    test[TEXT_COL] = ""


Train columns: ['sample_id', 'catalog_content', 'image_link', 'price']
Test columns: ['sample_id', 'catalog_content', 'image_link']
Price nulls in train: 0


In [9]:
def make_numeric_features(df):
    """Add a cleaned text column and simple text-size features to ``df``.

    The frame is modified in place and also returned. Columns written:
    ``catalog_clean`` (``clean_text`` applied to ``catalog_content``),
    ``len_text`` (character count of the cleaned text), ``num_tokens``
    (number of whitespace-separated tokens) and ``num_digits`` (number of
    characters matching ``\\d``).

    Args:
        df: DataFrame containing a ``catalog_content`` column.

    Returns:
        The same DataFrame object with the four columns added.
    """
    # Clean BEFORE casting to str: casting first turns a missing value into the
    # literal text "nan", which bypasses clean_text's missing-value guard and
    # yields bogus features (e.g. len_text == 3) for rows without content.
    df['catalog_clean'] = df['catalog_content'].apply(clean_text).astype(str)
    # example numeric features
    df['len_text'] = df['catalog_clean'].str.len().fillna(0).astype(int)
    df['num_tokens'] = df['catalog_clean'].str.split().str.len().fillna(0).astype(int)
    df['num_digits'] = df['catalog_clean'].str.count(r'\d').fillna(0).astype(int)
    return df

# Add the cleaned-text and numeric feature columns to both splits (train first).
train, test = [make_numeric_features(frame) for frame in (train, test)]


In [10]:
# Encode the cleaned catalog text of both splits with a pretrained sentence encoder.
print("Loading text embedding model (all-MiniLM-L6-v2)...")
text_model = SentenceTransformer('all-MiniLM-L6-v2')

# Plain Python lists of strings, one entry per row (missing values become "").
train_texts, test_texts = [
    frame['catalog_clean'].fillna("").tolist() for frame in (train, test)
]

# Train is encoded first, then test; both use the same batch size and progress bar.
train_text_emb, test_text_emb = [
    text_model.encode(texts, batch_size=64, show_progress_bar=True)
    for texts in (train_texts, test_texts)
]

print("train_text_emb shape:", train_text_emb.shape)


Loading text embedding model (all-MiniLM-L6-v2)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

train_text_emb shape: (75000, 384)


In [11]:
# Only run if image column exists and images are available
if 'image_name' in train.columns and os.path.isdir(IMAGE_FOLDER):
    import torch, timm
    from timm.data import resolve_data_config
    from torchvision import transforms

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print("Using device:", device)
    # num_classes=0 builds the network without its classification head, so the
    # forward pass returns pooled feature vectors. This replaces assigning
    # nn.Identity() inside a try/except whose fallback could not be reached.
    img_model = timm.create_model('tf_efficientnet_b3_ns', pretrained=True, num_classes=0)
    img_model.eval().to(device)

    IMG_SIZE = 300  # square side length (pixels) every image is resized to
    # Pretrained weights expect inputs normalized with the statistics they were
    # trained with; read them from the model's own data config.
    data_cfg = resolve_data_config({}, model=img_model)
    tfm = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),  # scales to [0,1]
        transforms.Normalize(mean=data_cfg['mean'], std=data_cfg['std']),
    ])

    def load_img_tensor(path):
        """Load the image at ``path`` as a transformed 3 x IMG_SIZE x IMG_SIZE tensor.

        Best effort: if the file is missing or cannot be decoded, an all-zero
        tensor of the same shape is returned so that row alignment is preserved.
        """
        try:
            # The context manager closes the file handle once the pixels are read.
            with Image.open(path) as im:
                return tfm(im.convert('RGB'))
        except Exception:
            # Deliberately broad (but not bare, so KeyboardInterrupt still
            # propagates): any unreadable image becomes a zero placeholder.
            return torch.zeros(3, IMG_SIZE, IMG_SIZE)

    def get_image_embeddings(df, batch_size=16):
        """Return one embedding row per row of ``df``, in the same order.

        Args:
            df: DataFrame with an ``image_name`` column holding file names
                relative to ``IMAGE_FOLDER``; missing names are treated as "".
            batch_size: number of images sent through the model at once.

        Returns:
            A 2-D NumPy array stacking the model outputs of all batches.
        """
        names = df['image_name'].fillna("").tolist()
        emb_list = []
        batch = []
        for i, name in enumerate(tqdm(names, total=len(names))):
            batch.append(load_img_tensor(os.path.join(IMAGE_FOLDER, name)))
            # Flush when the batch is full, or on the last row for a partial batch.
            if len(batch) == batch_size or i == len(names) - 1:
                with torch.no_grad():
                    x = torch.stack(batch).to(device)
                    emb_list.append(img_model(x).cpu().numpy())
                batch = []
        return np.vstack(emb_list)

    train_img_emb = get_image_embeddings(train)
    test_img_emb  = get_image_embeddings(test)
    print("Image embeddings shapes:", train_img_emb.shape, test_img_emb.shape)
else:
    train_img_emb = None
    test_img_emb = None
    print("Skipping image embeddings (no image_name column or images folder missing).")


Skipping image embeddings (no image_name column or images folder missing).


In [12]:
# Hand-crafted numeric columns to include alongside the embeddings
dense_cols = ['len_text', 'num_tokens', 'num_digits']

# Image embeddings are optional: they contribute a block only when they were computed.
train_img_parts = [] if train_img_emb is None else [train_img_emb]
test_img_parts  = [] if train_img_emb is None else [test_img_emb]

# Column order: text embeddings, image embeddings (if any), dense numeric columns.
X_train_parts = [train_text_emb] + train_img_parts + [train[dense_cols].fillna(0).values]
X_test_parts  = [test_text_emb] + test_img_parts + [test[dense_cols].fillna(0).values]

X_train = np.hstack(X_train_parts)
X_test  = np.hstack(X_test_parts)

print("Final X_train shape:", X_train.shape, "X_test shape:", X_test.shape)

# Standardize: statistics are fitted on train only and reused for test.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train on log1p(price); predictions are mapped back with expm1 later.
y = np.log1p(train['price'].values)


Final X_train shape: (75000, 387) X_test shape: (75000, 387)


In [17]:
NFOLD = 5
NUM_BOOST_ROUND = 2000       # upper bound on boosting rounds per fold
EARLY_STOPPING_ROUNDS = 100  # stop after this many rounds without improvement
LOG_PERIOD = 200             # print the validation metric every LOG_PERIOD rounds

# Identical for every fold, so defined once instead of being rebuilt in the loop.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 128,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'verbose': -1
}

kf = KFold(n_splits=NFOLD, shuffle=True, random_state=42)

# oof holds out-of-fold predictions for train rows; preds averages the test
# predictions of the NFOLD models. Both are in log1p(price) space.
oof = np.zeros(len(train))
preds = np.zeros(len(test))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\nFold {fold+1}/{NFOLD}")
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val)

    # `verbose` of lgb.early_stopping is a boolean flag, not a logging period;
    # periodic metric logging is the job of the separate log_evaluation callback.
    model = lgb.train(params, dtrain, num_boost_round=NUM_BOOST_ROUND,
                      valid_sets=[dval],
                      callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS),
                                 lgb.log_evaluation(LOG_PERIOD)])

    oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    preds += model.predict(X_test, num_iteration=model.best_iteration) / NFOLD

# convert back from log space
oof_price = np.expm1(oof)
pred_price = np.expm1(preds)

print("OOF SMAPE:", smape(train['price'].values, oof_price))


Fold 1/5
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1999]	valid_0's rmse: 0.752618

Fold 2/5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1637]	valid_0's rmse: 0.74175

Fold 3/5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1668]	valid_0's rmse: 0.739818

Fold 4/5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1161]	valid_0's rmse: 0.739541

Fold 5/5
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1977]	valid_0's rmse: 0.747459
OOF SMAPE: 57.45603948718133


In [19]:
# Pick the identifier column of test.csv: prefer 'sample_id', fall back to 'id'.
id_col = next((name for name in ('sample_id', 'id') if name in test.columns), None)
if id_col is None:
    # Neither column exists: number the rows and use that as the id.
    test['sample_id'] = np.arange(len(test))
    id_col = 'sample_id'

# Floor predictions at 0.01 before writing them out.
floored_price = np.maximum(pred_price, 0.01)
submission = pd.DataFrame({id_col: test[id_col], 'price': floored_price})
submission.to_csv(OUTPUT_CSV, index=False)
print("Saved submission to:", OUTPUT_CSV)
submission.head()

Saved submission to: /content/drive/MyDrive/ML_CHALLENGE_dataset/sample_test_out.csv


Unnamed: 0,sample_id,price
0,100179,15.063255
1,245611,19.209009
2,146263,23.652177
3,95658,11.439017
4,36806,22.722598


Saved submission to: /content/drive/MyDrive/ML_CHALLENGE_dataset/sample_test_out.csv


Unnamed: 0,sample_id,price
0,100179,15.063255
1,245611,19.209009
2,146263,23.652177
3,95658,11.439017
4,36806,22.722598
