In [1]:

# Mount Google Drive at /content/drive so the dataset folder stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Move into the competition folder on Drive (here "MyDrive/ML_CHALLENGE_dataset/"); adjust the path if yours is stored elsewhere.
%cd /content/drive/MyDrive/ML_CHALLENGE_dataset

/content/drive/MyDrive/ML_CHALLENGE_dataset


In [3]:

!pip install numpy pandas scikit-learn lightgbm tqdm scipy



In [4]:
# === 4. Imports
import gc
import os
import re

import lightgbm as lgb
import numpy as np
import pandas as pd
from lightgbm import early_stopping, log_evaluation
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from tqdm import tqdm

In [5]:
# === 5. Define helper functions
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``100 * mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2))``.
    Where both values are zero the denominator would be zero; it is replaced
    by 1 there, so such positions contribute 0 to the mean.

    Args:
        y_true: Scalar or array-like of ground-truth values.
        y_pred: Scalar or array-like of predictions, broadcastable against
            ``y_true``.

    Returns:
        The SMAPE as a float (between 0 and 200 for finite inputs).
    """
    # atleast_1d sends plain scalars through the same vectorised path; without
    # it the arithmetic below yields a numpy scalar that cannot be masked.
    actual = np.atleast_1d(np.asarray(y_true, dtype=float))
    predicted = np.atleast_1d(np.asarray(y_pred, dtype=float))
    half_sum = (np.abs(actual) + np.abs(predicted)) / 2.0
    # Build a new array instead of assigning through a boolean mask in place.
    safe_denom = np.where(half_sum == 0, 1.0, half_sum)
    return 100 * np.mean(np.abs(predicted - actual) / safe_denom)

def clean_text(s):
    """Normalise a raw value to one lower-cased, whitespace-collapsed line.

    Values that ``pd.isna`` reports as missing become ``""``; anything else
    is converted with ``str`` before cleaning.
    """
    if pd.isna(s):
        return ""
    flattened = str(s).replace('\n', ' ')
    trimmed = flattened.strip()
    # Every remaining run of whitespace collapses to a single space.
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed.lower()

# A number (decimal point or decimal comma allowed), optionally followed by a
# quantity/unit word, ending on a word boundary. Compiled once at definition time.
_IPQ_PATTERN = re.compile(
    r'(\d+(?:[\.,]\d+)?)\s*(?:x|pack|packs|pk|pcs|count|ct|ml|g|kg|l|litre|oz|piece|pieces)?\b'
)


def parse_ipq(text):
    """Extract the first quantity-like number from ``text``.

    The unit word in the pattern is optional, so this returns the first number
    in the (lower-cased) text that ends on a word boundary, whether or not a
    unit follows it. A comma inside the number is read as a decimal separator,
    so ``"1,5"`` gives ``1.5`` and ``"1,000"`` gives ``1.0``.

    Args:
        text: Any value; it is converted with ``str`` unless ``pd.isna``
            reports it as missing.

    Returns:
        The parsed number as a float, or ``1.0`` when the value is missing,
        contains no matching number, or the matched text cannot be converted.
    """
    if pd.isna(text):
        return 1.0
    match = _IPQ_PATTERN.search(str(text).lower())
    if match is None:
        return 1.0
    try:
        return float(match.group(1).replace(',', '.'))
    except ValueError:
        # Only a failed conversion falls back to the default; other exceptions
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        return 1.0

def make_numeric_features(df):
    """Add cleaned text and simple numeric features derived from ``catalog_content``.

    The new columns are written onto ``df`` itself (the frame is modified in
    place) and the same object is returned.

    Columns added:
        catalog_clean: ``clean_text`` applied to ``catalog_content``.
        ipq: number extracted from ``catalog_clean`` by ``parse_ipq``.
        ipq_log: ``log1p`` of ``ipq``.
        len_text: character length of ``catalog_clean``.
        num_digits: number of digit characters in ``catalog_clean``.
        num_tokens: number of whitespace-separated tokens in ``catalog_clean``.

    Args:
        df: DataFrame with a ``catalog_content`` column.

    Returns:
        ``df``, with the columns above added.
    """
    # Apply clean_text to the raw values. Casting to str first would turn a
    # missing description into the literal text "nan" and bypass clean_text's
    # missing-value handling (which maps it to "").
    df['catalog_clean'] = df['catalog_content'].apply(clean_text)
    df['ipq'] = df['catalog_clean'].apply(parse_ipq)
    df['ipq_log'] = np.log1p(df['ipq'])

    # clean_text always returns a str, so the .str accessors below yield no NaN.
    cleaned = df['catalog_clean']
    df['len_text'] = cleaned.str.len().astype(int)
    df['num_digits'] = cleaned.str.count(r'\d').astype(int)
    df['num_tokens'] = cleaned.str.split().str.len().astype(int)
    return df

In [10]:
# === 6. Paths (adjust if dataset in a different folder)
# '.' resolves against the current working directory, so the CSV paths below
# are relative to wherever the notebook has changed directory to.
DATASET_FOLDER = '.'
TRAIN_CSV = os.path.join(DATASET_FOLDER, 'train.csv')        # training data (input)
TEST_CSV  = os.path.join(DATASET_FOLDER, 'test.csv')         # test data (input)
OUTPUT_CSV = os.path.join(DATASET_FOLDER, 'test_out.csv')    # predictions file (output)

In [11]:
# === 7. Load data
# Read the train and test CSVs and print their shapes as a quick sanity check.
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)
print(train.shape, test.shape)

# Add the cleaned-text and numeric feature columns. make_numeric_features
# writes them onto the frame it is given and returns that same frame.
train = make_numeric_features(train)
test  = make_numeric_features(test)

(75000, 4) (75000, 3)


In [12]:
# === 8. TF-IDF features
print("Fitting TF-IDF...")
# Unigrams and bigrams; terms seen in fewer than 3 documents are dropped and the
# vocabulary is capped at 50,000 terms.
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2), min_df=3)
# The vocabulary and IDF weights are fitted on train and test text together
# (only the text is used here, not the prices).
all_text = pd.concat([train['catalog_clean'], test['catalog_clean']], axis=0)
tfidf.fit(all_text)

X_text = tfidf.transform(train['catalog_clean'])
X_text_test = tfidf.transform(test['catalog_clean'])

# Hand-made numeric features appended after the TF-IDF columns.
dense_cols = ['ipq_log', 'len_text', 'num_digits', 'num_tokens']
X_num = train[dense_cols].fillna(0).values
X_num_test = test[dense_cols].fillna(0).values

# Request CSR explicitly: hstack returns COO by default, which does not support
# the row indexing needed to slice cross-validation folds.
X_train_full = hstack([X_text, X_num], format='csr')
X_test_full  = hstack([X_text_test, X_num_test], format='csr')
# The regression target is log1p(price).
y = np.log1p(train['price'].values.astype(float))


Fitting TF-IDF...


In [20]:
# === 9. Train LightGBM with CV (LightGBM >= 4.0 fix)
# Early stopping and periodic logging are configured through the
# early_stopping / log_evaluation callbacks (imported in the imports cell),
# which is the mechanism LightGBM 4.x expects instead of train() keyword arguments.

# Convert to CSR for slicing
X_train_full = csr_matrix(X_train_full)
X_test_full  = csr_matrix(X_test_full)

params = {
    'objective': 'regression',
    'metric': 'rmse',          # RMSE on the log1p(price) target
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,   # fraction of columns sampled per tree
    'bagging_fraction': 0.8,   # fraction of rows sampled ...
    'bagging_freq': 5,         # ... re-drawn every 5 iterations
    'verbosity': -1,
    'seed': 42
}

NFOLD = 5
kf = KFold(n_splits=NFOLD, shuffle=True, random_state=42)
# oof:   out-of-fold prediction for every training row (filled fold by fold).
# preds: test prediction, accumulated as the mean over the NFOLD fold models.
# Both are in the same log1p space as the target `y`.
oof, preds = np.zeros(len(train)), np.zeros(len(test))

for fold, (tr_idx, val_idx) in enumerate(kf.split(train)):
    print(f"\nFold {fold+1}/{NFOLD}")

    X_train_fold = X_train_full[tr_idx]
    y_train_fold = y[tr_idx]
    X_val_fold   = X_train_full[val_idx]
    y_val_fold   = y[val_idx]

    dtrain = lgb.Dataset(X_train_fold, label=y_train_fold, free_raw_data=False)
    dvalid = lgb.Dataset(X_val_fold, label=y_val_fold, free_raw_data=False)

    # Up to 5000 boosting rounds; training stops once the validation RMSE has
    # not improved for 100 rounds, and the metric is logged every 100 rounds.
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dvalid],
        callbacks=[
            early_stopping(stopping_rounds=100),
            log_evaluation(100)
        ]
    )

    # Predict with the best iteration found by early stopping, not the last one.
    oof[val_idx] = model.predict(X_val_fold, num_iteration=model.best_iteration)
    preds += model.predict(X_test_full, num_iteration=model.best_iteration) / NFOLD

    # Release the per-fold copies before the next fold is built.
    del dtrain, dvalid, X_train_fold, y_train_fold, X_val_fold, y_val_fold
    gc.collect()



Fold 1/5
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.745087
[200]	valid_0's rmse: 0.721113
[300]	valid_0's rmse: 0.711327
[400]	valid_0's rmse: 0.705497
[500]	valid_0's rmse: 0.702161
[600]	valid_0's rmse: 0.699712
[700]	valid_0's rmse: 0.698343
[800]	valid_0's rmse: 0.697315
[900]	valid_0's rmse: 0.69618
[1000]	valid_0's rmse: 0.695548
[1100]	valid_0's rmse: 0.695102
[1200]	valid_0's rmse: 0.694849
[1300]	valid_0's rmse: 0.694099
[1400]	valid_0's rmse: 0.693991
Early stopping, best iteration is:
[1376]	valid_0's rmse: 0.693902

Fold 2/5
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.72825
[200]	valid_0's rmse: 0.704023
[300]	valid_0's rmse: 0.693692
[400]	valid_0's rmse: 0.687824
[500]	valid_0's rmse: 0.683996
[600]	valid_0's rmse: 0.680656
[700]	valid_0's rmse: 0.679046
[800]	valid_0's rmse: 0.677519
[900]	valid_0's rmse: 0.67631
[1000]	valid_0's rmse: 0.675634
[1100]	valid_0's rmse: 0.675038
[1200]	val

In [21]:
# === 10. Evaluate & Save submission
# `oof` and `preds` are in log1p(price) space; expm1 maps them back to prices.
# The same lower bound is applied to the out-of-fold and the test predictions so
# that the reported OOF SMAPE is measured on prices post-processed exactly like
# the ones written to the submission file.
MIN_PRICE = 0.01
oof_price = np.maximum(np.expm1(oof), MIN_PRICE)
pred_price = np.maximum(np.expm1(preds), MIN_PRICE)

smape_score = smape(train['price'], oof_price)
print(f"OOF SMAPE: {smape_score:.4f}%")

# One row per test sample: its id and the predicted price.
submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_price})
submission.to_csv(OUTPUT_CSV, index=False)
print(f"Saved predictions to {OUTPUT_CSV}")
submission.head()


OOF SMAPE: 52.0602%
Saved predictions to ./test_out.csv


Unnamed: 0,sample_id,price
0,100179,14.278284
1,245611,18.33133
2,146263,14.796879
3,95658,15.010193
4,36806,45.389197
