In [None]:
import zipfile, os

zip_path = "/content/68e8d1d70b66d_student_resource.zip"
extract_dir = "/content/dataset"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

os.listdir(extract_dir)


['student_resource', '__MACOSX']

In [None]:
import os

os.listdir("/content/dataset/student_resource")


['dataset',
 'Documentation_template.md',
 '.DS_Store',
 'sample_code.py',
 'README.md',
 'src']

In [None]:
!pip install lightgbm

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from scipy.sparse import hstack

train = pd.read_csv("/content/dataset/student_resource/dataset/train.csv")
test = pd.read_csv("/content/dataset/student_resource/dataset/test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)

def extract_ipq(text):
    if pd.isna(text):
        return 1
    text = text.lower()
    m = re.search(r'(\d+)\s*(?:ct|count|pack|pk|pcs|pieces|x)\b', text)
    if m:
        return int(m.group(1))
    m2 = re.search(r'pack\s*of\s*(\d+)', text)
    if m2:
        return int(m2.group(1))
    return 1

train["ipq"] = train["catalog_content"].apply(extract_ipq)
test["ipq"] = test["catalog_content"].apply(extract_ipq)

tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2))
X_train_text = tfidf.fit_transform(train["catalog_content"].fillna(""))
X_test_text = tfidf.transform(test["catalog_content"].fillna(""))

X_train = hstack([X_train_text, np.log1p(train["ipq"].values).reshape(-1,1)])
X_test = hstack([X_test_text, np.log1p(test["ipq"].values).reshape(-1,1)])
y = np.log1p(train["price"].values)

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.1, random_state=42)

train_set = lgb.Dataset(X_tr, label=y_tr)
val_set = lgb.Dataset(X_val, label=y_val)
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 40,
    "verbosity": -1
}
model = lgb.train(
    params=params,
    train_set=train_set,
    valid_sets=[val_set],
    num_boost_round=2000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(100)
    ]
)

preds = model.predict(X_test, num_iteration=model.best_iteration)
preds = np.expm1(preds)
preds = np.maximum(preds, 0.01)  # ensure positive

out = pd.DataFrame({
    "sample_id": test["sample_id"],
    "price": preds
})
out.to_csv("test_out.csv", index=False)
print("✅ test_out.csv created successfully!")


Train shape: (75000, 4)
Test shape: (75000, 3)
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.73376
[200]	valid_0's rmse: 0.710238
[300]	valid_0's rmse: 0.699909
[400]	valid_0's rmse: 0.693544
[500]	valid_0's rmse: 0.689884
[600]	valid_0's rmse: 0.686767
[700]	valid_0's rmse: 0.683935
[800]	valid_0's rmse: 0.68254
[900]	valid_0's rmse: 0.681326
[1000]	valid_0's rmse: 0.680284
[1100]	valid_0's rmse: 0.679752
Early stopping, best iteration is:
[1054]	valid_0's rmse: 0.679744




✅ test_out.csv created successfully!


In [6]:
from google.colab import files
files.download("test_out.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>