In [1]:
import os
import pandas as pd
import seaborn as sns

In [2]:
os.chdir(r"C:\Users\dhruv\AI for future course\data")
#load the data
df=pd.read_csv(r'C:\Users\dhruv\ML_dataset\student_resource\dataset\test.csv')

In [3]:
df.dtypes

sample_id            int64
catalog_content     object
image_link          object
price              float64
Item Name           object
Value              float64
Unit                object
Bullet Point 1      object
Bullet Point 2      object
Bullet Point 3      object
Bullet Point 4      object
Bullet Point 5      object
Value_numeric      float64
combined_text       object
dtype: object

In [6]:
# =====================================================
# SMART PRODUCT PRICING PREDICTION
# Goal: SMAPE < 40
# =====================================================

import pandas as pd
import numpy as np
import psutil
import gc
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

# =====================================================
# Step 1: Load Data
# =====================================================
train = pd.read_csv(r'C:\Users\dhruv\ML_dataset\student_resource\dataset\train.csv')
test = pd.read_csv(r'C:\Users\dhruv\ML_dataset\student_resource\dataset\test.csv')

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# =====================================================
# Step 2: Preprocess Text
# =====================================================
train["catalog_content"] = train["catalog_content"].fillna("")
test["catalog_content"] = test["catalog_content"].fillna("")

# Combine text for unified vectorization
all_text = pd.concat([train["catalog_content"], test["catalog_content"]])

# =====================================================
# Step 3: Dynamic TF-IDF based on RAM
# =====================================================
available_memory_gb = psutil.virtual_memory().available / (1024 ** 3)
if available_memory_gb > 8:
    max_features = 80000
    n_components = 250
elif available_memory_gb > 4:
    max_features = 40000
    n_components = 150
else:
    max_features = 20000
    n_components = 80

print(f"TF-IDF: max_features={max_features}, SVD={n_components}")

tfidf = TfidfVectorizer(
    max_features=max_features,
    stop_words="english",
    sublinear_tf=True
)
tfidf_all = tfidf.fit_transform(all_text)
print("TF-IDF shape:", tfidf_all.shape)

# =====================================================
# Step 4: Dimensionality Reduction (Memory Safe)
# =====================================================
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd_all = svd.fit_transform(tfidf_all)
print("SVD shape:", svd_all.shape)

# Split back
train_svd = svd_all[:len(train)]
test_svd = svd_all[len(train):]

# Clean up to free memory
del tfidf_all, svd_all, all_text
gc.collect()

# =====================================================
# Step 5: Train/Test Split
# =====================================================
X_train, X_val, y_train, y_val = train_test_split(
    train_svd, train["price"], test_size=0.2, random_state=42
)

# =====================================================
# Step 6: Define SMAPE Metric
# =====================================================
def smape(y_true, y_pred):
    return np.mean(
        np.abs(y_true - y_pred) / ((np.abs(y_true) + np.abs(y_pred)) / 2)
    ) * 100

# =====================================================
# Step 7: Train LightGBM Model
# =====================================================
print("\nTraining LightGBM model...")
model = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=128,
    colsample_bytree=0.8,
    subsample=0.8,
    random_state=42,
    n_jobs=-1
)
from lightgbm import early_stopping, log_evaluation

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    callbacks=[early_stopping(100), log_evaluation(200)]
)

# =====================================================
# Step 8: Evaluate on Validation
# =====================================================
y_pred_val = model.predict(X_val)
val_smape = smape(y_val, y_pred_val)
print(f"\nðŸ”¹ Validation SMAPE: {val_smape:.2f}%")

# =====================================================
# Step 9: Predict on Test and Export
# =====================================================
test_preds = model.predict(test_svd)
test_preds = np.maximum(0, test_preds)  # Ensure positive prices

output = pd.DataFrame({
    "sample_id": test["sample_id"],
    "price": test_preds
})

output.to_csv("test_out.csv", index=False)
print("\nâœ… Predictions saved to test_out.csv")
print(f"âœ… Final Validation SMAPE = {val_smape:.2f}%")

Train shape: (75000, 4)
Test shape: (75000, 14)
TF-IDF: max_features=20000, SVD=80
TF-IDF shape: (150000, 20000)
SVD shape: (150000, 80)

Training LightGBM model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20400
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 80
[LightGBM] [Info] Start training from score 23.598634
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 14.6617	valid_0's l2: 1179.41
[400]	valid_0's l1: 14.4525	valid_0's l2: 1168.3
[600]	valid_0's l1: 14.4109	valid_0's l2: 1166.28
[800]	valid_0's l1: 14.3482	valid_0's l2: 1164.34
Early stopping, best iteration is:
[759]	valid_0's l1: 14.3576	valid_0's l2: 1164.05

ðŸ”¹ Validation SMAPE: 64.15%

âœ… Predictions saved to test_out.csv
âœ… Final Validation SMAPE = 64.15%


In [8]:
!pip install catboost --quiet

In [10]:
# =====================================================
# Smart Product Pricing Model - Optimized Ensemble
# Target SMAPE < 40
# =====================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# =====================================================
# Step 1: Load Data
# =====================================================
train = pd.read_csv(r'C:\Users\dhruv\ML_dataset\student_resource\dataset\train.csv')
test = pd.read_csv(r'C:\Users\dhruv\ML_dataset\student_resource\dataset\test.csv')

# Replace with your actual target column
TARGET = "price" if "price" in train.columns else train.columns[-1]

# =====================================================
# Step 2: Basic Cleaning
# =====================================================
train = train.dropna(subset=[TARGET])
train = train.fillna(0)
test = test.fillna(0)

# =====================================================
# Step 3: Feature Engineering
# =====================================================
# Convert categorical variables to numeric
for col in train.columns:
    if train[col].dtype == "object":
        le = LabelEncoder()
        le.fit(list(train[col].astype(str)) + list(test[col].astype(str)))
        train[col] = le.transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))

# Add polynomial features
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != TARGET]
for col in numeric_cols:
    train[col + "_sq"] = train[col] ** 2
    test[col + "_sq"] = test[col] ** 2
    train[col + "_log"] = np.log1p(train[col])
    test[col + "_log"] = np.log1p(test[col])

# Scale features
scaler = StandardScaler()
train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])

# =====================================================
# Step 4: Split Data
# =====================================================
X = train.drop(columns=[TARGET])
y = train[TARGET]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# =====================================================
# Step 5: Define Models
# =====================================================
lgb = LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.03,
    max_depth=10,
    num_leaves=60,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

cat = CatBoostRegressor(
    iterations=1200,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=5,
    loss_function='MAE',
    verbose=False,
    random_seed=42
)

xgb = XGBRegressor(
    n_estimators=1200,
    learning_rate=0.04,
    max_depth=9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

# =====================================================
# Step 6: Stacking Ensemble
# =====================================================
ensemble = StackingRegressor(
    estimators=[('lgb', lgb), ('cat', cat), ('xgb', xgb)],
    final_estimator=Ridge(alpha=1.0)
)

# =====================================================
# Step 7: Train Model
# =====================================================
ensemble.fit(X_train, y_train)

# =====================================================
# Step 8: Evaluate Model
# =====================================================
y_pred_val = ensemble.predict(X_val)

def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-9))

smape_val = smape(y_val, y_pred_val)
mae_val = mean_absolute_error(y_val, y_pred_val)

print("\n===== Validation Results =====")
print(f"SMAPE: {smape_val:.2f}")
print(f"MAE: {mae_val:.2f}")

# =====================================================
# Step 9: Predict on Test
# =====================================================
test_pred = ensemble.predict(test)
submission = pd.DataFrame({"Predicted_Price": test_pred})
submission.to_csv("final_submission.csv", index=False)

print("\nPredictions saved to final_submission.csv âœ…")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 9
[LightGBM] [Info] Start training from score 23.598634
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 9
[LightGBM] [Info] Start training from score 23.558059
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 9
[LightGBM] [Info] Start trai

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: Item Name: object, Unit: object, Bullet Point 1: object, Bullet Point 2: object, Bullet Point 3: object, Bullet Point 4: object, Bullet Point 5: object, combined_text: object

In [12]:
!pip install transformers timm torch torchvision lightgbm numpy pandas scikit-learn pillow tqdm

Defaulting to user installation because normal site-packages is not writeable


In [None]:
!pip install ipywidgets "huggingface_hub[hf_xet]" --quiet
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
import os
import pandas as pd 
import numpy as np