<a href="https://colab.research.google.com/github/fachiny17/kaggle/blob/main/dsn_bootcamp_qualification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump

In [3]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read CSV (update path to where your file is stored in Drive)
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

In [None]:
train_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [None]:
test_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [None]:
# file: fast_pipeline.py
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, KFold
from joblib import dump
from scipy.stats import uniform, randint
import warnings
import re

def create_preprocessing_pipeline():
    numerical_feature = ['model_year', 'engine_displacement', 'milage']
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Low-cardinality categorical features → OneHot
    low_card_cat = ['brand', 'model', 'ext_col', 'int_col',
                    'fuel_type', 'transmission', 'accident', 'clean_title']
    low_card_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # # High-cardinality categorical features → Ordinal
    # high_card_cat = ['brand', 'model', 'ext_col', 'int_col']
    # high_card_transformer = Pipeline(steps=[
    #     ('imputer', SimpleImputer(strategy='most_frequent')),
    #     ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    # ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_feature),
            ('low_cat', low_card_transformer, low_card_cat)
            #  ('high_cat', high_card_transformer, high_card_cat)
        ]
    )
    return preprocessor


In [None]:
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    data = df.copy()

    # Clean mileage
    data['milage'] = (
        data['milage'].astype(str).str.replace(',', '', regex=False).astype(float)
    )

    # Vectorized extraction of engine displacement
    data['engine_displacement'] = (
        data['engine'].astype(str).str.extract(r'(\d+\.?\d*)\s*L')[0].astype(float)
    )

    return data




In [None]:
def evaluate_regression(y_true, y_pred):
  """Calculates and returns regression metrics"""
  metrics = {
      'MAE': mean_absolute_error(y_true, y_pred),
      'MSE': mean_squared_error(y_true, y_pred),
      'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
      "R2": r2_score(y_true, y_pred)
  }
  print(f"Metrics: {metrics}")

  return metrics

In [None]:
def main():
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Feature engineering
    train_processed = extract_features(train_df)
    test_processed = extract_features(test_df)

    # Target variable
    y_train = (
        train_processed['price']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    # Drop target
    X_train = train_processed.drop(columns=['price'])

    # Split training data for validation (80% train, 20% validation)
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Build pipeline
    preprocessor = create_preprocessing_pipeline()
    model = HistGradientBoostingRegressor(
        random_state=42,
        max_iter=100,
        learning_rate=0.1,
        max_depth=10,
        min_samples_leaf=20,
        l2_regularization=0.1
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # # Cross-validation for RMSE evaluation
    # kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    # cv_scores = cross_val_score(
    #     pipeline,
    #     X_train,
    #     y_train,
    #     cv=kfold,
    #     scoring='neg_root_mean_squared_error',  # Use RMSE directly
    #     n_jobs=-1
    # )
    #
    # # Convert negative RMSE to positive (scikit-learn returns negative for scoring)
    # rmse_scores = -cv_scores
    # mean_rmse = rmse_scores.mean()
    # std_rmse = rmse_scores.std()
    #
    # print(f"Cross-Validation RMSE Scores: {rmse_scores}")
    # print(f"Mean RMSE: {mean_rmse:.2f} ± {std_rmse:.2f}")

    # Train baseline model ------------------------------------------------------
    print("Training baseline model...")
    baseline_model = pipeline.fit(X_train_split, y_train_split)
    baseline_preds = baseline_model.predict(X_val)
    baseline_metrics = evaluate_regression(y_val, baseline_preds)

    # Parameter distributions for RandomizedSearchCV -------------------
    param_distributions = {
        'model__max_iter': randint(50, 1000),
        'model__learning_rate': uniform(0.01, 0.3),  # 0.01 to 0.31
        'model__max_depth': randint(3, 20),
        'model__min_samples_leaf': randint(5, 50),
        'model__l2_regularization': uniform(0.001, 1.0),  # 0.001 to 1.001
        'model__max_bins': [128, 255],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3]
    }


    print("\nStarting RandomizedSearchCV...")
    rs_model = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=30,  # Try 30 random combinations (adjust as needed)
        scoring='neg_root_mean_squared_error',
        cv=3,  # Fewer folds for faster tuning
        verbose=2,
        n_jobs=-1,
        random_state=42
    )

    rs_model.fit(X_train_split, y_train_split)

    # Best model -------------------------------------------------------
    print(f"\nBest parameters: {rs_model.best_params_}")
    best_model = rs_model.best_estimator_
    rs_preds = best_model.predict(X_val)
    rs_metrics = evaluate_regression(y_val, rs_preds)

    # Compare metrics ------------------------------------------------------
    compare_metrics = pd.DataFrame({
        'Baseline': baseline_metrics,
        'RandomizedSearchCV': rs_metrics
    }).T
    print("\nModel Comparison:")
    print(compare_metrics)

    # Final Training: Train best model on FULL training data
    print("\nTraining final model on full training data...")
    final_model = rs_model.best_estimator_.fit(X_train, y_train)

    # Predict on ACTUAL competition test set (no price column); make sure test_proceessed doesn't have price column
    competition_test = test_processed.drop(columns=['price'], errors='ignore')
    test_predictions = final_model.predict(competition_test)

    # Save submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv('histgb_randomisedsearchcv.csv', index=False)
    print("Submission file saved")

if __name__ == "__main__":
    main()

Training baseline model...
  MAE: 19835.6606
  RMSE: 68243.7918
  R2: 0.1625

Starting RandomizedSearchCV...
Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [None]:
# LightGBM Regressor with preprocessing + RandomizedSearchCV
from google.colab import drive
drive.mount('/content/drive')

# Imports
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import randint, uniform
from lightgbm import LGBMRegressor

# ---------------- Preprocessing -----------------
def create_preprocessing_pipeline():
    numerical_features = ['model_year', 'engine_displacement', 'milage']
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean'))
    ])

    categorical_features = ['brand', 'model', 'ext_col', 'int_col',
                            'fuel_type', 'transmission', 'accident', 'clean_title']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    return ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

def extract_features(df):
    data = df.copy()
    data['milage'] = data['milage'].astype(str).str.replace(',', '', regex=False).astype(float)
    data['engine_displacement'] = data['engine'].astype(str).str.extract(r'(\d+\.?\d*)\s*L')[0].astype(float)
    return data

def evaluate_regression(y_true, y_pred):
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score(y_true, y_pred)
    }

# ---------------- Main -----------------
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

train_processed = extract_features(train_df)
test_processed = extract_features(test_df)

y_train = train_processed['price'].astype(str).str.replace('[$,]', '', regex=True).astype(float)
X_train = train_processed.drop(columns=['price'])

X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', create_preprocessing_pipeline()),
    ('model', LGBMRegressor(random_state=42))
])

# Baseline
print("Training LightGBM baseline...")
baseline_model = pipeline.fit(X_train_split, y_train_split)
baseline_preds = baseline_model.predict(X_val)
print("Baseline metrics:", evaluate_regression(y_val, baseline_preds))

# Param space
param_dist = {
    'model__n_estimators': randint(100, 1000),
    'model__learning_rate': uniform(0.01, 0.3),
    'model__max_depth': randint(3, 20),
    'model__num_leaves': randint(20, 200),
    'model__min_child_samples': randint(5, 100)
}

# RandomizedSearch
print("\nTuning LightGBM with RandomizedSearchCV...")
rs = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=30,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)
rs.fit(X_train_split, y_train_split)

print("Best params:", rs.best_params_)

# Evaluate best model
val_preds = rs.predict(X_val)
print("Validation metrics:", evaluate_regression(y_val, val_preds))

# Final model on full data
final_model = rs.best_estimator_.fit(X_train, y_train)
competition_test = test_processed.drop(columns=['price'], errors='ignore')
test_preds = final_model.predict(competition_test)

# Save submission
submission = pd.DataFrame({'id': test_df['id'], 'price': test_preds})
submission.to_csv('lightgbm_submission.csv', index=False)
print("LightGBM submission saved!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training LightGBM baseline...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.683314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4012
[LightGBM] [Info] Number of data points in the train set: 169679, number of used features: 1833
[LightGBM] [Info] Start training from score 43888.560718




Baseline metrics: {'MAE': 19695.47160267817, 'RMSE': np.float64(69699.18035558485), 'R2': 0.15165160352832463}

Tuning LightGBM with RandomizedSearchCV...
Fitting 3 folds for each of 30 candidates, totalling 90 fits


KeyboardInterrupt: 

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

def extract_features(df):
    """Simple feature engineering"""
    data = df.copy()

    # Clean milage (correct spelling)
    if 'milage' in data.columns:
        data['milage'] = data['milage'].astype(str).str.replace(',', '').astype(float)

    # Extract engine displacement if engine column exists
    if 'engine' in data.columns:
        # Try to extract numeric value from engine description
        data['engine_displacement'] = (
            data['engine']
            .astype(str)
            .str.extract(r'(\d+\.?\d*)')[0]  # Extract first numeric value
            .astype(float)
        )

    return data

def main():
    # Load data
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Feature engineering
    train_processed = extract_features(train_df)
    test_processed = extract_features(test_df)

    # Target variable
    y_train = (
        train_processed['price']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    # Features - drop price and keep important columns
    feature_cols = ['model_year', 'milage', 'brand', 'model', 'fuel_type', 'transmission']
    X_train = train_processed[feature_cols]
    X_test = test_processed[feature_cols]

    # Encode categorical variables for LightGBM
    categorical_cols = ['brand', 'model', 'fuel_type', 'transmission']
    label_encoders = {}

    for col in categorical_cols:
        if col in X_train.columns:
            le = LabelEncoder()
            # Combine train and test for consistent encoding
            combined = pd.concat([X_train[col], X_test[col]], axis=0)
            le.fit(combined.astype(str))
            X_train[col] = le.transform(X_train[col].astype(str))
            X_test[col] = le.transform(X_test[col].astype(str))
            label_encoders[col] = le

    # Handle missing values
    X_train = X_train.fillna(X_train.median())
    X_test = X_test.fillna(X_test.median())

    # Train-validation split
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # LightGBM model with early stopping in constructor
    print("Training LightGBM model...")
    model = lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
        early_stopping_rounds=50  # CORRECTED: moved to constructor
    )

    # Train with early stopping - CORRECTED syntax
    model.fit(
        X_train_split,
        y_train_split,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse'
    )

    # Evaluate
    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    r2 = r2_score(y_val, val_preds)

    print(f"Validation RMSE: {rmse:.2f}")
    print(f"Validation MAE: {mae:.2f}")
    print(f"Validation R²: {r2:.4f}")

    # Final training on full data
    print("\nTraining final model on full data...")
    final_model = lgb.LGBMRegressor(
        n_estimators=model.best_iteration_ if hasattr(model, 'best_iteration_') else 1000,
        learning_rate=0.05,
        max_depth=7,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    final_model.fit(X_train, y_train)

    # Predict on test set
    test_predictions = final_model.predict(X_test)

    # Save submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv('lightgbm_submission.csv', index=False)
    print("✅ LightGBM submission file saved!")

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training LightGBM model...
Validation RMSE: 68229.69
Validation MAE: 20052.95
Validation R²: 0.1629

Training final model on full data...
✅ LightGBM submission file saved!


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform
import warnings
import re
warnings.filterwarnings('ignore')

def enhanced_extract_features(df):
    """Advanced feature engineering"""
    data = df.copy()

    # Clean milage
    if 'milage' in data.columns:
        data['milage'] = data['milage'].astype(str).str.replace(',', '').astype(float)

    # Extract engine features if available
    if 'engine' in data.columns:
        # Extract displacement
        displacement = data['engine'].astype(str).str.extract(r'(\d+\.?\d*)\s*L')[0]
        data['engine_displacement'] = pd.to_numeric(displacement, errors='coerce')

        # Extract cylinders if mentioned
        cylinders = data['engine'].str.extract(r'(\d+)\s*cyl', flags=re.IGNORECASE)[0]
        data['cylinders'] = pd.to_numeric(cylinders, errors='coerce')

        # Engine type features
        data['is_v6'] = data['engine'].str.contains('v6', case=False).astype(int)
        data['is_v8'] = data['engine'].str.contains('v8', case=False).astype(int)

    # Extract year from model year if it's a string
    if 'model_year' in data.columns:
        if data['model_year'].dtype == 'object':
            data['model_year'] = data['model_year'].str.extract(r'(\d{4})')[0].astype(float)
        data['model_year'] = pd.to_numeric(data['model_year'], errors='coerce')
        data['car_age'] = 2024 - data['model_year']  # Assuming current year is 2024

    # Create interaction features
    if all(col in data.columns for col in ['milage', 'car_age']):
        data['miles_per_year'] = data['milage'] / np.maximum(1, data['car_age'])

    # Price-based features (for training data only)
    if 'price' in data.columns:
        data['price'] = data['price'].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)

    return data

def create_additional_features(X, y=None):
    """Create additional engineered features"""
    X = X.copy()

    # Brand-model combination
    if all(col in X.columns for col in ['brand', 'model']):
        X['brand_model'] = X['brand'].astype(str) + '_' + X['model'].astype(str)

    # Fuel-transmission combination
    if all(col in X.columns for col in ['fuel_type', 'transmission']):
        X['fuel_transmission'] = X['fuel_type'].astype(str) + '_' + X['transmission'].astype(str)

    return X

def main():
    # Load data
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Enhanced feature engineering
    print("Performing advanced feature engineering...")
    train_processed = enhanced_extract_features(train_df)
    test_processed = enhanced_extract_features(test_df)

    # Target variable
    y_train = (
        train_processed['price']
        .astype(str)
        .str.replace(r'[^\d.]', '', regex=True)
        .astype(float)
    )

    # Drop target and ID columns
    X_train = train_processed.drop(columns=['price'], errors='ignore')
    X_test = test_processed.drop(columns=['price'], errors='ignore')

    # Create additional features
    X_train = create_additional_features(X_train, y_train)
    X_test = create_additional_features(X_test)

    # Select the most important features
    feature_cols = [
        'model_year', 'milage', 'car_age', 'miles_per_year', 'engine_displacement',
        'cylinders', 'is_v6', 'is_v8', 'brand', 'model', 'fuel_type', 'transmission',
        'accident', 'clean_title', 'brand_model', 'fuel_transmission'
    ]
    # Keep only columns that exist
    feature_cols = [col for col in feature_cols if col in X_train.columns]

    X_train = X_train[feature_cols]
    X_test = X_test[feature_cols]

    # Encode categorical variables
    categorical_cols = ['brand', 'model', 'fuel_type', 'transmission', 'brand_model', 'fuel_transmission']
    categorical_cols = [col for col in categorical_cols if col in X_train.columns]

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        # Combine train and test for consistent encoding
        combined = pd.concat([X_train[col].astype(str), X_test[col].astype(str)])
        le.fit(combined)
        X_train[col] = le.transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
        label_encoders[col] = le

    # Handle missing values
    for df in [X_train, X_test]:
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64']:
                df[col].fillna(df[col].median(), inplace=True)
            else:
                df[col].fillna(0, inplace=True)

    # Train-validation split
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.15, random_state=42  # Smaller validation set
    )

    # Hyperparameter tuning with RandomizedSearchCV
    print("Performing hyperparameter tuning...")
    param_distributions = {
        'n_estimators': randint(300, 1200),
        'learning_rate': uniform(0.01, 0.2),
        'max_depth': randint(5, 15),
        'num_leaves': randint(20, 100),
        'subsample': uniform(0.6, 0.4),  # 0.6 to 1.0
        'colsample_bytree': uniform(0.6, 0.4),
        'reg_alpha': uniform(0, 1),
        'reg_lambda': uniform(0, 1),
        'min_child_samples': randint(5, 50)
    }

    # Base model
    base_model = lgb.LGBMRegressor(
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

    # Randomized search with few iterations for speed
    rs_model = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_distributions,
        n_iter=15,  # More iterations for better results
        scoring='neg_root_mean_squared_error',
        cv=3,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    rs_model.fit(X_train_split, y_train_split)

    print(f"Best parameters: {rs_model.best_params_}")
    print(f"Best CV score: {-rs_model.best_score_:.2f}")

    # Train final model with best parameters
    print("\nTraining final optimized model...")
    best_params = rs_model.best_params_

    final_model = lgb.LGBMRegressor(
        **best_params,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

    # Train on full training split
    final_model.fit(X_train_split, y_train_split)

    # Evaluate on validation set
    val_preds = final_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    r2 = r2_score(y_val, val_preds)

    print(f"✅ Validation RMSE: {rmse:.2f}")
    print(f"✅ Validation MAE: {mae:.2f}")
    print(f"✅ Validation R²: {r2:.4f}")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': final_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n📊 Top 10 Feature Importance:")
    print(feature_importance.head(10))

    # Final training on FULL dataset with best parameters
    print("\n🎯 Training on full dataset for submission...")
    final_submission_model = lgb.LGBMRegressor(
        **best_params,
        random_state=42,
        n_jobs=-1,
        verbose=0
    )

    final_submission_model.fit(X_train, y_train)

    # Predict on test set
    test_predictions = final_submission_model.predict(X_test)

    # Apply post-processing (ensure no negative prices)
    test_predictions = np.maximum(test_predictions, 0)

    # Save submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv('lightgbm_optimized_submission.csv', index=False)
    print("✅ Optimized LightGBM submission file saved!")

    # Show prediction statistics
    print(f"\n📈 Prediction Statistics:")
    print(f"Min price: ${test_predictions.min():.2f}")
    print(f"Max price: ${test_predictions.max():.2f}")
    print(f"Mean price: ${test_predictions.mean():.2f}")
    print(f"Median price: ${np.median(test_predictions):.2f}")

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Performing advanced feature engineering...
Performing hyperparameter tuning...
Fitting 3 folds for each of 15 candidates, totalling 45 fits


ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/sklearn.py", line 1398, in fit
    super().fit(
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/sklearn.py", line 1049, in fit
    self._Booster = train(
                    ^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/basic.py", line 3656, in __init__
    train_set.construct()
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/basic.py", line 2590, in construct
    self._lazy_init(
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/basic.py", line 2123, in _lazy_init
    data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(
                                                                       ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/basic.py", line 868, in _data_from_pandas
    _pandas_to_numpy(data, target_dtype=target_dtype),
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/basic.py", line 814, in _pandas_to_numpy
    _check_for_bad_pandas_dtypes(data.dtypes)
  File "/usr/local/lib/python3.12/dist-packages/lightgbm/basic.py", line 805, in _check_for_bad_pandas_dtypes
    raise ValueError(
ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: accident: object, clean_title: object


In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform
import warnings
warnings.filterwarnings('ignore')

def enhanced_extract_features(df):
    """Advanced feature engineering with robust handling"""
    data = df.copy()

    # Clean milage
    if 'milage' in data.columns:
        data['milage'] = data['milage'].astype(str).str.replace(',', '').astype(float)

    # Extract engine features if available
    if 'engine' in data.columns:
        # Extract displacement
        displacement = data['engine'].astype(str).str.extract(r'(\d+\.?\d*)\s*L')[0]
        data['engine_displacement'] = pd.to_numeric(displacement, errors='coerce')

    # Extract year from model year
    if 'model_year' in data.columns:
        if data['model_year'].dtype == 'object':
            data['model_year'] = data['model_year'].str.extract(r'(\d{4})')[0].astype(float)
        data['model_year'] = pd.to_numeric(data['model_year'], errors='coerce')
        data['car_age'] = 2024 - data['model_year']

    # Create interaction features
    if all(col in data.columns for col in ['milage', 'car_age']):
        data['miles_per_year'] = data['milage'] / np.maximum(1, data['car_age'])

    # Handle boolean/yes-no columns properly
    bool_columns = ['accident', 'clean_title']
    for col in bool_columns:
        if col in data.columns:
            # Convert to numeric (1 for yes/true, 0 for no/false)
            data[col] = data[col].astype(str).str.lower().map({'yes': 1, 'true': 1, '1': 1, 'no': 0, 'false': 0, '0': 0}).fillna(0).astype(int)

    # Price cleaning
    if 'price' in data.columns:
        data['price'] = data['price'].astype(str).str.replace(r'[^\d.]', '', regex=True).astype(float)

    return data

def create_additional_features(X):
    """Create additional engineered features"""
    X = X.copy()

    # Brand-model combination
    if all(col in X.columns for col in ['brand', 'model']):
        X['brand_model'] = X['brand'].astype(str) + '_' + X['model'].astype(str)

    return X

def prepare_features(X_train, X_test):
    """Prepare features for LightGBM with proper encoding"""
    # Select numeric features
    numeric_features = ['model_year', 'milage', 'car_age', 'miles_per_year', 'engine_displacement']
    numeric_features = [col for col in numeric_features if col in X_train.columns]

    # Select categorical features (excluding boolean ones that are already numeric)
    categorical_features = ['brand', 'model', 'fuel_type', 'transmission', 'brand_model']
    categorical_features = [col for col in categorical_features if col in X_train.columns]

    # Select boolean features (already converted to 0/1)
    boolean_features = ['accident', 'clean_title']
    boolean_features = [col for col in boolean_features if col in X_train.columns]

    # Combine all features
    all_features = numeric_features + categorical_features + boolean_features

    # Prepare train and test data
    X_train_prepared = X_train[all_features].copy()
    X_test_prepared = X_test[all_features].copy()

    # Encode categorical variables
    label_encoders = {}
    for col in categorical_features:
        le = LabelEncoder()
        # Combine train and test for consistent encoding
        combined = pd.concat([X_train_prepared[col].astype(str), X_test_prepared[col].astype(str)])
        le.fit(combined)
        X_train_prepared[col] = le.transform(X_train_prepared[col].astype(str))
        X_test_prepared[col] = le.transform(X_test_prepared[col].astype(str))
        label_encoders[col] = le

    # Handle missing values
    for df in [X_train_prepared, X_test_prepared]:
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64']:
                df[col].fillna(df[col].median(), inplace=True)
            else:
                df[col].fillna(0, inplace=True)

    return X_train_prepared, X_test_prepared, label_encoders

def main():
    # Load data
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Enhanced feature engineering
    print("Performing advanced feature engineering...")
    train_processed = enhanced_extract_features(train_df)
    test_processed = enhanced_extract_features(test_df)

    # Target variable
    y_train = train_processed['price'].copy()

    # Drop target and ID columns
    X_train = train_processed.drop(columns=['price'], errors='ignore')
    X_test = test_processed.drop(columns=['price'], errors='ignore')

    # Create additional features
    X_train = create_additional_features(X_train)
    X_test = create_additional_features(X_test)

    # Prepare features for LightGBM
    print("Preparing features for modeling...")
    X_train_prepared, X_test_prepared, label_encoders = prepare_features(X_train, X_test)

    # Train-validation split
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train_prepared, y_train, test_size=0.15, random_state=42
    )

    # SIMPLIFIED Hyperparameter tuning - start with basic model first
    print("Training initial model...")
    base_model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

    # First train a simple model to ensure it works
    base_model.fit(X_train_split, y_train_split)

    # Evaluate initial model
    val_preds = base_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"✅ Initial Validation RMSE: {rmse:.2f}")

    # Now try limited hyperparameter tuning
    print("Performing limited hyperparameter tuning...")
    param_distributions = {
        'n_estimators': [300, 500, 700],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [5, 7, 9],
        'num_leaves': [31, 63],
        'subsample': [0.8, 0.9],
    }

    rs_model = RandomizedSearchCV(
        estimator=lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
        param_distributions=param_distributions,
        n_iter=8,  # Few iterations for stability
        scoring='neg_root_mean_squared_error',
        cv=2,  # Fewer folds
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    rs_model.fit(X_train_split, y_train_split)

    print(f"Best parameters: {rs_model.best_params_}")
    print(f"Best CV score: {-rs_model.best_score_:.2f}")

    # Train final model with best parameters
    print("\nTraining final optimized model...")
    best_params = rs_model.best_params_

    final_model = lgb.LGBMRegressor(
        **best_params,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

    final_model.fit(X_train_split, y_train_split)

    # Evaluate on validation set
    val_preds = final_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    r2 = r2_score(y_val, val_preds)

    print(f"✅ Final Validation RMSE: {rmse:.2f}")
    print(f"✅ Final Validation MAE: {mae:.2f}")
    print(f"✅ Final Validation R²: {r2:.4f}")

    # Final training on FULL dataset
    print("\n🎯 Training on full dataset for submission...")
    final_submission_model = lgb.LGBMRegressor(
        **best_params,
        random_state=42,
        n_jobs=-1,
        verbose=0
    )

    final_submission_model.fit(X_train_prepared, y_train)

    # Predict on test set
    test_predictions = final_submission_model.predict(X_test_prepared)
    test_predictions = np.maximum(test_predictions, 0)  # No negative prices

    # Save submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv('lightgbm_optimized_submission.csv', index=False)
    print("✅ Optimized LightGBM submission file saved!")

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Performing advanced feature engineering...
Preparing features for modeling...
Training initial model...
✅ Initial Validation RMSE: 67808.08
Performing limited hyperparameter tuning...
Fitting 2 folds for each of 8 candidates, totalling 16 fits
Best parameters: {'subsample': 0.8, 'num_leaves': 63, 'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.01}
Best CV score: 74018.26

Training final optimized model...
✅ Final Validation RMSE: 67229.32
✅ Final Validation MAE: 19491.80
✅ Final Validation R²: 0.1653

🎯 Training on full dataset for submission...
✅ Optimized LightGBM submission file saved!


In [4]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform, loguniform
import warnings
warnings.filterwarnings('ignore')
import re

# Set random seeds for reproducibility
np.random.seed(42)

def elite_feature_engineering(df):
    """Elite feature engineering with robust type handling"""
    data = df.copy()

    # === PRICE CLEANING ===
    if 'price' in data.columns:
        data['price'] = (
            data['price']
            .astype(str)
            .str.replace(r'[^\d.]', '', regex=True)
            .astype(float)
        )

    # === MILEAGE ENHANCEMENT ===
    if 'milage' in data.columns:
        data['milage'] = (
            data['milage']
            .astype(str)
            .str.replace(',', '')
            .replace('', np.nan)
            .astype(float)
        )
        # Create mileage features
        data['mileage_log'] = np.log1p(data['milage'].fillna(0))
        data['mileage_bin'] = pd.qcut(data['milage'], q=5, labels=False, duplicates='drop')

    # === ENGINE FEATURE EXTRACTION ===
    if 'engine' in data.columns:
        # Multiple extraction patterns
        patterns = {
            'displacement': r'(\d+\.?\d*)\s*L',
            'cylinders': r'(\d+)\s*cyl',
            'horsepower': r'(\d+)\s*HP'
        }

        for feature, pattern in patterns.items():
            extracted = data['engine'].astype(str).str.extract(pattern, flags=re.IGNORECASE)[0]
            if not extracted.isna().all():
                data[f'engine_{feature}'] = pd.to_numeric(extracted, errors='coerce')

        # Boolean engine features
        data['engine_turbo'] = data['engine'].astype(str).str.contains('turbo', case=False, na=False).astype(int)
        data['engine_v6'] = data['engine'].astype(str).str.contains('v6', case=False, na=False).astype(int)
        data['engine_v8'] = data['engine'].astype(str).str.contains('v8', case=False, na=False).astype(int)

    # === MODEL YEAR ENHANCEMENT ===
    if 'model_year' in data.columns:
        data['model_year'] = pd.to_numeric(data['model_year'], errors='coerce')
        data['car_age'] = 2024 - data['model_year'].fillna(2000)
        data['is_modern'] = (data['model_year'] >= 2015).astype(int)
        data['is_classic'] = (data['model_year'] <= 2000).astype(int)

    # === BOOLEAN COLUMNS ===
    bool_columns = ['accident', 'clean_title']
    for col in bool_columns:
        if col in data.columns:
            data[col] = (
                data[col]
                .astype(str)
                .str.lower()
                .map({'yes': 1, 'true': 1, '1': 1, 'y': 1, 'no': 0, 'false': 0, '0': 0, 'n': 0})
                .fillna(0)
                .astype(int)
            )

    # === INTERACTION FEATURES ===
    if all(col in data.columns for col in ['milage', 'car_age']):
        data['miles_per_year'] = data['milage'] / np.maximum(1, data['car_age'])

    # === BRAND AND MODEL FEATURES ===
    if 'brand' in data.columns:
        data['brand'] = data['brand'].str.upper().str.strip()

    return data

def create_advanced_features(X):
    """Create advanced feature combinations"""
    X = X.copy()

    # Brand-Model combination
    if all(col in X.columns for col in ['brand', 'model']):
        X['brand_model'] = X['brand'].astype(str) + '_' + X['model'].astype(str)

    return X

def prepare_elite_features(X_train, X_test, y_train=None):
    """Elite feature preparation with robust type handling"""
    # Define feature categories - ONLY NUMERIC FEATURES
    numeric_features = [
        'model_year', 'milage', 'mileage_log', 'car_age', 'miles_per_year',
        'engine_displacement', 'engine_cylinders', 'engine_horsepower',
        'engine_turbo', 'engine_v6', 'engine_v8', 'is_modern', 'is_classic',
        'accident', 'clean_title'
    ]

    categorical_features = [
        'brand', 'model', 'fuel_type', 'transmission', 'brand_model'
    ]

    # Select only existing features
    numeric_features = [col for col in numeric_features if col in X_train.columns]
    categorical_features = [col for col in categorical_features if col in X_train.columns]

    all_features = numeric_features + categorical_features

    # Prepare data
    X_train_prepared = X_train[all_features].copy()
    X_test_prepared = X_test[all_features].copy()

    # Ensure all numeric features are actually numeric
    for col in numeric_features:
        if col in X_train_prepared.columns:
            X_train_prepared[col] = pd.to_numeric(X_train_prepared[col], errors='coerce')
            X_test_prepared[col] = pd.to_numeric(X_test_prepared[col], errors='coerce')

    # Smart encoding for categorical features
    label_encoders = {}
    for col in categorical_features:
        le = LabelEncoder()
        combined = pd.concat([X_train_prepared[col].astype(str), X_test_prepared[col].astype(str)])
        le.fit(combined)
        X_train_prepared[col] = le.transform(X_train_prepared[col].astype(str))
        X_test_prepared[col] = le.transform(X_test_prepared[col].astype(str))
        label_encoders[col] = le

    # Handle missing values
    for df in [X_train_prepared, X_test_prepared]:
        for col in df.columns:
            if df[col].dtype in ['int64', 'float64']:
                df[col].fillna(df[col].median(), inplace=True)
            else:
                df[col].fillna(0, inplace=True)

    return X_train_prepared, X_test_prepared, label_encoders

def main():
    print("🚀 Starting ROBUST LightGBM Modeling...")

    # Load data
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Elite feature engineering
    print("🔧 Performing robust feature engineering...")
    train_processed = elite_feature_engineering(train_df)
    test_processed = elite_feature_engineering(test_df)

    # Target variable
    y_train = train_processed['price'].copy()

    # Drop target and ID columns
    X_train = train_processed.drop(columns=['price'], errors='ignore')
    X_test = test_processed.drop(columns=['price'], errors='ignore')

    # Create advanced features
    X_train = create_advanced_features(X_train)
    X_test = create_advanced_features(X_test)

    # Prepare elite features
    print("⚙️ Preparing robust features...")
    X_train_prepared, X_test_prepared, label_encoders = prepare_elite_features(X_train, X_test, y_train)

    # Ensure all data is numeric
    print("✅ Data types in prepared features:")
    print(X_train_prepared.dtypes.value_counts())

    # Train-validation split
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train_prepared, y_train, test_size=0.1, random_state=42, shuffle=True
    )

    # === SIMPLIFIED BUT EFFECTIVE TUNING ===
    print("🎯 Performing efficient hyperparameter tuning...")

    param_distributions = {
        'n_estimators': [500, 800, 1000],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [6, 8, 10],
        'num_leaves': [31, 63, 127],
        'min_child_samples': [20, 50, 100],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
        'reg_alpha': [0.1, 0.5, 1.0],
        'reg_lambda': [0.1, 0.5, 1.0],
    }

    # Use simpler validation
    robust_model = RandomizedSearchCV(
        estimator=lgb.LGBMRegressor(
            random_state=42,
            n_jobs=-1,
            verbose=-1,
            metric='rmse'
        ),
        param_distributions=param_distributions,
        n_iter=15,  # Reduced iterations for stability
        scoring='neg_root_mean_squared_error',
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=42
    )

    robust_model.fit(X_train_split, y_train_split)

    print(f"🏆 Best parameters: {robust_model.best_params_}")
    print(f"🏆 Best CV score: {-robust_model.best_score_:.2f}")

    # === ROBUST MODEL TRAINING ===
    print("\n🔥 Training robust model...")
    best_params = robust_model.best_params_

    robust_final_model = lgb.LGBMRegressor(**best_params, random_state=42, n_jobs=-1)

    # CORRECTED: Remove 'verbose' from fit() and use callbacks for early stopping
    robust_final_model.fit(
        X_train_split,
        y_train_split,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    # === ROBUST EVALUATION ===
    val_preds = robust_final_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    r2 = r2_score(y_val, val_preds)

    print(f"✅ Validation RMSE: {rmse:.2f}")
    print(f"✅ Validation MAE: {mae:.2f}")
    print(f"✅ Validation R²: {r2:.6f}")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train_prepared.columns,
        'importance': robust_final_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n📊 Top 10 Feature Importance:")
    print(feature_importance.head(10).to_string())

    # === FINAL ROBUST TRAINING ===
    print("\n🎯 Training final robust model on full data...")
    final_robust_model = lgb.LGBMRegressor(**best_params, random_state=42, n_jobs=-1)
    final_robust_model.fit(X_train_prepared, y_train)

    # === ROBUST PREDICTION ===
    test_predictions = final_robust_model.predict(X_test_prepared)

    # Post-processing
    test_predictions = np.maximum(test_predictions, 1000)
    test_predictions = np.round(test_predictions, 2)

    # Save robust submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv('ROBUST_lightgbm_submission.csv', index=False)

    # Performance report
    print("\n" + "="*50)
    print("🏆 ROBUST MODEL PERFORMANCE REPORT 🏆")
    print("="*50)
    print(f"Final Validation RMSE: ${rmse:,.2f}")
    print(f"Final Validation MAE: ${mae:,.2f}")
    print(f"Final Validation R²: {r2:.6f}")
    print(f"Predictions range: ${test_predictions.min():,.2f} - ${test_predictions.max():,.2f}")
    print("✅ ROBUST submission file saved!")
    print("="*50)

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🚀 Starting ROBUST LightGBM Modeling...
🔧 Performing robust feature engineering...
⚙️ Preparing robust features...
✅ Data types in prepared features:
int64      14
float64     6
Name: count, dtype: int64
🎯 Performing efficient hyperparameter tuning...
Fitting 3 folds for each of 15 candidates, totalling 45 fits
🏆 Best parameters: {'subsample': 0.9, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'num_leaves': 31, 'n_estimators': 800, 'min_child_samples': 100, 'max_depth': 8, 'learning_rate': 0.01, 'colsample_bytree': 0.9}
🏆 Best CV score: 73267.89

🔥 Training robust model...
✅ Validation RMSE: 69195.94
✅ Validation MAE: 19525.69
✅ Validation R²: 0.163858

📊 Top 10 Feature Importance:
                feature  importance
1                milage        3549
4        miles_per_year        3124
16                model        2939
5   engine_displacement        2800
0         