<a href="https://colab.research.google.com/github/fachiny17/kaggle/blob/main/dsn_bootcamp_qualification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read the CSV files
# from /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training and test CSVs from the mounted Drive
# (update the paths to wherever your files are stored in Drive).
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

In [None]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [None]:
# Preview the first rows of the test data.
test_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [None]:
# file: fast_pipeline.py
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, KFold
from joblib import dump
from scipy.stats import uniform, randint
import warnings
import re

def create_preprocessing_pipeline():
    """Build the unfitted ColumnTransformer used in front of the regressor.

    Numeric columns are mean-imputed and then standardized. Categorical
    columns are imputed with their most frequent value and one-hot encoded
    into a dense array; categories not seen during fitting are ignored
    (encoded as all zeros).

    Returns:
        ColumnTransformer with two branches named ``'num'`` and ``'low_cat'``.
        Columns not listed in either branch are not passed to the model.
    """
    numeric_cols = ['model_year', 'engine_displacement', 'milage']

    # NOTE(review): every categorical column goes through one-hot encoding,
    # including ones such as 'brand' and 'model' that may have many distinct
    # values. Routing those through an OrdinalEncoder
    # (handle_unknown='use_encoded_value', unknown_value=-1) was tried earlier
    # and is currently disabled.
    categorical_cols = [
        'brand', 'model', 'ext_col', 'int_col',
        'fuel_type', 'transmission', 'accident', 'clean_title',
    ]

    numeric_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_branch = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    # The branch names are part of the parameter paths, so they stay unchanged.
    return ColumnTransformer(
        transformers=[
            ('num', numeric_branch, numeric_cols),
            ('low_cat', categorical_branch, categorical_cols),
        ]
    )


In [None]:
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a numeric mileage and an engine-size column.

    Args:
        df: Frame containing at least the ``milage`` and ``engine`` columns.

    Returns:
        A new DataFrame (the input is not modified) in which ``milage`` is a
        float with thousands separators removed, and ``engine_displacement``
        holds the number that precedes an ``L`` in the ``engine`` text
        (NaN when no such number is present).
    """
    data = df.copy()

    # Mileage: drop thousands separators before converting to float.
    mileage_text = data['milage'].astype(str)
    data['milage'] = mileage_text.str.replace(',', '', regex=False).astype(float)

    # Displacement: first number directly followed by (optional spaces and) 'L'.
    engine_text = data['engine'].astype(str)
    litre_match = engine_text.str.extract(r'(\d+\.?\d*)\s*L')
    data['engine_displacement'] = litre_match[0].astype(float)

    return data




In [None]:
def evaluate_regression(y_true, y_pred):
    """Compute MAE, MSE, RMSE and R2 for a set of predictions.

    The metrics dictionary is printed and then returned.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        dict with the keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``.
    """
    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        'MAE': mean_absolute_error(y_true, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),  # root of the MSE computed above
        "R2": r2_score(y_true, y_pred),
    }
    print(f"Metrics: {metrics}")
    return metrics

In [None]:
def main():
    """Tune a HistGradientBoostingRegressor pipeline and write a submission CSV.

    Workflow:
      1. Load the train/test CSVs from Drive and run ``extract_features``.
      2. Hold out 20% of the training rows for validation.
      3. Fit a baseline pipeline and report its hold-out metrics.
      4. Run RandomizedSearchCV (30 candidates, 3-fold CV, RMSE scoring).
      5. Compare baseline vs. tuned metrics on the hold-out set.
      6. Refit the best pipeline on all training rows and save the test
         predictions to ``histgb_randomisedsearchcv.csv``.
    """
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Feature engineering
    train_processed = extract_features(train_df)
    test_processed = extract_features(test_df)

    # Target variable: strip currency formatting before converting to float
    y_train = (
        train_processed['price']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    # Drop target
    X_train = train_processed.drop(columns=['price'])

    # Split training data for validation (80% train, 20% validation)
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Build pipeline
    preprocessor = create_preprocessing_pipeline()
    model = HistGradientBoostingRegressor(
        random_state=42,
        max_iter=100,
        learning_rate=0.1,
        max_depth=10,
        min_samples_leaf=20,
        l2_regularization=0.1
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Train baseline model ------------------------------------------------------
    print("Training baseline model...")
    baseline_model = pipeline.fit(X_train_split, y_train_split)
    baseline_preds = baseline_model.predict(X_val)
    baseline_metrics = evaluate_regression(y_val, baseline_preds)

    # Parameter distributions for RandomizedSearchCV -------------------
    # 'model__learning_rate' used to appear twice in this dict literal; Python
    # keeps only the last value for a repeated key, so the discrete list below
    # is what the search actually sampled. The shadowed uniform(0.01, 0.3)
    # entry has been removed so the dict states what is really searched.
    param_distributions = {
        'model__max_iter': randint(50, 1000),
        'model__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'model__max_depth': randint(3, 20),
        'model__min_samples_leaf': randint(5, 50),
        'model__l2_regularization': uniform(0.001, 1.0),  # 0.001 to 1.001
        'model__max_bins': [128, 255],
    }

    print("\nStarting RandomizedSearchCV...")
    rs_model = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=30,  # Try 30 random combinations (adjust as needed)
        scoring='neg_root_mean_squared_error',
        cv=3,  # Fewer folds for faster tuning
        verbose=2,
        n_jobs=-1,
        random_state=42
    )

    rs_model.fit(X_train_split, y_train_split)

    # Best model -------------------------------------------------------
    print(f"\nBest parameters: {rs_model.best_params_}")
    best_model = rs_model.best_estimator_
    rs_preds = best_model.predict(X_val)
    rs_metrics = evaluate_regression(y_val, rs_preds)

    # Compare metrics ------------------------------------------------------
    compare_metrics = pd.DataFrame({
        'Baseline': baseline_metrics,
        'RandomizedSearchCV': rs_metrics
    }).T
    print("\nModel Comparison:")
    print(compare_metrics)

    # Final training: refit the best pipeline on the FULL training data
    print("\nTraining final model on full training data...")
    final_model = best_model.fit(X_train, y_train)

    # Predict on the actual competition test set; drop 'price' defensively in
    # case test_processed ever carries that column.
    competition_test = test_processed.drop(columns=['price'], errors='ignore')
    test_predictions = final_model.predict(competition_test)

    # Save submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv('histgb_randomisedsearchcv.csv', index=False)
    print("Submission file saved")

if __name__ == "__main__":
    main()

Training baseline model...
  MAE: 19835.6606
  RMSE: 68243.7918
  R2: 0.1625

Starting RandomizedSearchCV...
Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [2]:
# LightGBM Regressor with preprocessing + RandomizedSearchCV
# Mount Google Drive so the CSVs under /content/drive/MyDrive can be read below.
from google.colab import drive
drive.mount('/content/drive')

# Imports
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import randint, uniform
from lightgbm import LGBMRegressor

# ---------------- Preprocessing -----------------
def create_preprocessing_pipeline():
    """Build the unfitted ColumnTransformer used in front of LGBMRegressor.

    Numeric columns are mean-imputed. Categorical columns are imputed with
    their most frequent value and one-hot encoded; categories not seen during
    fitting are ignored (encoded as all zeros).

    The one-hot branch emits a scipy sparse matrix. A dense encoding of these
    columns produces a very wide float matrix (the training log of this
    notebook reports 1833 used features for ~170k rows), which makes every fit
    in the hyper-parameter search slow and memory hungry. LightGBM accepts
    sparse input, and ColumnTransformer stacks the branches into a sparse
    matrix when the overall density is below its ``sparse_threshold``.

    Returns:
        ColumnTransformer with two branches named ``'num'`` and ``'cat'``.
    """
    numerical_features = ['model_year', 'engine_displacement', 'milage']
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean'))
    ])

    categorical_features = ['brand', 'model', 'ext_col', 'int_col',
                            'fuel_type', 'transmission', 'accident', 'clean_title']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # sparse_output=True keeps the wide one-hot block compact in memory.
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
    ])

    return ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

def extract_features(df):
    """Return a copy of ``df`` with cleaned mileage and an engine-size column.

    ``milage`` is converted to float after removing thousands separators.
    ``engine_displacement`` is the number that precedes an ``L`` in the
    ``engine`` text, or NaN when there is none. The input frame is left
    untouched.
    """
    mileage = (
        df['milage']
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    displacement = (
        df['engine']
        .astype(str)
        .str.extract(r'(\d+\.?\d*)\s*L')[0]
        .astype(float)
    )

    data = df.copy()
    data['milage'] = mileage
    data['engine_displacement'] = displacement
    return data

def evaluate_regression(y_true, y_pred):
    """Return MAE, RMSE and R2 for the given predictions as a dict."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

# ---------------- Main -----------------
# Load the raw competition files from the mounted Drive.
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

# Add the cleaned 'milage' and derived 'engine_displacement' columns.
train_processed = extract_features(train_df)
test_processed = extract_features(test_df)

# Target: strip '$' and ',' from the price text, then convert to float.
y_train = train_processed['price'].astype(str).str.replace('[$,]', '', regex=True).astype(float)
X_train = train_processed.drop(columns=['price'])

# Hold out 10% of the training rows for validation.
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Preprocessing + LightGBM with default hyper-parameters.
pipeline = Pipeline(steps=[
    ('preprocessor', create_preprocessing_pipeline()),
    ('model', LGBMRegressor(random_state=42))
])

# Baseline
print("Training LightGBM baseline...")
baseline_model = pipeline.fit(X_train_split, y_train_split)
baseline_preds = baseline_model.predict(X_val)
print("Baseline metrics:", evaluate_regression(y_val, baseline_preds))

# Param space
# Keys use the 'model__' prefix so they are routed to the pipeline's 'model' step.
param_dist = {
    'model__n_estimators': randint(100, 1000),
    'model__learning_rate': uniform(0.01, 0.3),
    'model__max_depth': randint(3, 20),
    'model__num_leaves': randint(20, 200),
    'model__min_child_samples': randint(5, 100)
}

# RandomizedSearch
# 30 sampled candidates x 3 CV folds = 90 fits, scored by (negated) RMSE.
print("\nTuning LightGBM with RandomizedSearchCV...")
rs = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=30,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)
rs.fit(X_train_split, y_train_split)

print("Best params:", rs.best_params_)

# Evaluate best model
val_preds = rs.predict(X_val)
print("Validation metrics:", evaluate_regression(y_val, val_preds))

# Final model on full data
final_model = rs.best_estimator_.fit(X_train, y_train)
# Drop 'price' defensively; errors='ignore' makes this a no-op when it is absent.
competition_test = test_processed.drop(columns=['price'], errors='ignore')
test_preds = final_model.predict(competition_test)

# Save submission
submission = pd.DataFrame({'id': test_df['id'], 'price': test_preds})
submission.to_csv('lightgbm_submission.csv', index=False)
print("LightGBM submission saved!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training LightGBM baseline...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.683314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4012
[LightGBM] [Info] Number of data points in the train set: 169679, number of used features: 1833
[LightGBM] [Info] Start training from score 43888.560718




Baseline metrics: {'MAE': 19695.47160267817, 'RMSE': np.float64(69699.18035558485), 'R2': 0.15165160352832463}

Tuning LightGBM with RandomizedSearchCV...
Fitting 3 folds for each of 30 candidates, totalling 90 fits


KeyboardInterrupt: 

In [7]:
# Mount Google Drive so the CSVs under /content/drive/MyDrive can be read by main()
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
# NOTE(review): this suppresses every warning category from here on, which can
# also hide genuine pandas / LightGBM warnings raised by the code below.
warnings.filterwarnings('ignore')

def extract_features(df):
    """Simple feature engineering.

    Args:
        df: Input frame. The ``milage`` and ``engine`` columns are optional;
            each is processed only when present.

    Returns:
        A copy of ``df`` in which ``milage`` (if present) is a float with
        thousands separators removed, and ``engine_displacement`` (added when
        ``engine`` is present) holds the engine size in litres, or NaN when
        the engine text contains no ``<number>L`` / ``<number> Liter`` token.
    """
    data = df.copy()

    # Clean milage (the dataset's own spelling of the column name)
    if 'milage' in data.columns:
        data['milage'] = (
            data['milage'].astype(str).str.replace(',', '', regex=False).astype(float)
        )

    # Extract engine displacement if engine column exists
    if 'engine' in data.columns:
        # Anchor on the litre marker. Taking the first number in the text
        # would return the horsepower for strings such as
        # "172.0HP 1.6L 4 Cylinder Engine", not the displacement.
        data['engine_displacement'] = (
            data['engine']
            .astype(str)
            .str.extract(r'(\d+\.?\d*)\s*L')[0]
            .astype(float)
        )

    return data

def main():
    """Train a label-encoded LightGBM regressor and write a submission CSV.

    Workflow:
      1. Load the train/test CSVs from Drive and run ``extract_features``.
      2. Label-encode the categorical feature columns (encoders are fitted on
         train and test values together so both share one code space).
      3. Impute remaining missing values with the TRAINING medians.
      4. Fit with early stopping on a 20% hold-out and report RMSE/MAE/R².
      5. Refit on all training rows with the same hyper-parameters and the
         early-stopped number of trees, then save test predictions to
         ``lightgbm_submission.csv``.
    """
    # Load data
    train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
    test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Feature engineering
    train_processed = extract_features(train_df)
    test_processed = extract_features(test_df)

    # Target variable: strip currency formatting before converting to float
    y_train = (
        train_processed['price']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    # Features - keep only the selected columns. The explicit .copy() gives
    # independent frames, so the column assignments below do not write into a
    # selection of train_processed / test_processed.
    feature_cols = ['model_year', 'milage', 'brand', 'model', 'fuel_type', 'transmission']
    X_train = train_processed[feature_cols].copy()
    X_test = test_processed[feature_cols].copy()

    # Encode categorical variables for LightGBM
    categorical_cols = ['brand', 'model', 'fuel_type', 'transmission']
    for col in categorical_cols:
        encoder = LabelEncoder()
        # Combine train and test for consistent encoding
        combined = pd.concat([X_train[col], X_test[col]], axis=0)
        encoder.fit(combined.astype(str))
        X_train[col] = encoder.transform(X_train[col].astype(str))
        X_test[col] = encoder.transform(X_test[col].astype(str))

    # Handle missing values: statistics come from the training data only and
    # are applied to both frames, so train and test are imputed identically.
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Train-validation split
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Hyper-parameters shared by the validated model and the final model, so
    # the model that is submitted matches the one that was evaluated.
    base_params = dict(
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )

    # LightGBM model with early stopping configured in the constructor
    print("Training LightGBM model...")
    model = lgb.LGBMRegressor(
        n_estimators=1000,
        early_stopping_rounds=50,
        **base_params
    )

    # Early stopping monitors RMSE on the validation set
    model.fit(
        X_train_split,
        y_train_split,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse'
    )

    # Evaluate
    val_preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    mae = mean_absolute_error(y_val, val_preds)
    r2 = r2_score(y_val, val_preds)

    print(f"Validation RMSE: {rmse:.2f}")
    print(f"Validation MAE: {mae:.2f}")
    print(f"Validation R²: {r2:.4f}")

    # Final training on full data, using the early-stopped tree count; fall
    # back to 1000 when no best iteration is available.
    print("\nTraining final model on full data...")
    best_iteration = getattr(model, 'best_iteration_', None) or 1000
    final_model = lgb.LGBMRegressor(n_estimators=best_iteration, **base_params)
    final_model.fit(X_train, y_train)

    # Predict on test set
    test_predictions = final_model.predict(X_test)

    # Save submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv('lightgbm_submission.csv', index=False)
    print("✅ LightGBM submission file saved!")

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training LightGBM model...
Validation RMSE: 68229.69
Validation MAE: 20052.95
Validation R²: 0.1629

Training final model on full data...
✅ LightGBM submission file saved!
