<a href="https://colab.research.google.com/github/fachiny17/kaggle/blob/main/dsn_bootcamp_qualification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump

In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths read later in this
# notebook live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the training and test CSVs from the mounted Drive
# (update the paths to wherever the files are stored in your Drive).
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

In [None]:
# Preview the first rows of the training data (includes the `price` target).
train_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [None]:
# Preview the first rows of the test data (same columns, without `price`).
test_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [7]:
# file: fast_pipeline.py
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV, KFold
from joblib import dump
import warnings
import re

def create_preprocessing_pipeline():
    """Build the unfitted ColumnTransformer that prepares model inputs.

    The input frame is expected to contain the ``engine_displacement`` column
    added by ``extract_features``. Three column groups are transformed; any
    other column is dropped (ColumnTransformer's default ``remainder``):

    * numeric columns: mean imputation, then standard scaling;
    * low-cardinality categoricals: most-frequent imputation, then dense
      one-hot encoding (categories unseen at fit time encode as all zeros);
    * high-cardinality categoricals: most-frequent imputation, then ordinal
      encoding (categories unseen at fit time encode as -1).

    Returns:
        ColumnTransformer: the unfitted preprocessor.
    """
    numerical_features = ['model_year', 'engine_displacement', 'milage']
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Low-cardinality categorical features → OneHot
    low_card_cat = ['fuel_type', 'transmission', 'accident', 'clean_title']
    low_card_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # High-cardinality categorical features → Ordinal.
    # Dense one-hot encoding adds one output column per distinct value, so
    # these columns are mapped to a single integer code each instead.
    high_card_cat = ['brand', 'model', 'ext_col', 'int_col']
    high_card_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('low_cat', low_card_transformer, low_card_cat),
            ('high_cat', high_card_transformer, high_card_cat)
        ]
    )
    return preprocessor


In [8]:
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric mileage and engine displacement.

    * ``milage`` is converted to text, stripped of thousands separators
      (``,``) and cast to float.
    * ``engine_displacement`` is the first number in ``engine`` that is
      followed (optionally after whitespace) by a capital ``L``, as a float;
      rows without such a number get NaN.

    The input frame is left untouched.
    """
    enriched = df.copy()

    # Mileage: drop comma separators, then convert to float.
    mileage_text = enriched['milage'].astype(str)
    enriched['milage'] = mileage_text.str.replace(',', '', regex=False).astype(float)

    # Displacement: vectorized regex capture of the litre figure.
    engine_text = enriched['engine'].astype(str)
    litre_matches = engine_text.str.extract(r'(\d+\.?\d*)\s*L')
    enriched['engine_displacement'] = litre_matches[0].astype(float)

    return enriched




In [9]:
def evaluate_regression(y_true, y_pred):
    """Compute MAE, MSE, RMSE and R2 for a set of predictions.

    The resulting dictionary is printed and then returned.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # MSE is computed once and reused for the RMSE.
    mse = mean_squared_error(y_true, y_pred)
    scores = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        "R2": r2_score(y_true, y_pred),
    }
    print(f"Metrics: {scores}")
    return scores

In [None]:
def main(
    train_path='/content/drive/MyDrive/train.csv',
    test_path='/content/drive/MyDrive/test.csv',
    submission_path='histgb.csv',
):
    """Train, tune and evaluate a price regressor, then write a submission CSV.

    Steps:
      1. Load the train/test CSVs and run ``extract_features`` on both.
      2. Hold out 20% of the training rows for validation.
      3. Fit the baseline pipeline on the other 80% and score it on the
         hold-out.
      4. Grid-search the regressor's hyperparameters on the same 80% and
         score the best estimator on the same hold-out.
      5. Refit the best estimator on all training rows, predict the test set
         and save the ``id``/``price`` columns to ``submission_path``.

    Args:
        train_path: CSV with the feature columns and a ``price`` column.
        test_path: CSV with the feature columns and an ``id`` column.
        submission_path: Destination of the submission CSV.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Feature engineering
    train_processed = extract_features(train_df)
    test_processed = extract_features(test_df)

    # Target variable and feature matrix
    y_train = _parse_price(train_processed['price'])
    X_train = train_processed.drop(columns=['price'])

    # Split training data for validation (80% train, 20% validation)
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    pipeline = _build_baseline_pipeline()

    # Train baseline model ------------------------------------------------------
    print("Training baseline model...")
    baseline_model = pipeline.fit(X_train_split, y_train_split)
    baseline_preds = baseline_model.predict(X_val)
    baseline_metrics = evaluate_regression(y_val, baseline_preds)

    # Hyperparameter grid for HistGradientBoosting----------------------------
    # 3*3*3*3*3*2 = 486 candidates; with cv=3 that is 1458 fits plus the
    # final refit, so expect this step to dominate the runtime.
    param_grid = {
        'model__max_iter': [100, 200, 500],
        'model__learning_rate': [0.05, 0.1, 0.2],
        'model__max_depth': [5, 10, 15],
        'model__min_samples_leaf': [10, 20, 30],
        'model__l2_regularization': [0.01, 0.1, 1.0],
        'model__max_bins': [128, 255]
    }

    # GridSearchCV -----------------------------------------------------
    print("\nStarting GridSearchCV.......")
    gs_model = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=3,
        verbose=2,
        n_jobs=-1
    )
    gs_model.fit(X_train_split, y_train_split)

    # Best model -------------------------------------------------------
    print(f"\nBest parameters: {gs_model.best_params_}")
    best_model = gs_model.best_estimator_
    gs_preds = best_model.predict(X_val)
    gs_metrics = evaluate_regression(y_val, gs_preds)

    # Compare metrics ------------------------------------------------------
    compare_metrics = pd.DataFrame({
        'Baseline': baseline_metrics,
        'GridSearchCV': gs_metrics
    }).T
    print("\nModel Comparison:")
    print(compare_metrics)

    # Final training: refit the best estimator on the FULL training data.
    # This happens after the validation scores above were computed, so it
    # does not affect the reported comparison.
    print("\nTraining final model on full training data...")
    final_model = best_model.fit(X_train, y_train)

    # Predict on the actual competition test set; drop `price` defensively in
    # case test_processed ever contains it.
    competition_test = test_processed.drop(columns=['price'], errors='ignore')
    test_predictions = final_model.predict(competition_test)

    # Save submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'price': test_predictions
    })
    submission.to_csv(submission_path, index=False)
    print("Submission file saved")


def _parse_price(price):
    """Return ``price`` as floats after removing any '$' and ',' characters."""
    return (
        price
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )


def _build_baseline_pipeline():
    """Chain the preprocessing ColumnTransformer with the baseline regressor."""
    model = HistGradientBoostingRegressor(
        random_state=42,
        max_iter=100,
        learning_rate=0.1,
        max_depth=10,
        min_samples_leaf=20,
        l2_regularization=0.1
    )
    return Pipeline(steps=[
        ('preprocessor', create_preprocessing_pipeline()),
        ('model', model)
    ])

# Run the full workflow when this cell (or the file, as a script) is executed
# directly rather than imported.
if __name__ == "__main__":
    main()

Training baseline model...
