In [64]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [65]:
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

# Feature Engineering

In [66]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
        X['TotalBath'] = (
            X['FullBath'] + 0.5 * X['HalfBath'] +
            X['BsmtFullBath'] + 0.5 * X['BsmtHalfBath']
        )
        X['TotalPorchSF'] = (
            X['OpenPorchSF'] + X['EnclosedPorch'] +
            X['3SsnPorch'] + X['ScreenPorch']
        )
        X['HasPool'] = (X['PoolArea'] > 0).astype(int)
        X['HasFireplace'] = (X['Fireplaces'] > 0).astype(int)
        X['HasGarage'] = (X['GarageArea'] > 0).astype(int)
        X['HasBsmt'] = (X['TotalBsmtSF'] > 0).astype(int)
        return X

# Cleaning

In [67]:
X = df.drop(columns=['Id', 'SalePrice'])
y = df['SalePrice']

feature_engineer = FeatureEngineer()

numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

pipeline = Pipeline([
    ('features', feature_engineer),
    ('preprocessor', preprocessor),
    ('model', Ridge(alpha=10.0))
])


# Training

In [68]:
import mlflow
import mlflow.sklearn

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

with mlflow.start_run():
    pipeline.fit(X_train, y_train)

    preds = pipeline.predict(X_val)
    preds = np.maximum(preds, 0)
    rmse = np.sqrt(np.mean((y_val - preds)**2))

    mlflow.sklearn.log_model(pipeline, "ridge-pipeline")
    mlflow.log_metric("rmse", rmse)

    print(f"RMSE: {rmse:.2f}")



RMSE: 30524.56
🏃 View run charming-wasp-557 at: https://dagshub.com/gioeba/AdvancedRegressionTechniques.mlflow/#/experiments/3/runs/46a8a061339f405a8571c8176bd42631
🧪 View experiment at: https://dagshub.com/gioeba/AdvancedRegressionTechniques.mlflow/#/experiments/3


In [69]:
import dagshub
dagshub.init(repo_owner='gioeba', repo_name='AdvancedRegressionTechniques', mlflow=True)

mlflow.set_tracking_uri("https://dagshub.com/gioeba/AdvancedRegressionTechniques.mlflow")
mlflow.set_experiment("ridge-model-with-pipeline")
mlflow.set_registry_uri("https://dagshub.com/gioeba/AdvancedRegressionTechniques.mlflow")