WE WILL FOCUS ON ASSEMBLING A PIPELINE AND MAKING EVERYTHING VERY COMPACT

In [6]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from xgboost import XGBRegressor

In [8]:
# Load the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data_downloaded/train.csv')

In [9]:
# Step 1: Split the frame into the prediction target and the feature matrix
y = df['SalePrice']               # Target: the sale-price column
X = df.drop('SalePrice', axis=1)  # Features: every column except the target

In [19]:
# Numeric columns to use, spelled the way the df.dtypes listing shows them
# ('LotArea', 'LotFrontage' -- no underscores). The previous underscore
# spellings do not appear in that listing, so selecting them from df would fail.
n_columns_to_use = ['LotArea', 'LotFrontage']

In [20]:
# Display the dtype pandas assigned to each column of df; the listing also
# shows the exact column names available for the feature lists.
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [10]:
# Step 2: Group the feature columns by the preprocessing they need.
# NOTE(review): every name below looks like a template placeholder -- none of
# them appears in the visible df.dtypes listing. Replace them with real column
# names before fitting anything built on these lists.

# Continuous / count features.
numeric_cols = [
    'num1',
    'num2',
    'num3',
]

# Two-valued features.
binary_cols = [
    'bin1',
    'bin2',
    'bin3',
]

# Categorical features whose levels have a meaningful order.
categorical_ordinal_cols = [
    'col4',
    'col5',
    'col6',
]

# Categorical features with no inherent order.
categorical_nominal_cols = [
    'col1',
    'col2',
    'col3',
]

In [12]:
# Step 3: Define preprocessing for different types of features

# Numeric features: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Ordered levels used to encode ordinal features, listed from lowest to highest.
# NOTE(review): assumes every column in categorical_ordinal_cols uses exactly
# these labels -- confirm against the data.
ordinal_levels = ['Very Bad', 'Bad', 'Average', 'Good', 'Excellent']

# Ordinal features: fill missing values with the most frequent level, then map
# levels to their position in ordinal_levels.
# OrdinalEncoder expects `categories` to hold one list per input column; a
# single list raises a shape-mismatch ValueError at fit time when the
# transformer is applied to several columns, so the scale is repeated once per
# ordinal column.
categorical_ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(
        categories=[list(ordinal_levels) for _ in categorical_ordinal_cols]))])

# Nominal features: fill missing values with the most frequent level, then
# one-hot encode; handle_unknown='ignore' keeps levels unseen during fit from
# raising at transform time.
categorical_nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Binary features: only impute with the most frequent value; no encoding or
# scaling step is applied.
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [13]:
# Step 4: Route each column group through its matching transformer.
# Entries are (name, transformer, columns); the outputs are concatenated in the
# order listed here. Columns not named in any group are dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_cols),
        ('cat_ord', categorical_ordinal_transformer, categorical_ordinal_cols),
        ('cat_nom', categorical_nominal_transformer, categorical_nominal_cols),
        ('bin', binary_transformer, binary_cols),
    ],
    remainder='drop',
)

In [14]:
# Steps 5 and 6: Instantiate the two candidate regressors -- ordinary linear
# regression and XGBoost -- both with their default settings.
model_lr, model_xgb = LinearRegression(), XGBRegressor()

In [15]:
# Step 7: Define the pipeline with the final regressor
# Each pipeline gets its own clone of the preprocessor. A Pipeline keeps the
# step objects it is handed, so passing the same `preprocessor` instance to both
# would let fitting one pipeline overwrite the fitted preprocessing state the
# other one predicts with.
pipeline_lr = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('regressor', model_lr)])

pipeline_xgb = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('regressor', model_xgb)])

In [16]:
# Step 8: Wrap each pipeline so the target is standardized for fitting.
# TransformedTargetRegressor fits the wrapped regressor on the transformed
# target and maps predictions back to the original target scale.
final_pipeline_lr = TransformedTargetRegressor(
    transformer=StandardScaler(),
    regressor=pipeline_lr,
)

final_pipeline_xgb = TransformedTargetRegressor(
    transformer=StandardScaler(),
    regressor=pipeline_xgb,
)
