We will focus on assembling a pipeline and making everything very compact.

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [2]:
# Read the training set CSV from the local download folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data_downloaded/train.csv')

In [9]:
# Step 1: Separate the regression target from the features
y = df['SalePrice']  # target: the 'SalePrice' column
X = df.drop('SalePrice', axis=1)  # features: every column except 'SalePrice'

In [19]:
# Numeric columns to use. The names must match the DataFrame headers exactly:
# the frame spells them 'LotArea' and 'LotFrontage' (no underscores), so the
# underscored spellings would fail on column selection.
n_columns_to_use = ['LotArea', 'LotFrontage']

In [20]:
# Display the dtype of every column in df.
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [3]:
# Step 2: Group the column names by the kind of preprocessing they receive.
# NOTE(review): these look like placeholder names -- confirm and replace them
# with real column names from the dataset before fitting.

# Nominal categorical columns
categorical_nominal_cols = [
    'col1',
    'col2',
    'col3',
]

# Ordinal categorical columns
categorical_ordinal_cols = [
    'col4',
    'col5',
    'col6',
]

# Binary columns
binary_cols = [
    'bin1',
    'bin2',
    'bin3',
]

# Numeric columns
numeric_cols = [
    'num1',
    'num2',
    'num3',
]

In [4]:
# Step 3: Define preprocessing for the different types of features

# Numeric features: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Ordinal features: fill missing values with the most frequent value, then
# encode each level by its position in `ordinal_levels` (listed lowest first).
# OrdinalEncoder needs one category list per input column, so the shared scale
# is repeated once for every column in categorical_ordinal_cols; passing a
# single list fails with a shape-mismatch ValueError when several columns are
# fitted.
# NOTE(review): assumes every ordinal column uses this same five-level
# scale -- confirm against the data.
ordinal_levels = ['Very Bad', 'Bad', 'Average', 'Good', 'Excellent']
categorical_ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(
        categories=[ordinal_levels] * len(categorical_ordinal_cols)))])

# Nominal features: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' keeps transform from failing on
# categories that were not seen during fit.
categorical_nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Binary features: only fill missing values with the most frequent value; no
# encoding or scaling step is applied.
binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [5]:
# Step 4: Combine the per-type transformers into a single ColumnTransformer.
# Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat_ord', categorical_ordinal_transformer, categorical_ordinal_cols),
    ('cat_nom', categorical_nominal_transformer, categorical_nominal_cols),
    ('bin', binary_transformer, binary_cols),
])

In [6]:
# Steps 5 and 6: Instantiate the two regressors with their default settings:
# a linear regression (model_lr) and an XGBoost regressor (model_xgb).
model_lr, model_xgb = LinearRegression(), XGBRegressor()

In [7]:
# Step 7: Chain the preprocessing step and a final regressor, one pipeline per
# model. Both pipelines are handed the same `preprocessor` object.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model_lr),
])

pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model_xgb),
])

In [8]:
# Step 8: Wrap each pipeline in a TransformedTargetRegressor so the target is
# passed through a StandardScaler; each wrapper gets its own scaler instance.
final_pipeline_lr = TransformedTargetRegressor(
    transformer=StandardScaler(),
    regressor=pipeline_lr,
)

final_pipeline_xgb = TransformedTargetRegressor(
    transformer=StandardScaler(),
    regressor=pipeline_xgb,
)


**We can add PCA after feature engineering.**

**I was honestly very curious about the variances of the features.**

In [11]:
# Show the 20 columns with the largest variance.
# numeric_only=True limits the reduction to numeric columns: df also contains
# object (string) columns, for which a bare df.var() emits a deprecation
# warning on older pandas and raises a TypeError on pandas >= 2.0.
df.var(numeric_only=True).nlargest(20)

  df.var().nlargest(20)


SalePrice        6.311111e+09
LotArea          9.962565e+07
GrLivArea        2.761296e+05
MiscVal          2.461381e+05
BsmtFinSF1       2.080255e+05
BsmtUnfSF        1.952464e+05
TotalBsmtSF      1.924624e+05
2ndFlrSF         1.905571e+05
Id               1.777550e+05
1stFlrSF         1.494501e+05
GarageArea       4.571251e+04
MasVnrArea       3.278497e+04
BsmtFinSF2       2.602391e+04
WoodDeckSF       1.570981e+04
OpenPorchSF      4.389861e+03
EnclosedPorch    3.735550e+03
ScreenPorch      3.108889e+03
LowQualFinSF     2.364204e+03
MSSubClass       1.789338e+03
PoolArea         1.614216e+03
dtype: float64