WE WILL FOCUS ON ASSEMBLING A PIPELINE AND MAKING EVERYTHING VERY COMPACT

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv('data_downloaded/train.csv')

In [9]:
# Step 1: Define features and target
X = df.drop(columns=['SalePrice'])  # Features
y = df['SalePrice']  # Target

In [3]:
# Step 2: Define categorical, ordinal, and numeric columns
categorical_nominal_cols = ['col1', 'col2', 'col3']  # List of nominal categorical columns
categorical_ordinal_cols = ['col4', 'col5', 'col6']  # List of ordinal categorical columns
binary_cols = ['bin1', 'bin2', 'bin3']  # List of binary columns
numeric_cols = ['num1', 'num2', 'num3']  # List of numeric columns

In [4]:
# Step 3: Define preprocessing for different types of features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=[['Very Bad', 'Bad', 'Average', 'Good', 'Excellent']]))])

categorical_nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [5]:
# Step 4: Bundle preprocessing for all types of features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat_ord', categorical_ordinal_transformer, categorical_ordinal_cols),
        ('cat_nom', categorical_nominal_transformer, categorical_nominal_cols),
        ('bin', binary_transformer, binary_cols)])

In [6]:
# Step 5: Define the Regressor (Linear Regression)
model_lr = LinearRegression()

# Step 6: Define the Regressor (XGBoost)
model_xgb = XGBRegressor()

In [7]:
# Step 7: Define the pipeline with the final regressor
pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', model_lr)])

pipeline_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model_xgb)])

In [8]:
# Step 8: Define the final pipeline with transformed target regressor
final_pipeline_lr = TransformedTargetRegressor(regressor=pipeline_lr,
                                               transformer=StandardScaler())

final_pipeline_xgb = TransformedTargetRegressor(regressor=pipeline_xgb,
                                                transformer=StandardScaler())


**Feature Engineering den sonra PCA ekleyebiliriz**

**Featurelarin variancelerini cok merak ettim dogrusu**

In [29]:
df.var().nlargest(10)

  df.var().nlargest(10)


SalePrice      6.311111e+09
LotArea        9.962565e+07
GrLivArea      2.761296e+05
MiscVal        2.461381e+05
BsmtFinSF1     2.080255e+05
BsmtUnfSF      1.952464e+05
TotalBsmtSF    1.924624e+05
2ndFlrSF       1.905571e+05
Id             1.777550e+05
1stFlrSF       1.494501e+05
dtype: float64

**Bir de trimmed variance bakalim**

In [20]:
from scipy.stats.mstats import trimmed_var

In [28]:
df.select_dtypes(include='number').apply(trimmed_var).sort_values(ascending=False).head(10)

SalePrice      1882790400.47177
LotArea          4600761.244546
2ndFlrSF          126369.721195
Id                    113685.25
BsmtFinSF1        108198.508749
GrLivArea         102768.092838
BsmtUnfSF          85391.659046
TotalBsmtSF        63500.571484
1stFlrSF           62147.608644
GarageArea          15760.73548
dtype: object

In [32]:
# A sample of encoding and standardizing

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Step 1: Create the DataFrame
data = {'Category_Ordinal': ['Low', 'Medium', 'High', 'Medium'],
        'Category_Nominal': ['Red', 'Blue', 'Green', 'Red'],
        'Category_Binary': [1, 0, 1, 0],
        'Continuous_Num1': [25.2, 30.5, 27.8, 35.1],
        'Continuous_Num2': [40.0, 42.3, 38.9, 45.2],
        'Discrete_Num1': [4, 3, 5, 2],
        'Discrete_Num2': [7, 6, 8, 5]}

df = pd.DataFrame(data)

# Step 2: Encode Categorical Columns
label_encoder = LabelEncoder()
df['Category_Ordinal'] = label_encoder.fit_transform(df['Category_Ordinal'])

# One-Hot Encode the nominal column
df = pd.get_dummies(df, columns=['Category_Nominal'], drop_first=True)

# Step 3: Standardize Numerical Columns
scaler = StandardScaler()
numerical_cols = ['Continuous_Num1', 'Continuous_Num2', 'Discrete_Num1', 'Discrete_Num2']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

df

Unnamed: 0,Category_Ordinal,Category_Binary,Continuous_Num1,Continuous_Num2,Discrete_Num1,Discrete_Num2,Category_Nominal_Green,Category_Nominal_Red
0,1,1,-1.215081,-0.662937,0.447214,0.447214,0,1
1,2,0,0.232094,0.290035,-0.447214,-0.447214,0,0
2,0,1,-0.505146,-1.118706,1.341641,1.341641,1,0
3,2,0,1.488133,1.491607,-1.341641,-1.341641,0,1
