#### This notebook uses the California housing dataset to build scikit-learn pipelines and compare three ML regression models

In [18]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Fetch the California housing data (feature matrix plus median house value).
california = fetch_california_housing()
print("California Housing dataset shape:", california.data.shape)

# Build the frame in one chain: attach the target, then put the two geographic
# columns first and the target last, since later cells slice by position.
df = (
    pd.DataFrame(california.data, columns=california.feature_names)
    .assign(MedHouseVal=california.target)
    .loc[:, [
        'Latitude', 'Longitude',
        'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
        'MedHouseVal',
    ]]
)

# Fraction of rows used for training; the remainder becomes the test set.
train_size = 0.7
# Fixed random_state keeps the split reproducible across runs.
train_df, test_df = train_test_split(df, train_size=train_size, random_state=42)

df.head()



California Housing dataset shape: (20640, 8)


Unnamed: 0,Latitude,Longitude,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,MedHouseVal
0,37.88,-122.23,8.3252,41.0,6.984127,1.02381,322.0,2.555556,4.526
1,37.86,-122.22,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,3.585
2,37.85,-122.24,7.2574,52.0,8.288136,1.073446,496.0,2.80226,3.521
3,37.85,-122.25,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413
4,37.85,-122.25,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422


In [21]:
def _split_features_target(frame):
    """Return ``(X, y)`` NumPy arrays from a frame whose last column is the target.

    The frame is converted to an array exactly once; ``X`` is every column
    except the last and ``y`` is the last column as a 1-D array.
    """
    values = frame.to_numpy()
    return values[:, :-1], values[:, -1]


# Convert both splits to NumPy; the last column of train_df / test_df is the target.
X_train, y_train = _split_features_target(train_df)
X_test, y_test = _split_features_target(test_df)

In [22]:
# Shapes of the four arrays, in the order: X_train, y_train, X_test, y_test.
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

((14447, 8), (14447,), (6193, 8), (6193,))

In [25]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from copy import deepcopy

# Columns 0-1 are standardised (zero mean, unit variance); statistics come
# from the training split only.
std_scaler = StandardScaler()
std_scaler.fit(X_train[:, :2])

# Every remaining column is rescaled with min-max scaling, again fitted on
# the training split only.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train[:, 2:])

def preprocessor(X):
    """Scale a feature matrix with the already-fitted module-level scalers.

    Columns 0-1 are transformed by ``std_scaler`` and all remaining columns
    by ``min_max_scaler``. The input is never modified; a new array is
    returned.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Features in the same column order the scalers were fitted on.

    Returns
    -------
    numpy.ndarray
        Scaled copy of ``X``. Floating-point inputs keep their dtype; any
        other dtype is promoted to float64.
    """
    X = np.asarray(X)  # accept lists / DataFrames as well as ndarrays
    # Writing scaled floats into an integer copy would silently truncate them,
    # so non-floating input is promoted before the slice assignments below.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
    A = X.astype(out_dtype)  # astype returns a fresh copy by default
    A[:, :2] = std_scaler.transform(X[:, :2])
    A[:, 2:] = min_max_scaler.transform(X[:, 2:])
    return A

In [26]:
# Sanity check: apply the fitted scalers to the held-out features and display the result.
preprocessor(X_test)

array([[ 1.91605896e-01,  2.86691308e-01,  8.14678418e-02, ...,
         2.72279953e-02,  3.89304633e-02,  2.56319369e-03],
       [-2.39167386e-01,  6.20157326e-02,  1.40094619e-01, ...,
         3.39943401e-02,  4.37792539e-02,  1.59940545e-03],
       [ 1.00632928e+00, -1.42583586e+00,  2.05528200e-01, ...,
         3.36933609e-02,  3.66321926e-02,  5.37584539e-04],
       ...,
       [ 1.01569391e+00, -8.46672151e-01,  2.17245279e-01, ...,
         2.87332090e-02,  1.66260265e-01,  2.29769634e-03],
       [-7.02716895e-01,  6.71136182e-01,  1.76004469e-01, ...,
         2.71256724e-02,  1.15277895e-01,  1.54725775e-03],
       [-8.05727898e-01,  7.80977575e-01,  2.50431029e-01, ...,
         2.53786545e-02,  3.40536450e-02,  1.84525247e-03]])

In [27]:
# Wrap the plain function in a FunctionTransformer so it can be used as a pipeline step.
# NOTE(review): the scalers used by `preprocessor` are fitted beforehand on X_train,
# outside any pipeline.
preprocess_transformer = FunctionTransformer(preprocessor)
preprocess_transformer

#### Model 1: Linear Regression Model

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Baseline model: the shared scaling step followed by ordinary least squares.
p1 = Pipeline(steps=[
    ('Scaler', preprocess_transformer),
    ('Linear Regression', LinearRegression()),
])
p1

In [38]:
from sklearn.metrics import mean_absolute_error

def fit_and_print(p, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Fit an estimator and print its mean absolute error on both splits.

    Parameters
    ----------
    p : estimator or Pipeline
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_train, y_train : array-like
        Data used for fitting and for the training error. The defaults are
        the arrays that existed when this function was defined, so re-run
        this cell after changing the split.
    X_test, y_test : array-like
        Data used for the test error; defaults are bound the same way.

    Returns
    -------
    None
        The two errors are printed, not returned.
    """
    p.fit(X_train, y_train)
    train_preds = p.predict(X_train)
    test_preds = p.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the
    # printed values are the same either way, but the documented order keeps
    # this safe if the metric is ever swapped for an asymmetric one.
    print('Training error: ', mean_absolute_error(y_train, train_preds))
    print('Test error: ', mean_absolute_error(y_test, test_preds))

In [39]:
# Fit the linear-regression pipeline and print its train/test mean absolute error.
fit_and_print(p1)

Training error:  0.5309954878872751
Test error:  0.5271979237398966


#### Model 2: KNN regressor

In [42]:
from sklearn.neighbors import KNeighborsRegressor as KNR

In [53]:
# Same scaling step as before, followed by a 7-nearest-neighbour regressor.
p2 = Pipeline(steps=[
    ('Scaler', preprocess_transformer),
    ('K nearest neighbor Regression', KNR(n_neighbors=7)),
])
p2

In [54]:
fit_and_print(p2) # Lower train and test MAE than the linear model in the recorded output

Training error:  0.3040008382363121
Test error:  0.3488044160457659


#### Model 3: XGBoost regressor

In [59]:
# Requires the xgboost package; if it is not installed, run: %pip install xgboost

In [56]:
from xgboost import XGBRegressor as xgbr

In [57]:
# Same scaling step, followed by XGBoost left at its default hyper-parameters.
p3 = Pipeline(steps=[
    ('Scaler', preprocess_transformer),
    ('XGB Regression', xgbr()),
])
p3

In [58]:
fit_and_print(p3) # Lowest test MAE of the three models in the recorded output

Training error:  0.18572162963095376
Test error:  0.31107468896566637


#### Reference code only (not run as part of the analysis above)

In [None]:
# Reference only: an XGBoost regressor with explicitly chosen hyper-parameters
# instead of the defaults. The estimator gets its own name so that `xgbr`
# keeps referring to the XGBRegressor class imported earlier in the notebook.
xgb_tuned = xgbr(
    objective='reg:squarederror',  # or 'reg:linear' if you're using an older version
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.005,
    random_state=42,
    n_jobs=-1,
)

# Fitted on the unscaled training features (no pipeline / preprocessing step here).
xgb_tuned.fit(X_train, y_train)