In [None]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

# Create fake dataset

In [None]:
# Generate a synthetic regression problem. Seeding the generator keeps the
# data (and every metric computed from it) identical across kernel restarts.
X, y = make_regression(n_samples=10000, n_features=10, random_state=42)
# Hold out 20% of the rows as a test set; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step by Step

In [None]:
# The first two objects are transformers; the last one is the predictor.
scaler, pca, lr = StandardScaler(), PCA(), LinearRegression()

In [None]:
# Fit every stage on the training split only, feeding each stage's output
# into the next one: scale -> project -> regress.
x_train_scale = scaler.fit(X_train).transform(X_train)
x_train_pca = pca.fit_transform(x_train_scale)
lr.fit(x_train_pca, y_train)

In [None]:
# Apply the already-fitted transformers to the test split; only `transform`
# is called here, so nothing is re-fitted on test data.
x_test_scale = scaler.transform(X=X_test)
x_test_pca = pca.transform(X=x_test_scale)

In [None]:
# Predict on both splits from their PCA-projected features.
y_train_pred = lr.predict(x_train_pca)
y_test_pred = lr.predict(x_test_pca)

# Pipeline

## Linear Regression

In [None]:
# Scaling, dimensionality reduction and the regressor bundled into a single
# estimator, so one fit/predict call drives all three stages.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('dim_reduction', PCA(n_components=0.95)),
    ('model', LinearRegression()),
])
pipeline.fit(X_train, y_train)

# Predict on the train split first, then on the held-out test split.
prediction_train, prediction_test = (
    pipeline.predict(split) for split in (X_train, X_test)
)

In [None]:
# Mean squared error on each split; a large train/test gap hints at overfitting.
mse_train, mse_test = (
    mean_squared_error(y_train, prediction_train),
    mean_squared_error(y_test, prediction_test),
)

print(f'MSE Train: {mse_train}\nMSE test : {mse_test}')

## Random Forest

In [None]:
# Same preprocessing stages as the linear-regression pipeline, with a random
# forest as the final predictor. The forest is seeded so that the reported
# errors are reproducible under Restart & Run All.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('dim_reduction', PCA(n_components=0.95)),
    ('model', RandomForestRegressor(random_state=42)),
])

pipeline.fit(X_train, y_train)

prediction_train = pipeline.predict(X_train)
prediction_test = pipeline.predict(X_test)

In [None]:
# Score the pipeline fitted in the previous cell on both splits.
mse_train = mean_squared_error(y_true=y_train, y_pred=prediction_train)
mse_test = mean_squared_error(y_true=y_test, y_pred=prediction_test)

print(f'MSE Train: {mse_train}\nMSE test : {mse_test}')

# Hyper-parameter tuning

In [None]:
# Base pipeline for the search. The n_components / n_estimators values given
# here are only starting points: the search overrides them per candidate.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('dim_reduction', PCA(n_components=0.9)),
    ('model', RandomForestRegressor(n_estimators=10, random_state=42)),
])

# Keys follow the `<step name>__<parameter>` convention to address a
# parameter of a specific pipeline step.
search_space = {
    # Explicit values: this notebook never imports numpy, and a float-step
    # arange produces inexact entries such as 0.8500000000000001.
    'dim_reduction__n_components': [0.8, 0.85, 0.9, 0.95],
    'model__max_depth': [5, 10, None],
    'model__n_estimators': [50, 100, 200],
}

# Sample 10 of the 36 possible combinations, scoring each with 3-fold CV on
# all available cores. The seed fixes which combinations get sampled.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=search_space,
    n_iter=10,
    n_jobs=-1,
    cv=3,
    random_state=42,
)
random_search.fit(X_train, y_train)

In [None]:
# Display the cross-validation results stored on the search object.
# NOTE(review): this prints the full results structure; consider showing a
# trimmed summary instead if the output is too long to read.
random_search.cv_results_

In [None]:
# Show the best pipeline found by the search (the attribute is spelled
# `best_estimator_`).
random_search.best_estimator_