# LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so prefer the new name and only fall back to
# the legacy one on older Matplotlib releases.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

import pickle
import warnings
warnings.filterwarnings('ignore')

# Typical scikit-learn Workflow

In [None]:
# Load the heart-disease CSV directly from the course repository (requires network access).
heart_disease = pd.read_csv('https://raw.githubusercontent.com/tronghieu2810/MACHINE-LEARNING/main/UDM_ZTM_ML/data/heart-disease.csv')
# Bare expression: rendered as a table when this runs as a notebook cell.
heart_disease

In [None]:
# Features: every column except the label column.
X = heart_disease.drop(columns='target')
# Labels: the 'target' column on its own.
y = heart_disease.loc[:, 'target']

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest classifier made of 100 trees; not fitted yet.
clf = RandomForestClassifier(n_estimators=100)

# Inspect the hyperparameters the estimator was constructed with.
clf.get_params()

In [None]:
# Hold out 20% of the rows for testing; no seed is set in this cell, so the
# split is not reproducible on its own.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)
# Train on the remaining rows (trailing ';' hides the estimator repr in a notebook).
clf.fit(X=X_train, y=y_train);

In [None]:
# Predict a label for every held-out row and show the raw predictions.
y_preds = clf.predict(X_test)
print(y_preds)

In [None]:
# Score on the data the model was fitted on, then on the held-out data,
# so the two numbers can be compared.
print(clf.score(X_train, y_train))
print(clf.score(X_test,  y_test))

In [None]:
# Three views of the same test-set predictions: the classification report,
# the confusion matrix, and plain accuracy.
print(classification_report(y_test, y_preds), '\n')
print(confusion_matrix(y_test, y_preds), '\n')
print(accuracy_score(y_test, y_preds))

In [None]:
np.random.seed(42)

# Retrain with 10, 20, ..., 90 trees and report test accuracy for each size.
# `clf` ends up bound to the last (90-tree) model.
for i in range(10, 100, 10):
    print(f'Trying model with {i} estimators ...')
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}')
    print('')

In [None]:
# Persist the current classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises (the bare open() never closed it).
with open('random_forest_model_1.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the saved classifier and confirm it still scores on the test split.
# NOTE: only unpickle files you created yourself; pickle can execute arbitrary code.
with open('random_forest_model_1.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(X_test, y_test)

# Getting Your Data Ready

## Splitting Your Data

In [None]:
heart_disease

In [None]:
X = heart_disease.drop(columns='target')
y = heart_disease.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)
# Shapes, in order: full X, train X, test X, full y, train y, test y.
print(X.shape, X_train.shape, X_test.shape, y.shape, y_train.shape, y_test.shape)

## Convert Data To Numbers

In [None]:
# Load the extended car-sales CSV from the course repository (requires network access).
car_sales = pd.read_csv('https://raw.githubusercontent.com/tronghieu2810/MACHINE-LEARNING/main/UDM_ZTM_ML/data/car-sales-extended.csv')
# Bare expression: rendered as a table when this runs as a notebook cell.
car_sales

In [None]:
# Print the dtype of every column.
print(car_sales.dtypes, '\n')
X_col = car_sales.columns
# Show the columns object, its Python type, and the label at position 1.
print(f'{X_col} - {type(X_col)} - {X_col[1]}')
# print(car_sales.columns)

In [None]:
X = car_sales.drop(columns='Price')
y = car_sales.loc[:, 'Price']

# One-hot encode the listed columns; every other column passes through unchanged.
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, categorical_features)],
    remainder='passthrough',
)

transformed_X = transformer.fit_transform(X)

In [None]:
# Compare the first rows before and after the column transformer.
print(X.head(), '\n')
print(pd.DataFrame(transformed_X).head())

In [None]:
# pandas alternative to OneHotEncoder for the same three columns.
# NOTE(review): by default get_dummies only expands object/string/category
# columns, so 'Doors' stays a single column if it is numeric; confirm intent.
dummies = pd.get_dummies(car_sales[['Make', 'Colour', 'Doors']])
dummies

In [None]:
np.random.seed(42)
(X_train, X_test,
 y_train, y_test) = train_test_split(transformed_X, y, test_size=0.2)
# Fit a default random-forest regressor on the encoded features, then score
# it on the held-out 20%.
model = RandomForestRegressor().fit(X_train, y_train)
model.score(X_test, y_test)

## Handling Missing Values With Pandas

In [None]:
# Load the car-sales variant that contains missing values (requires network access).
car_sales_missing = pd.read_csv('https://raw.githubusercontent.com/tronghieu2810/MACHINE-LEARNING/main/UDM_ZTM_ML/data/car-sales-extended-missing-data.csv')
# Bare expression: rendered as a table when this runs as a notebook cell.
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Fill through the DataFrame itself with one value per column. The previous
# `df[col].fillna(..., inplace=True)` form mutates a temporary Series: pandas
# warns about it, and it no longer updates `df` once Copy-on-Write is enabled.
car_sales_missing.fillna(
    value={
        'Make': 'missing',
        'Colour': 'missing',
        # Mean is computed before any filling, as before.
        'Odometer (KM)': car_sales_missing['Odometer (KM)'].mean(),
        'Doors': 4,
    },
    inplace=True,
)

# Remaining missing values per column ('Price' is not filled here).
car_sales_missing.isna().sum()

In [None]:
# Drop, in place, every row that still has a missing value in any column.
car_sales_missing.dropna(inplace=True)
# Re-count missing values per column after the drop.
car_sales_missing.isna().sum()

In [None]:
X = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

# One-hot encode the listed columns; every other column passes through unchanged.
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([(  'one_hot',
                                    one_hot,
                                    categorical_features)],
                                remainder='passthrough')

# Transform the feature frame only. Fitting on `car_sales_missing` would pass
# the 'Price' target through into the feature matrix (target leakage).
transformed_X = transformer.fit_transform(X)

## Handling Missing Values With Scikit-learn

In [None]:
# Reload the raw file so the scikit-learn imputation starts from unfilled data.
car_sales_missing = pd.read_csv('https://raw.githubusercontent.com/tronghieu2810/MACHINE-LEARNING/main/UDM_ZTM_ML/data/car-sales-extended-missing-data.csv')
car_sales_missing

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Drop, in place, only the rows whose 'Price' is missing; other gaps are kept.
car_sales_missing.dropna(subset=['Price'], inplace=True)
# Re-count missing values per column after the drop.
car_sales_missing.isna().sum()

In [None]:
# Features: everything except 'Price'; labels: 'Price' alone.
X = car_sales_missing.drop(columns='Price')
y = car_sales_missing.loc[:, 'Price']

In [None]:
# Column groups, one per imputation rule.
cat_features = ['Make', 'Colour']
door_feature = ['Doors']
num_features = ['Odometer (KM)']

# Constant 'missing' for categoricals, constant 4 for doors, column mean for numerics.
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)
num_imputer = SimpleImputer(strategy='mean')

imputer = ColumnTransformer(transformers=[
    ('cat_imputer', cat_imputer, cat_features),
    ('door_imputer', door_imputer, door_feature),
    ('num_imputer', num_imputer, num_features),
])

filled_X = imputer.fit_transform(X)
filled_X

In [None]:
# Wrap the imputed array in a DataFrame. The column names are listed in the
# same order the imputer's transformers were declared (Make, Colour, Doors, Odometer).
car_sales_filled = pd.DataFrame(filled_X, columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'])
print(car_sales_filled.head(5), '\n')
# Count remaining missing values per column.
print(car_sales_filled.isna().sum())

In [None]:
# One-hot encode the listed columns of the imputed frame; the rest pass through.
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, categorical_features)],
    remainder='passthrough',
)

transformed_X = transformer.fit_transform(car_sales_filled)

In [None]:
np.random.seed(42)

(X_train, X_test,
 y_train, y_test) = train_test_split(transformed_X, y, test_size=0.2)

# Fit a default random-forest regressor and score it on the held-out 20%.
model = RandomForestRegressor().fit(X_train, y_train)
model.score(X_test, y_test)

# Choosing The Right Model For Your Data

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset bundle (data, target, feature names, ...).
housing = fetch_california_housing()
housing

In [None]:
# Build a DataFrame from the feature array, labelled with the bundled feature names.
housing_df = pd.DataFrame(housing['data'], columns=housing['feature_names'])
housing_df

In [None]:
# Append the regression target as a 'target' column next to the features.
housing_df['target'] = housing['target']
housing_df.head()

In [None]:
from sklearn.linear_model import Ridge

np.random.seed(42)

X = housing_df.drop(columns='target')
y = housing_df.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

# Ridge regression with default settings, scored on the held-out 20%.
model = Ridge().fit(X_train, y_train)
model.score(X_test, y_test)

## Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop('target', axis=1)
# Was `x = ...` (lowercase), which left `y` silently reused from an earlier cell.
y = housing_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Random-forest regressor with default settings, scored on the held-out 20%.
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

## Classification

In [None]:
# Load from the same repository URL used at the top of this file, not from a
# machine-specific Windows path, so this runs on any machine.
heart_disease = pd.read_csv('https://raw.githubusercontent.com/tronghieu2810/MACHINE-LEARNING/main/UDM_ZTM_ML/data/heart-disease.csv')

In [None]:
from sklearn.svm import LinearSVC

np.random.seed(42)

X = heart_disease.drop(columns='target')
y = heart_disease.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

# Linear SVM with the iteration cap raised to 10000, scored on the held-out 20%.
clf = LinearSVC(max_iter=10000).fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop(columns='target')
y = heart_disease.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

# Default random-forest classifier on the same split, for comparison.
clf = RandomForestClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

# Fitting A Model To The Data
# Making Predictions With Our Model

In [None]:
# Predicted labels followed by the true labels, for a side-by-side look.
print(clf.predict(X_test), '\n')
print(np.array(y_test))

In [None]:
y_preds = clf.predict(X_test)
# Fraction of matching labels computed by hand, then via accuracy_score.
print(np.mean(y_preds == y_test))
print(accuracy_score(y_test, y_preds))

## predict() vs predict_proba()

In [None]:
clf.predict_proba(X_test)

In [None]:
clf.predict(X_test)

## Regression

In [None]:
housing_df

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop(columns='target')
y = housing_df.loc[:, 'target']
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

# Fit on the training rows and keep the predictions for the held-out rows.
model = RandomForestRegressor().fit(X_train, y_train)

y_preds = model.predict(X_test)

In [None]:
# First ten predictions and first ten true values, each with the total count.
print(y_preds[:10], len(y_preds), '\n')
print(np.array(y_test[:10]), len(y_test), '\n')
# Mean absolute error between the true values and the predictions.
print(mean_absolute_error(y_test, y_preds))

# Evaluating A Machine Learning Model (Score)

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop(columns='target')
y = heart_disease.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier().fit(X_train, y_train)

# Training-set score first, then held-out score.
print(clf.score(X_train, y_train), '\n')
print(clf.score(X_test, y_test))

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop(columns='target')
y = housing_df.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor().fit(X_train, y_train)

# Training-set score first, then held-out score.
print(model.score(X_train, y_train), '\n')
print(model.score(X_test, y_test))

## Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop(columns='target')
y = heart_disease.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier().fit(X_train, y_train)

# Single hold-out score, followed by the five per-fold cross-validation scores.
print(clf.score(X_test, y_test), '\n')
print(cross_val_score(clf, X, y, cv=5))

In [None]:
# Score from the single train/test split.
clf_single_score = clf.score(X_test, y_test)
# Mean of the five cross-validation fold scores on the full dataset.
clf_cross_val_score = np.mean(cross_val_score(clf, X, y, cv=5))

print(clf_single_score)
print(clf_cross_val_score)

## Accuracy

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

clf = RandomForestClassifier()
# Keep the fold scores under their own name: binding them to `cross_val_score`
# shadowed the imported function, so any later call to it would fail until
# it was re-imported.
cv_scores = cross_val_score(clf, X, y, cv=5)
print(f'Heart Disease Classifier Accuracy: {np.mean(cv_scores) * 100:.2f}%')

## ROC Curve

In [None]:
from sklearn.metrics import roc_curve

# NOTE(review): X_train/X_test are not created in this cell; they come from
# whichever earlier cell last ran a train/test split. Confirm that is the heart-disease split.
clf.fit(X_train, y_train)
# Per-class probabilities for each test row; column 1 is kept as the positive-class score.
y_probs = clf.predict_proba(X_test)
y_probs_positive = y_probs[:, 1]

print(y_probs[:5], y_probs.shape, '\n')
print(y_probs_positive[:5], y_probs_positive.shape)

In [None]:
# False-positive rates, true-positive rates and the thresholds that produce them.
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

fpr, tpr, thresholds

In [None]:
def plot_roc_curve(fpr, tpr):
    """
    - Plots a ROC curve given the False Positive rate (fpr) and True Positive rate (tpr)
    of a model, together with a diagonal "guessing" baseline and the outline of a
    perfect curve, then shows the figure.
    - fpr: sequence of false positive rates (x values).
    - tpr: sequence of true positive rates (y values), same length as fpr.
    - Returns None; the only effect is drawing on the current matplotlib figure.
    """

    # Plot ROC curve
    plt.plot(fpr, tpr, color='orange', label='ROC')

    # Plot line with no predictive power (baseline)
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='Guessing')

    # Plot perfect ROC curve (up the left edge, then along the top); these two
    # lines carry no label, so they do not appear in the legend
    plt.plot([0, 0], [0, 1], color='r', linestyle='--')
    plt.plot([0, 1], [1, 1], color='r', linestyle='--')

    # Customize the plot (y label previously misspelled "Postitive")
    plt.xlabel('False Positive Rate (fpr)')
    plt.ylabel('True Positive Rate (tpr)')
    plt.title('Receiver Operating Characteristic (ROC) curve')
    plt.legend()
    plt.show()

In [None]:
plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the positive-class probabilities.
roc_auc_score(y_test, y_probs_positive)

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Hard label predictions for the test rows, summarised as a confusion matrix.
y_preds = clf.predict(X_test)
print(confusion_matrix(y_test, y_preds))

In [None]:
# Same confusion matrix as a labelled pandas table (column header previously
# misspelled "Precicted").
pd.crosstab(y_test, y_preds, rownames=['Actual Labels'], colnames=['Predicted Labels'])

In [None]:
import sklearn
# Show the installed scikit-learn version.
# NOTE(review): the ConfusionMatrixDisplay class methods used below need a
# recent release (1.0+ per the scikit-learn docs); verify against this output.
sklearn.__version__

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot a confusion matrix by letting the display call clf on the given data.
# NOTE(review): this passes the full X/y, which includes the rows clf was
# fitted on; use X_test/y_test if a held-out evaluation is intended.
ConfusionMatrixDisplay.from_estimator(estimator=clf, X=X, y=y)

In [None]:
# Plot a confusion matrix from the already-computed test-set predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_preds)

## Classification Report

In [None]:
from sklearn.metrics import classification_report

# Text report of per-class metrics for the test-set predictions.
print(classification_report(y_test, y_preds))

# Evaluating A Regression Model

## R2 Score

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop(columns='target')
y = housing_df.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

# Default random-forest regressor; fit() stays the last expression so a
# notebook still renders the fitted estimator.
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
from sklearn.metrics import r2_score

# Fill an array with y_test mean
y_test_mean = np.full(shape=len(y_test), fill_value=y_test.mean())

# Two reference points for R2: always predicting the mean, then predicting
# the true values exactly.
print(r2_score(y_true=y_test, y_pred=y_test_mean), '\n')
print(r2_score(y_true=y_test, y_pred=y_test), '\n')


## MAE

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the regressor's test-set predictions.
y_preds = model.predict(X_test)
mae = mean_absolute_error(y_test, y_preds)
print(mae)

In [None]:
# Predictions followed by the true values.
print(y_preds, '\n')
print(np.array(y_test))

In [None]:
# Recompute MAE by hand: per-row difference, absolute value, then the mean.
df = pd.DataFrame(data={'Actual values':    y_test,
                        'Predicted values': y_preds})
df['Difference'] = df['Predicted values'] - df['Actual values']
print(np.mean(np.absolute(df['Difference'])))

## MSE

In [None]:
from sklearn.metrics import mean_squared_error

y_preds = model.predict(X_test)

# MSE from scikit-learn, then the same quantity recomputed from the
# 'Difference' column of the `df` frame built in the MAE section.
mse = mean_squared_error(y_test, y_preds)
df['Squared Difference'] = np.square(df['Difference'])
squared = np.mean(np.square(df['Difference']))

print(mse, '\n')
print(df.head(), '\n')
print(squared, '\n')

## Cross Validation and Scoring Parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop(columns='target')
y = heart_disease.loc[:, 'target']

clf = RandomForestClassifier()
# scoring=None leaves the choice of metric to the estimator's own score method.
cv_acc = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring=None)

print(cv_acc)
print(f'The cross-validated accuracy is: {np.mean(cv_acc) * 100:.2f}%')

In [None]:
np.random.seed(42)
# Same five-fold run with the accuracy metric requested explicitly.
cv_acc = cross_val_score(clf, X, y, cv=5, scoring='accuracy')

print(cv_acc)
print(f'The cross-validated accuracy is: {np.mean(cv_acc) * 100:.2f}%')

In [None]:
np.random.seed(42)
# Five-fold cross-validation scored by precision.
cv_precision = cross_val_score(clf, X, y, cv=5, scoring='precision')

print(cv_precision)
print(f'The cross-validated precision is: {np.mean(cv_precision) * 100:.2f}%')

In [None]:
np.random.seed(42)
# Five-fold cross-validation scored by recall.
cv_recall = cross_val_score(clf, X, y, cv=5, scoring='recall')

print(cv_recall)
print(f'The cross-validated recall is: {np.mean(cv_recall) * 100:.2f}%')

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop(columns='target')
y = housing_df.loc[:, 'target']

model = RandomForestRegressor()

# Three folds; scoring=None leaves the metric to the regressor's own score method.
cv_r2 = cross_val_score(estimator=model, X=X, y=y, cv=3, scoring=None)
print(np.round(cv_r2, 3))
print(f'The cross-validated r2 is: {np.mean(cv_r2):.2f}')

In [None]:
# Five-fold cross-validation with the negated-MAE scorer.
# NOTE(review): 'neg_mean_absolute_error' yields negated errors, so the value
# printed under the "mae" label is expected to be negative; no seed is set in
# this cell.
cv_mae = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
print(np.round(cv_mae, 3))
print(f'The cross-validated mae is: {np.mean(cv_mae):.2f}')

In [None]:
# Five-fold cross-validation with the negated-MSE scorer.
# NOTE(review): 'neg_mean_squared_error' yields negated errors, so the value
# printed under the "mse" label is expected to be negative; no seed is set in
# this cell.
cv_mse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
print(np.round(cv_mse, 3))
print(f'The cross-validated mse is: {np.mean(cv_mse):.2f}')

## Evaluating A Model With Scikit-learn Functions

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = heart_disease.drop(columns='target')
y = heart_disease.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

# Train, predict on the held-out rows, then print the four metric functions.
clf = RandomForestClassifier().fit(X_train, y_train)
y_preds = clf.predict(X_test)

print('Classifier metrics on the test set')
print(f'Accuracy:  {accuracy_score(y_test, y_preds)  * 100:.2f}%')
print(f'Precision: {precision_score(y_test, y_preds):.2f}')
print(f'Recall:    {recall_score(y_test, y_preds)   :.2f}')
print(f'F1:        {f1_score(y_test, y_preds)       :.2f}')

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)
X = housing_df.drop(columns='target')
y = housing_df.loc[:, 'target']

(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2)

# Train, predict on the held-out rows, then print the three metric functions.
model = RandomForestRegressor().fit(X_train, y_train)
y_preds = model.predict(X_test)

print('Regression metrics on the test set')
print(f'R2 score: {r2_score(y_test, y_preds):.3f}')
print(f'     MAE: {mean_absolute_error(y_test, y_preds):.3f}')
print(f'     MSE: {mean_squared_error(y_test, y_preds):.3f}')


# Tuning Hyperparameters

In [None]:
def evaluate_preds(y_true, y_preds):
    """
    - Compares predicted classification labels (y_preds) with the true labels (y_true).
    - Prints accuracy (as a percentage), precision, recall and F1, and returns the
    same four values rounded to two decimals in a dict keyed by 'Accuracy',
    'Precision', 'Recall' and 'F1'.
    """

    # Evaluate the four metrics in a fixed order: accuracy, precision, recall, F1.
    accuracy, precision, recall, f1 = (
        metric(y_true, y_preds)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f' Accuracy: {accuracy * 100:.2f}%')
    print(f'Precision: {precision:.2f}')
    print(f'   Recall: {recall:.2f}')
    print(f' F1 score: {f1:.2f}')

    return {
        'Accuracy': round(accuracy, 2),
        'Precision': round(precision, 2),
        'Recall': round(recall, 2),
        'F1': round(f1, 2),
    }

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle all rows (frac=1 samples the whole frame).
heart_disease_shuffled = heart_disease.sample(frac=1)

X = heart_disease_shuffled.drop(columns='target')
y = heart_disease_shuffled.loc[:, 'target']

# Positional cut points: first 70% train, next 15% validation, remainder test.
n_rows = len(heart_disease_shuffled)
train_split = round(0.7 * n_rows)
valid_split = round(train_split + 0.15 * n_rows)

X_train, X_valid, X_test = X[:train_split], X[train_split:valid_split], X[valid_split:]
y_train, y_valid, y_test = y[:train_split], y[train_split:valid_split], y[valid_split:]

# Baseline: default hyperparameters, evaluated on the validation slice.
clf = RandomForestClassifier().fit(X_train, y_train)

y_preds = clf.predict(X_valid)
baseline_metrics = evaluate_preds(y_valid, y_preds)

In [None]:
# Second model with n_estimators set explicitly to 100, evaluated on the same
# validation slice as the baseline.
clf_2 = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_preds_2 = clf_2.predict(X_valid)
baseline_metrics_2 = evaluate_preds(y_true=y_valid, y_preds=y_preds_2)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the random forest. 'auto' was dropped from max_features:
# scikit-learn deprecated it in 1.1 and removed it in 1.3, and for classifiers
# it only duplicated 'sqrt'. 'log2' gives the search a genuinely different option.
grid = {'n_estimators':      [10, 100, 200, 500, 1000, 1200],
        'max_depth':         [None, 5, 10, 20, 30],
        'max_features':      ['sqrt', 'log2'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf':  [1, 2, 4]}

np.random.seed(42)

X = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

# Try 10 randomly sampled combinations, each scored with 5-fold cross-validation.
rs_clf = RandomizedSearchCV(    estimator=clf,
                                param_distributions=grid,
                                n_iter=10,
                                cv=5,
                                verbose=2       )

rs_clf.fit(X_train, y_train);

In [None]:
rs_clf.best_params_

In [None]:
# Predict with the fitted randomized search and evaluate on the test split.
rs_y_preds = rs_clf.predict(X_test)
rs_metrics = evaluate_preds(y_test, rs_y_preds)

In [None]:
# Narrowed search space for the exhaustive search. 'auto' was dropped from
# max_features: scikit-learn deprecated it in 1.1 and removed it in 1.3, and
# for classifiers it only duplicated 'sqrt'.
grid_2 = {  'n_estimators':      [100, 200, 500],
            'max_depth':         [None],
            'max_features':      ['sqrt', 'log2'],
            'min_samples_split': [6],
            'min_samples_leaf':  [1, 2]  }

from sklearn.model_selection import GridSearchCV, train_test_split
np.random.seed(42)

X = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

# Every combination in grid_2 is scored with 5-fold cross-validation.
gs_clf = GridSearchCV(  estimator=clf,
                        param_grid=grid_2,
                        cv=5,
                        verbose=2   )

gs_clf.fit(X_train, y_train);

In [None]:
# Predict with the fitted grid search and evaluate on the test split.
gs_y_preds = gs_clf.predict(X_test)
gs_metrics = evaluate_preds(y_test, gs_y_preds)

In [None]:
# One column per model, one row per metric, drawn as a grouped bar chart.
# NOTE(review): 'baseline' and 'clf_2' were evaluated on the validation slice
# while the two searches were evaluated on a different test split, so the bars
# are not strictly like-for-like.
compare_metrics = pd.DataFrame({'baseline': baseline_metrics,
                                'clf_2': baseline_metrics_2,
                                'random search': rs_metrics,
                                'grid search': gs_metrics})
compare_metrics.plot.bar(figsize=(22, 8))

# Saving And Loading A Model

# Putting It All Together

In [None]:
# Load from the repository URL used earlier in this file for the same CSV, not
# from a machine-specific Windows path, so this runs on any machine.
data = pd.read_csv('https://raw.githubusercontent.com/tronghieu2810/MACHINE-LEARNING/main/UDM_ZTM_ML/data/car-sales-extended.csv')

# Quick overview: first rows, column dtypes, missing values per column.
print(data.head(), '\n')
print(data.dtypes, '\n')
print(data.isna().sum(), '\n')

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
np.random.seed(42)

# Load from the repository URL used earlier in this file for the same CSV, not
# from a machine-specific Windows path, so this runs on any machine.
data = pd.read_csv('https://raw.githubusercontent.com/tronghieu2810/MACHINE-LEARNING/main/UDM_ZTM_ML/data/car-sales-extended-missing-data.csv')
# Rows without a label cannot be used for supervised training.
data.dropna(subset=['Price'], inplace=True)

# Categorical columns: fill gaps with 'missing', then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_features = ['Make', 'Colour']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Doors: fill gaps with the constant 4 (left as a number, not encoded).
door_feature = ['Doors']
door_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=4))
])

# Odometer: fill gaps with the column mean.
num_features = ['Odometer (KM)']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',  categorical_transformer, categorical_features),
        ('door', door_transformer,        door_feature),
        ('num',  num_transformer,         num_features)
    ]
)

# Preprocessing and the regressor chained into a single estimator, so the
# imputers and encoder are fitted on training data only.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor())
    ]
)

X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Keys follow the '<step>__<sub-step>__<param>' convention for reaching into the pipeline.
# - 'median' was misspelled 'meadian', which SimpleImputer rejects, so half of
#   the grid could never fit.
# - max_features 'auto' was removed in scikit-learn 1.3; for regressors it
#   meant "use all features", which 1.0 expresses explicitly.
pipe_grid = {   'preprocessor__num__imputer__strategy': ['mean', 'median'],
                'model__n_estimators':                  [100, 1000],
                'model__max_depth':                     [None, 5],
                'model__max_features':                  [1.0],
                'model__min_samples_split':             [2, 4]  }
gs_model = GridSearchCV(estimator=model, param_grid=pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

In [None]:
gs_model.score(X_test, y_test)