This notebook contains the model implementation to validate the results.

The train, test and validation datasets are provided in this repository.
Running the final cell lets the user validate the results for the following models:
1. Ridge
2. Lasso
3. XGBoost
4. Random Forest
5. Extra Trees

## **Creating train, validation and test sets**

In [None]:
# First split: keep 67% of the samples for training and hold out 33%.
X_train, X_test_valid, y_train, y_test_valid = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=666,
)

# Second split: divide the held-out 33% again, 67% of it becoming the test set
# and 33% of it the validation set (about 22% / 11% of all samples).
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test_valid,
    y_test_valid,
    test_size=0.33,
    random_state=666,
)

In [None]:
# Shapes of the training, validation and test feature sets, in that order.
tuple(split.shape for split in (X_train, X_valid, X_test))

In [None]:
def train_evaluate(X_train, X_valid, X_test, y_train, y_valid, y_test, model):
    """
    Fit ``model`` on the training split and print its evaluation metrics.

    The validation split is handed to ``model.fit`` as ``eval_set`` (e.g. for
    early stopping) only when the estimator's ``fit`` signature declares that
    parameter; otherwise the model is fitted on the training data alone.
    If the data needs scaling, perform it before calling this function.

    Parameters
    ----------
    X_train, X_valid, X_test
        Feature sets of the training, validation and test splits.
    y_train, y_valid, y_test
        Targets matching the three feature sets. ``y_test`` must provide a
        ``mean()`` method.
    model
        Estimator exposing ``fit``, ``score`` and ``predict``.

    Returns
    -------
    trained model
        The same ``model`` object that was passed in, after fitting.

    Raises
    ------
    Exception
        Whatever ``model.fit``, ``model.score`` or ``model.predict`` raises;
        fit errors are no longer swallowed and retried without validation data.
    """
    import inspect

    from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, root_mean_squared_error

    banner_width = 60
    print()
    print(banner_width * "-")
    print(str(model).center(banner_width))
    print(banner_width * "-")

    # Decide up front which optional keywords ``fit`` understands instead of
    # calling it and catching TypeError/ValueError: that approach also hid
    # genuine fitting errors and silently refitted without the validation set.
    try:
        fit_parameters = inspect.signature(model.fit).parameters
    except (TypeError, ValueError):
        # Signature not introspectable: fall back to a plain fit.
        fit_parameters = {}

    fit_kwargs = {}
    if "eval_set" in fit_parameters:
        fit_kwargs["eval_set"] = [(X_valid, y_valid)]
        if "verbose" in fit_parameters:
            fit_kwargs["verbose"] = False

    model.fit(X_train, y_train, **fit_kwargs)

    print(f'train_score: {model.score(X_train, y_train):1.4f}',
          f'Valid Score: {model.score(X_valid, y_valid):1.4f}',
          f'Test score: {model.score(X_test, y_test):1.4f}', sep='\n')

    y_pred = model.predict(X_test)
    print("\nTest Set Results\n----------------",
          f"True Mean: {y_test.mean(): 1.4f}",
          f"MAE: {mean_absolute_error(y_test, y_pred): 1.4f}",
          f"MAPE: {mean_absolute_percentage_error(y_test, y_pred): 1.4%}",
          f"RMSE: {root_mean_squared_error(y_test, y_pred): 1.4f}", sep='\n')
    return model

## **Training and testing the model**

In [None]:
# Seed shared by every stochastic estimator below; 666 is the same value the
# train/validation/test split uses for its random_state.
RANDOM_STATE = 666

# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

scaled_splits = (X_train_scaled, X_valid_scaled, X_test_scaled)
raw_splits = (X_train, X_valid, X_test)

ridge = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=5)
lasso = LassoLarsCV(max_n_alphas=10000, max_iter=100000, cv=5)

xgbr = XGBRegressor(early_stopping_rounds=50,
                    learning_rate=0.1,
                    reg_lambda=0.125,
                    max_depth=7,
                    min_child_weight=1,
                    n_estimators=500,
                    n_jobs=-1,
                    objective='reg:squarederror',
                    random_state=RANDOM_STATE, )
# Seed the tree ensembles as well so that re-running the notebook reproduces
# the same scores; without random_state they differ on every run.
rfr = RandomForestRegressor(n_estimators=250, random_state=RANDOM_STATE)
etr = ExtraTreesRegressor(n_estimators=150, random_state=RANDOM_STATE)

# Pair each model explicitly with the feature sets it is trained on: the
# linear models get the standardized features, the tree-based models the raw
# ones. This avoids selecting the data through estimator equality checks.
models_with_features = (
    (ridge, scaled_splits),
    (lasso, scaled_splits),
    (xgbr, raw_splits),
    (rfr, raw_splits),
    (etr, raw_splits),
)

for model, features in models_with_features:
    train_evaluate(*features, y_train, y_valid, y_test, model)