In [None]:

import numpy as np
import pandas as pd 
from sklearn.preprocessing import minmax_scale
from sklearn.multioutput import RegressorChain
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt





In [None]:
def mean_euclidean_error(y_true, y_pred):
    """
    Mean Euclidean Error (MEE) between targets and predictions.

    Every row is one sample and every column one output dimension. The
    Euclidean distance between matching rows is computed and then averaged
    over all samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_outputs)
        Ground-truth target values.
    y_pred : array-like of shape (n_samples, n_outputs)
        Predicted target values; must have the same shape as ``y_true``.

    Returns
    -------
    float
        Average per-sample Euclidean distance.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or contain no samples.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    # Fail loudly instead of silently returning None on mismatched inputs.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{y_true.shape} and {y_pred.shape}"
        )
    if y_true.shape[0] == 0:
        raise ValueError("mean_euclidean_error requires at least one sample")

    # Flatten everything after the sample axis so any number of outputs works.
    residuals = (y_pred - y_true).reshape(y_true.shape[0], -1)
    distances = np.sqrt(np.sum(np.square(residuals), axis=1))
    return float(np.mean(distances))

In [None]:

# Load the two CSV files from the sibling data directory into DataFrames.
# NOTE(review): the training file name contains a space before ".csv" —
# confirm it matches the file on disk.
train_df = pd.read_csv(filepath_or_buffer='../data/ML-CUP20-TR .csv')
test_df = pd.read_csv(filepath_or_buffer='../data/ML-CUP20-TS.csv')

# Splitting the targets and input features



In [None]:


# Split the frame into input features and regression targets.
# Both slices are taken relative to the end of the frame: every column except
# the last two is a feature and the last two columns are the targets. This
# keeps features and targets disjoint whatever the total column count is
# (hard-coded positions 10 and 11 only line up for a 12-column frame).
# NOTE(review): assumes the two targets are the trailing columns — confirm
# against the CSV layout. Any leading id/index column is still part of train_x.
train_x = train_df.iloc[:, :-2].to_numpy()
train_y = train_df.iloc[:, -2:].to_numpy()


In [None]:
"""split train dataset"""

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.3,
    random_state=30,
)

In [None]:
"""regression models """

# Each model is a two-step pipeline: a StandardScaler named 'scale' followed
# by a multi-output wrapper named 'estimator'. The step names matter because
# the grid-search parameter keys are built from them.

# k-nearest neighbours, outputs chained in the order [0, 1].
knn_model = Pipeline(
    steps=[
        ('scale', StandardScaler(with_mean=True, with_std=True)),
        (
            'estimator',
            RegressorChain(KNeighborsRegressor(n_neighbors=4), order=[0, 1]),
        ),
    ]
)

# Single extremely-randomised tree, outputs chained in the order [0, 1].
et_regressor = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        (
            'estimator',
            RegressorChain(ExtraTreeRegressor(random_state=0), order=[0, 1]),
        ),
    ]
)

# Single decision tree, outputs chained in the order [0, 1].
dt_regressor = Pipeline(
    steps=[
        ('scale', StandardScaler(with_mean=True, with_std=True)),
        (
            'estimator',
            RegressorChain(DecisionTreeRegressor(random_state=0), order=[0, 1]),
        ),
    ]
)

# RBF support-vector regression wrapped in MultiOutputRegressor.
direct_svr = Pipeline(
    steps=[
        ('scale', StandardScaler(with_mean=True, with_std=True)),
        (
            'estimator',
            MultiOutputRegressor(
                SVR(kernel='rbf', C=0.1, gamma='scale', epsilon=0.1)
            ),
        ),
    ]
)

# RBF support-vector regression wrapped in RegressorChain (no explicit order).
chin_svr = Pipeline(
    steps=[
        ('scale', StandardScaler(with_mean=True, with_std=True)),
        (
            'estimator',
            RegressorChain(
                SVR(kernel='rbf', C=0.1, gamma='scale', epsilon=0.1)
            ),
        ),
    ]
)


In [None]:
"""parametres of models for gridsearch """

# Hyper-parameter grids for GridSearchCV. Keys follow the pipeline nesting:
# '<pipeline step>__<wrapper attribute>__<estimator parameter>'.

# k-nearest neighbours wrapped in RegressorChain.
knn_params = {
    'estimator__base_estimator__n_neighbors': [5, 6, 7, 8, 9, 10],
    'estimator__base_estimator__leaf_size': [1, 2, 3, 5],
    'estimator__base_estimator__weights': ['uniform', 'distance'],
    'estimator__base_estimator__algorithm': ['auto', 'ball_tree',
                                             'kd_tree', 'brute'],
}

# Extra-tree regressor wrapped in RegressorChain.
# 'best' was removed from min_samples_split: that parameter only accepts
# numbers ('best' is a value of the separate `splitter` parameter), so every
# candidate using it failed to fit.
# NOTE(review): 'mse'/'mae' were renamed 'squared_error'/'absolute_error' in
# newer scikit-learn releases, and 'poisson' rejects negative targets —
# confirm against the installed version and the target range.
er_params = {
    'estimator__base_estimator__criterion': ['mse', 'mae', 'poisson',
                                             'friedman_mse'],
    'estimator__base_estimator__min_samples_split': [10, 20, 40],
    'estimator__base_estimator__max_depth': [2, 6, 8],
    'estimator__base_estimator__min_samples_leaf': [20, 40, 100],
    'estimator__base_estimator__max_leaf_nodes': [5, 20, 100],
}

# The decision tree is searched over the same grid as the extra tree; build an
# independent copy so editing one grid later does not affect the other.
dt_params = {key: list(values) for key, values in er_params.items()}

# SVR wrapped in MultiOutputRegressor (wrapper attribute is `estimator`).
# The duplicated 'rbf' entry is dropped because it only repeated identical
# fits. 'precomputed' is dropped because it needs a square kernel matrix as
# input, which the scaled feature matrix is not.
dsvr_params = {
    'estimator__estimator__kernel': ['rbf', 'poly', 'sigmoid'],
    'estimator__estimator__C': np.logspace(-3, 2, 6).tolist(),
    'estimator__estimator__gamma': [0.0001, 0.001, 0.01, 0.1],
    'estimator__estimator__epsilon': np.logspace(-3, 2, 6).tolist(),
}

# SVR wrapped in RegressorChain (wrapper attribute is `base_estimator`).
# The stray fourth positional argument to np.logspace was removed: it is
# `endpoint`, not `base`, so it never changed the values and only misled.
chinesvr_params = {
    'estimator__base_estimator__kernel': ['rbf', 'poly', 'sigmoid'],
    'estimator__base_estimator__gamma': [0.0001, 0.001, 0.01, 0.1],
    'estimator__base_estimator__C': np.logspace(-3, 2, 6).tolist(),
    'estimator__base_estimator__epsilon': np.logspace(-3, 2, 6).tolist(),
}


In [None]:
""" list of regression_models, parameters_list and models name  """

# Three parallel lists: position k of each list refers to the same model
# (pipeline, its search grid, and the label used in plot titles).
regression_models = [
    knn_model,
    et_regressor,
    dt_regressor,
    direct_svr,
    chin_svr,
]
parameters_list = [
    knn_params,
    er_params,
    dt_params,
    dsvr_params,
    chinesvr_params,
]
model_log = [
    'KNeighborsRegressor',
    'ExtraTreeRegressor',
    'DecisionTreeRegressor',
    'MultiOutputRegressor',
    'RegressorChain',
]

In [16]:

""" Gridsearch and Learning curve"""

# For every candidate model: run a 10-fold grid search on the training split,
# report scores on the held-out split, then plot a learning curve for the
# best configuration found.
for model, param_grid, model_name in zip(regression_models, parameters_list,
                                         model_log):
    # Exhaustive search; predict() below uses the best candidate found.
    Grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        n_jobs=-1,
        cv=10,
        verbose=3,
    ).fit(x_train, y_train)
    y_pred = Grid.predict(x_test)
    # Full cross-validation table, kept for inspection after the loop.
    df_result = pd.DataFrame(Grid.cv_results_)
    train_sizes, train_scores, test_scores = learning_curve(
        Grid.best_estimator_, x_train, y_train, cv=10, n_jobs=5)

    # No custom scorer is passed, so best_score_ is the estimator's default
    # regression score (R^2), not an accuracy; the labels say so.
    print(f"Best parameters:{Grid.best_params_}")
    print(f"Best cross-validation R2 score:{Grid.best_score_}")
    print(f"Test dataset R2 score for best params:{r2_score(y_test, y_pred)}")
    print(f"Mean_euclidean_error for test dataset :{mean_euclidean_error(y_test, y_pred)}")
    print(f"Best model :{Grid.best_estimator_}")

    # Aggregate the per-fold scores for each training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    fig, ax = plt.subplots(1, figsize=(10, 10))
    ax.grid(linestyle='-', linewidth=0.5, color='black')
    ax.set_axisbelow(True)
    # Shaded bands show +/- one standard deviation across the folds.
    ax.fill_between(train_sizes,
                    train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std,
                    alpha=0.1, color='r')
    ax.fill_between(train_sizes,
                    test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std,
                    alpha=0.1, color='g')
    ax.plot(train_sizes, train_scores_mean, '--', color='r',
            label='Training score')
    ax.plot(train_sizes, test_scores_mean, 'o-', color='g',
            label='Cross-validation score')
    ax.set_title(f"Learning curve({model_name})")
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Score')
    # Legend location strings are lowercase; 'Best' is not a valid value.
    ax.legend(loc='best')
    fig.tight_layout()
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)



Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


TypeError: mean_euclidean_error() takes 2 positional arguments but 3 were given

In [9]:
# Inspect the parameter names exposed by the two multi-output wrappers; these
# names are what the grid-search keys must use.
# Only the last expression of a cell is echoed, so the first dictionary is
# printed explicitly instead of being silently discarded.
print(RegressorChain(SVR()).get_params())
MultiOutputRegressor(SVR()).get_params()

{'estimator__C': 1.0,
 'estimator__cache_size': 200,
 'estimator__coef0': 0.0,
 'estimator__degree': 3,
 'estimator__epsilon': 0.1,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 'n_jobs': None}