In [1]:

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import minmax_scale

from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score



In [2]:
train_df = pd.read_csv('../data/ML-CUP20-TR .csv')
test_df = pd.read_csv('../data/ML-CUP20-TS.csv')

In [4]:
train_df = train_df.drop(['X1', 'X2','X8'], axis=1)

In [7]:
train_x = train_df.iloc[:,:-2].to_numpy()
train_y = train_df.iloc[:,[7,8]].to_numpy()

In [11]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.4, random_state=30)


In [12]:
knn_model = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                             ('estimator', RegressorChain(KNeighborsRegressor(n_neighbors=4), order=[0,1]))])
et_regressor = Pipeline(steps = [('scale', StandardScaler()),
                             ('estimator', RegressorChain(ExtraTreeRegressor(random_state=0), order=[0,1]))])
dt_regressor = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                             ('estimator', RegressorChain(DecisionTreeRegressor(random_state=0), order=[0,1]))])
direct_svr = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                             ('estimator', MultiOutputRegressor(SVR(kernel='rbf',C=0.1,  gamma='scale',epsilon=0.1)))])
chin_svr = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                             ('estimator', RegressorChain(SVR(kernel='rbf',C=0.1,  gamma='scale',epsilon=0.1)))])


In [13]:
knn_params={
          'estimator__base_estimator__n_neighbors':[5,6,7,8,9,10],    
          'estimator__base_estimator__leaf_size':[1,2,3,5],
          'estimator__base_estimator__weights':['uniform', 'distance'],
          'estimator__base_estimator__algorithm':['auto', 'ball_tree','kd_tree','brute']
           }

er_params={   'estimator__base_estimator__criterion': ['mse', 'mae','poisson','friedman_mse'], 
              'estimator__base_estimator__min_samples_split': [10, 20, 40,'best'],
              'estimator__base_estimator__max_depth': [2, 6, 8],
              'estimator__base_estimator__min_samples_leaf': [20, 40, 100],
              'estimator__base_estimator__max_leaf_nodes': [5, 20, 100],
          }

dt_params = {
    'estimator__base_estimator__criterion': ['mse', 'mae','poisson','friedman_mse'],
    'estimator__base_estimator__min_samples_split': [10, 20, 40,'best'],
    'estimator__base_estimator__max_depth': [2, 6, 8],
    'estimator__base_estimator__min_samples_leaf': [20, 40, 100],
    'estimator__base_estimator__max_leaf_nodes': [5, 20, 100]
          }

dsvr_params={
    'estimator__estimator__kernel': ["rbf"],
    'estimator__estimator__C': np.logspace(-3, 2, 6).tolist(),
    'estimator__estimator__gamma':[0.0001, 0.001, 0.01, 0.1],
    'estimator__estimator__epsilon': np.logspace(-3, 2, 6).tolist()}
chinesvr_params={
    'estimator__base_estimator__kernel': ["rbf"],
    'estimator__base_estimator__gamma': [0.0001, 0.001, 0.01, 0.1],
    'estimator__base_estimator__C': np.logspace(-3, 2, 6,8).tolist(),
    'estimator__base_estimator__epsilon': np.logspace(-3, 2, 6,8).tolist()}

In [14]:
# list of models
regression_models = [knn_model,et_regressor,dt_regressor,direct_svr,chin_svr]
parameters_list=[knn_params, er_params, dt_params, dsvr_params,chinesvr_params]
model_log=["_knn", "_et","_dt", "_dsrv", "_chinSvr"]

In [24]:
for i in range(len(regression_models)):
    Grid=GridSearchCV(estimator=regression_models[i], param_grid=parameters_list[i], scoring='neg_mean_squared_error',
                      n_jobs=-1, cv=10, verbose=3).fit(x_train, y_train)
    print(f"Best parameters:{Grid.best_params_}")
    print(f"Best training accuracy:{Grid.best_score_}")
    print(f"Best model {Grid.best_estimator_}")
    print(f"Test set accuracy score for best params:{r2_score(y_test, y_pred)}" )

      

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:    3.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s


Best parameters:{'estimator__base_estimator__algorithm': 'auto', 'estimator__base_estimator__leaf_size': 1, 'estimator__base_estimator__n_neighbors': 10, 'estimator__base_estimator__weights': 'distance'}
Best training accuracy:-10.824013531698215
Best model Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('estimator',
                 RegressorChain(base_estimator=KNeighborsRegressor(algorithm='auto',
                                                                   leaf_size=1,
                                                                   metric='minkowski',
                                                                   metric_params=None,
                                                                   n_jobs=None,
                                                                   n_neighbors=10,
                                                                   p=2,
              

[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 3568 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed:    7.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s


Best parameters:{'estimator__base_estimator__criterion': 'mse', 'estimator__base_estimator__max_depth': 8, 'estimator__base_estimator__max_leaf_nodes': 100, 'estimator__base_estimator__min_samples_leaf': 20, 'estimator__base_estimator__min_samples_split': 10}
Best training accuracy:-37.277356895367426
Best model Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('estimator',
                 RegressorChain(base_estimator=ExtraTreeRegressor(ccp_alpha=0.0,
                                                                  criterion='mse',
                                                                  max_depth=8,
                                                                  max_features='auto',
                                                                  max_leaf_nodes=100,
                                                                  min_impurity_decrease=0.0,
                       

[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 2128 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed:   14.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best parameters:{'estimator__base_estimator__criterion': 'mae', 'estimator__base_estimator__max_depth': 8, 'estimator__base_estimator__max_leaf_nodes': 100, 'estimator__base_estimator__min_samples_leaf': 20, 'estimator__base_estimator__min_samples_split': 10}
Best training accuracy:-26.31796550221671
Best model Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('estimator',
                 RegressorChain(base_estimator=DecisionTreeRegressor(ccp_alpha=0.0,
                                                                     criterion='mae',
                                                                     max_depth=8,
                                                                     max_features=None,
                                                                     max_leaf_nodes=100,
                                                                     min_impurity_decrease=0.0,
        

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 800 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:   14.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best parameters:{'estimator__estimator__C': 10.0, 'estimator__estimator__epsilon': 0.01, 'estimator__estimator__gamma': 0.1, 'estimator__estimator__kernel': 'rbf'}
Best training accuracy:-10.738873828643552
Best model Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('estimator',
                 MultiOutputRegressor(estimator=SVR(C=10.0, cache_size=200,
                                                    coef0=0.0, degree=3,
                                                    epsilon=0.01, gamma=0.1,
                                                    kernel='rbf', max_iter=-1,
                                                    shrinking=True, tol=0.001,
                                                    verbose=False),
                                      n_jobs=None))],
         verbose=False)
Test set accuracy score for best params:0.9247791024806717
Fitting 10 folds for each of 144 candid

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:   20.5s finished


Best parameters:{'estimator__base_estimator__C': 100.0, 'estimator__base_estimator__epsilon': 0.1, 'estimator__base_estimator__gamma': 0.1, 'estimator__base_estimator__kernel': 'rbf'}
Best training accuracy:-12.01966544589798
Best model Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('estimator',
                 RegressorChain(base_estimator=SVR(C=100.0, cache_size=200,
                                                   coef0=0.0, degree=3,
                                                   epsilon=0.1, gamma=0.1,
                                                   kernel='rbf', max_iter=-1,
                                                   shrinking=True, tol=0.001,
                                                   verbose=False),
                                cv=None, order=None, random_state=None))],
         verbose=False)
Test set accuracy score for best params:0.9247791024806717


In [10]:
train_x[0]

array([ 0.453528, -0.761051, -0.537705,  1.471803, -1.143195,  1.603978,
       -1.399807])