In [7]:

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import minmax_scale

from sklearn.multioutput import RegressorChain
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA




In [3]:
train_df = pd.read_csv('../data/ML-CUP20-TR .csv')
test_df = pd.read_csv('../data/ML-CUP20-TS.csv')

# Spilitting the targets and input features
train_x = train_df.iloc[:,:-2].to_numpy()
train_y = train_df.iloc[:,[10,11]].to_numpy()


In [5]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.4, random_state=30)

In [10]:
knn_model = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                              ('pca', PCA(n_components=2)),
                             ('estimator', RegressorChain(KNeighborsRegressor(n_neighbors=4), order=[0,1]))])
et_regressor = Pipeline(steps = [('scale', StandardScaler()),
                                 ('pca', PCA(n_components=2)),
                             ('estimator', RegressorChain(ExtraTreeRegressor(random_state=0), order=[0,1]))])
dt_regressor = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                                 ('pca', PCA(n_components=2)),
                             ('estimator', RegressorChain(DecisionTreeRegressor(random_state=0), order=[0,1]))])
direct_svr = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                               ('pca', PCA(n_components=2)),
                             ('estimator', MultiOutputRegressor(SVR(kernel='rbf',C=0.1, epsilon=0.1)))])
chin_svr = Pipeline(steps = [('scale', StandardScaler(with_mean=True, with_std=True)),
                             ('pca', PCA(n_components=2)),
                             ('estimator', RegressorChain(SVR(kernel='rbf',C=0.1, epsilon=0.1)))])


In [17]:
knn_params={
          'estimator__base_estimator__n_neighbors':[5,6,7,8,9,10],    
          'estimator__base_estimator__leaf_size':[1,2,3,5],
          'estimator__base_estimator__weights':['uniform', 'distance'],
          'estimator__base_estimator__algorithm':['auto', 'ball_tree','kd_tree','brute']
           }

er_params={   'estimator__base_estimator__criterion': ['mse', 'mae','poisson','friedman_mse'], 
              'estimator__base_estimator__min_samples_split': [10, 20, 40,'best'],
              'estimator__base_estimator__max_depth': [2, 6, 8],
              'estimator__base_estimator__min_samples_leaf': [20, 40, 100],
              'estimator__base_estimator__max_leaf_nodes': [5, 20, 100],
          }

dt_params = {
    'estimator__base_estimator__criterion': ['mse', 'mae','poisson','friedman_mse'],
    'estimator__base_estimator__min_samples_split': [10, 20, 40,'best'],
    'estimator__base_estimator__max_depth': [2, 6, 8],
    'estimator__base_estimator__min_samples_leaf': [20, 40, 100],
    'estimator__base_estimator__max_leaf_nodes': [5, 20, 100]
          }

dsvr_params={
    'estimator__estimator__kernel': ["rbf"],
    'estimator__estimator__C': np.logspace(-3, 2, 6).tolist(),
    'estimator__estimator__gamma':[0.0001, 0.001, 0.01, 0.1],
    'estimator__estimator__epsilon': np.logspace(-3, 2, 6).tolist()}
chinesvr_params={
    'estimator__base_estimator__kernel': ["rbf"],
    'estimator__base_estimator__gamma': [0.0001, 0.001, 0.01, 0.1],
    'estimator__base_estimator__C': np.logspace(-3, 2, 6,8).tolist(),
    'estimator__base_estimator__epsilon': np.logspace(-3, 2, 6,8).tolist()}

In [18]:
parameters_list=[knn_params, er_params, dt_params, dsvr_params,chinesvr_params]
model_log=["_knn", "_et","_dt", "_dsrv", "_chinSvr"]
regression_models = [knn_model,et_regressor,dt_regressor,direct_svr,chin_svr]

In [19]:
for i in range(len(regression_models)):
    Grid=GridSearchCV(estimator=regression_models[i], param_grid=parameters_list[i], scoring='neg_mean_squared_error',
                      n_jobs=-1, cv=10, verbose=3).fit(x_train, y_train)
    y_pred = Grid.predict(x_test)
    print('Best params: %s' % Grid.best_params_)
    print('Best training accuracy: %.3f' % Grid.best_score_)
    print('Test set accuracy score for best params: %.3f ' % r2_score(y_test, y_pred))

      

Fitting 10 folds for each of 192 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1883 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed:    4.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s


Best params: {'estimator__base_estimator__algorithm': 'auto', 'estimator__base_estimator__leaf_size': 1, 'estimator__base_estimator__n_neighbors': 10, 'estimator__base_estimator__weights': 'uniform'}
Best training accuracy: -12.484
Test set accuracy score for best params: 0.916 
Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Done 1008 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 3568 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed:    9.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s


Best params: {'estimator__base_estimator__criterion': 'mse', 'estimator__base_estimator__max_depth': 6, 'estimator__base_estimator__max_leaf_nodes': 20, 'estimator__base_estimator__min_samples_leaf': 20, 'estimator__base_estimator__min_samples_split': 10}
Best training accuracy: -26.676
Test set accuracy score for best params: 0.809 
Fitting 10 folds for each of 432 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1904 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 3696 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed:   11.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best params: {'estimator__base_estimator__criterion': 'mse', 'estimator__base_estimator__max_depth': 8, 'estimator__base_estimator__max_leaf_nodes': 100, 'estimator__base_estimator__min_samples_leaf': 20, 'estimator__base_estimator__min_samples_split': 10}
Best training accuracy: -14.228
Test set accuracy score for best params: 0.903 
Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 816 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 1425 out of 1440 | elapsed:   15.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:   15.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best params: {'estimator__estimator__C': 10.0, 'estimator__estimator__epsilon': 1.0, 'estimator__estimator__gamma': 0.1, 'estimator__estimator__kernel': 'rbf'}
Best training accuracy: -11.804
Test set accuracy score for best params: 0.921 
Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1243 tasks      | elapsed:   13.9s


Best params: {'estimator__base_estimator__C': 100.0, 'estimator__base_estimator__epsilon': 1.0, 'estimator__base_estimator__gamma': 0.1, 'estimator__base_estimator__kernel': 'rbf'}
Best training accuracy: -12.073
Test set accuracy score for best params: 0.919 


[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:   18.3s finished
