In [None]:
import pandas as pd 
import numpy as np

# Load the dataset from the remote CSV; header=0 means the first row holds the
# column names. The untouched frame is kept as `data_raw` so the cleaning step
# below can be re-run without downloading the file again.
data_raw = pd.read_csv('https://raw.githubusercontent.com/mwilchek/Stock-Modeling/master/DJ_NEWS_SENTIMENT_DATA%20eg.csv', header=0)

print('Number of rows before removing rows with missing values: ' + str(data_raw.shape[0]))

# Treat the '?' placeholder as a missing value, then drop every row that has at
# least one missing value. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
data = data_raw.replace('?', np.nan).dropna(how='any')

print('Number of rows after removing rows with missing values: ' + str(data.shape[0]))

# Feature matrix: one row per observation, columns in the order listed here
x = data[['Open', 'High', 'Low', 'Cycle_Change']].values

# Target vector: the value the regressors are trained to predict
y = data['Close'].values

# Show the first rows of the cleaned frame as the cell output
data.head()

Number of rows before removing rows with missing values: 2007
Number of rows after removing rows with missing values: 2007


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor


# Candidate regressors keyed by a short name; the same keys identify the
# pipelines built below. random_state=0 is pinned on the estimators that are
# constructed with one so repeated runs start from the same seed.
regression_models = {'lr': LinearRegression(n_jobs=-1),
                     'mlp': MLPRegressor(random_state=0),
                     'dt': DecisionTreeRegressor(random_state=0),
                     'rf': RandomForestRegressor(random_state=0, n_jobs=-1),
                     'svr': SVR(max_iter=-1)}

pipe_regrs = {}

# Wrap every regressor in a pipeline that standardizes the features first.
# The loop variable is deliberately not called `regression_models`: reusing
# that name rebinds the dict above to the last estimator once the loop ends.
# The step name 'regr' is the prefix used by the 'regr__*' parameter-grid keys.
for name, regressor in regression_models.items():
    pipe_regrs[name] = Pipeline([('StandardScaler', StandardScaler()), ('regr', regressor)])

# Hyper-parameter search spaces, keyed by the same short names as the
# pipelines. Every key is prefixed with 'regr__' so it is routed to the
# pipeline step named 'regr'.
# NOTE: the criterion names below follow the scikit-learn >= 1.0 API.
param_grids = {}

# Linear Regression Parameter Options:
# The previous grid searched `normalize` over the strings 'True' and 'False'.
# Both strings are truthy, so the two candidates were the same configuration,
# and the `normalize` parameter no longer exists in scikit-learn >= 1.2.
# The pipeline already standardizes the features, so the intercept is the
# setting searched here, using real booleans.
lr_param_grid = [{'regr__fit_intercept': [True, False]}]

# Add Linear Regression Parameters to dictionary grid
param_grids['lr'] = lr_param_grid

# MLP Parameter Options:
# Regularization strengths 1e-4, 1e-3, ..., 1e4
alpha_range = [10 ** i for i in range(-4, 5)]

# Each hidden_layer_sizes candidate is a tuple with one entry per hidden layer.
# A bare list [30, 30, 30] is read by the grid search as three identical
# single-layer candidates, so the three-layer network is written as one tuple
# and folded into the same grid as the single-layer networks.
mlp_param_grid = [{'regr__hidden_layer_sizes': [(10,), (100,), (200,), (30, 30, 30)],
                   'regr__activation': ['identity', 'logistic', 'tanh', 'relu'],
                   'regr__solver': ['lbfgs', 'sgd', 'adam'],
                   'regr__alpha': alpha_range}]

# Add Multi-layer Perceptron Parameters to dictionary grid
param_grids['mlp'] = mlp_param_grid

# Ranges shared by the two tree-based regressors
tree_min_samples_split = [2, 6, 10, 20, 30, 40, 50]
tree_min_samples_leaf = [1, 6, 10, 20, 30, 40, 50]
# max_features accepts None (all features), 'sqrt', 'log2', or an actual int or
# float value. The strings 'int' and 'float' are type names, not valid values,
# and made every fit using them fail. None replaces 'auto', which meant "all
# features" for regressors and was removed in scikit-learn 1.3.
tree_max_features = [None, 'sqrt', 'log2']

# Decision Tree Regression Parameter Options:
# 'squared_error' and 'absolute_error' are the current names of 'mse' and 'mae'
dt_param_grid = [{'regr__criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
                  'regr__min_samples_split': tree_min_samples_split,
                  'regr__min_samples_leaf': tree_min_samples_leaf,
                  'regr__max_features': tree_max_features}]

# Add Decision Tree Parameters to dictionary grid
param_grids['dt'] = dt_param_grid

# Random Forest Regression Parameter Options:
rf_param_grid = [{'regr__n_estimators': [10, 100, 1000],
                  'regr__criterion': ['squared_error', 'absolute_error'],
                  'regr__min_samples_split': tree_min_samples_split,
                  'regr__min_samples_leaf': tree_min_samples_leaf,
                  'regr__max_features': tree_max_features}]

# Add Random Forest Parameters to dictionary grid
param_grids['rf'] = rf_param_grid

# Support Vector Machine (SVM) Parameter Options:
svr_param_grid = [{'regr__C': [0.01, 0.1, 1, 10, 100],
                   'regr__gamma': [0.01, 0.1, 1, 10, 100],
                   'regr__kernel': ['linear', 'poly', 'rbf', 'sigmoid']}]

# Add SVM Parameters to dictionary grid
param_grids['svr'] = svr_param_grid


In [None]:
from sklearn.model_selection import GridSearchCV

# The list of [best_score_, best_params_, best_estimator_]
# One [best_score_, best_params_, best_estimator_] entry is collected per model
best_score_param_estimators = []

# Scoring options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# Run one grid search per pipeline, using the parameter grid stored under the same key
for name, candidate_pipeline in pipe_regrs.items():
    # Scored by negated mean squared error, so a larger score is better.
    # cv=None leaves the cross-validation strategy at GridSearchCV's default.
    gs = GridSearchCV(
        estimator=candidate_pipeline,
        param_grid=param_grids[name],
        scoring='neg_mean_squared_error',
        n_jobs=1,
        cv=None,
    )

    # Search the grid on the full feature matrix and target vector
    gs = gs.fit(x, y)

    # Keep the winning score, its parameters and the fitted pipeline
    best_score_param_estimators.append(
        [gs.best_score_, gs.best_params_, gs.best_estimator_]
    )

In [1]:
# Rank the collected results from highest to lowest best_score_
best_score_param_estimators = sorted(
    best_score_param_estimators,
    key=lambda entry: entry[0],
    reverse=True,
)

# Report each result as [best_score_, best_params_, regressor type]
for best_score_param_estimator in best_score_param_estimators:
    score, params, fitted_pipeline = best_score_param_estimator[:3]
    # best_estimator_ is a whole pipeline, so only the type of the regressor
    # held in its 'regr' step is printed
    print([score, params, type(fitted_pipeline.named_steps['regr'])], end='\n\n')


NameError: name 'best_score_param_estimators' is not defined