In [1]:
import re
import sys
import glob
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
from time import ctime
from random import randrange
from sklearn.model_selection import KFold
from joblib import Parallel, delayed
import multiprocessing
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
from sklearn import preprocessing
register_matplotlib_converters()
sns.set_style('darkgrid')

In [2]:
from sklearn.linear_model._bayes import ARDRegression
from sklearn.ensemble._weight_boosting import AdaBoostRegressor
from sklearn.linear_model._bayes import BayesianRidge
from sklearn.tree._classes import DecisionTreeRegressor 
from sklearn.linear_model._coordinate_descent import ElasticNetCV
from sklearn.ensemble._forest import ExtraTreesRegressor
from sklearn.gaussian_process._gpr import GaussianProcessRegressor
from sklearn.linear_model._glm.glm import GeneralizedLinearRegressor
from sklearn.ensemble._gb import GradientBoostingRegressor
from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import HistGradientBoostingRegressor
from sklearn.linear_model._huber import HuberRegressor
from sklearn.isotonic import IsotonicRegression 
from sklearn.neighbors._regression import KNeighborsRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model._least_angle import LarsCV
from sklearn.linear_model._coordinate_descent import LassoCV
from sklearn.linear_model._least_angle import LassoLarsCV
from sklearn.linear_model._least_angle import LassoLarsIC
from sklearn.linear_model._base import LinearRegression
from sklearn.svm._classes import LinearSVR
from sklearn.neural_network._multilayer_perceptron import MLPRegressor
from sklearn.svm._classes import NuSVR
from sklearn.linear_model._omp import OrthogonalMatchingPursuitCV
from sklearn.linear_model._passive_aggressive import PassiveAggressiveRegressor
from sklearn.neighbors._regression import RadiusNeighborsRegressor
from sklearn.ensemble._forest import RandomForestRegressor
from sklearn.linear_model._ridge import RidgeCV
from sklearn.linear_model._stochastic_gradient import SGDRegressor
from sklearn.svm._classes import SVR
from sklearn.linear_model._glm.glm import TweedieRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

estimators = [('ard', ARDRegression()),
              ('ada', AdaBoostRegressor()),
              ('brr', BayesianRidge()),
              ('dtr', DecisionTreeRegressor()),
              ('enc', ElasticNetCV()),
              ('etr', ExtraTreesRegressor()),
              ('gpr', GaussianProcessRegressor()),
              ('glr', GeneralizedLinearRegressor()),
              ('gbr', GradientBoostingRegressor()),
              ('hgb', HistGradientBoostingRegressor()),
              ('hur', HuberRegressor()),
              ('knr', KNeighborsRegressor()),
              ('ker', KernelRidge()),
              ('lar', LarsCV()),
              ('las', LassoCV()),
              ('llc', LassoLarsCV()),
              ('lli', LassoLarsIC()),
              ('lir', LinearRegression()),
              ('lsv', LinearSVR(max_iter=100000)),
              ('mlp', MLPRegressor(max_iter=10000)),
              ('nsv', NuSVR(max_iter=100000)),
              ('par', PassiveAggressiveRegressor(max_iter=10000)),
              ('omp', OrthogonalMatchingPursuitCV()),
              ('rfr', RandomForestRegressor()),
              ('sgd', SGDRegressor(max_iter=10000)),
              ('svr', SVR(max_iter=100000)),
              ('twr', TweedieRegressor(max_iter=10000))]

festimators = [('etr', ExtraTreesRegressor()),
              ('gpr', GaussianProcessRegressor()),
              ('gbr', GradientBoostingRegressor()),
              ('hgb', HistGradientBoostingRegressor()),
              ('ker', KernelRidge()),
              ('lir', LinearRegression()),
              ('mlp', MLPRegressor(max_iter=10000)),
              ('nsv', NuSVR(max_iter=100000)),
              ('rfr', RandomForestRegressor()),
              ('svr', SVR(max_iter=100000))]

Y = pd.read_csv("Y.csv")

j = 0
nmodels = 5000

stacked = pd.DataFrame(index=list(range(nmodels)),columns = ['base','meta','score'])
for k in range(nmodels):
    base_models = list()
    meta_name,meta_model = random.choice(festimators)
    base_name,base_model = random.choice(estimators)
    sname = base_name + '.csv'
    base_models.append(('est0',base_model))

    X = pd.read_csv(sname)
    nest = randrange(3,7)
    for i in range(nest):
        base_name,base_model = random.choice(estimators)
        est_string = 'est'+str(i+1)
        base_models.append((est_string,base_model))
        sname = base_name + '.csv'
        X = pd.concat([X, pd.read_csv(sname)],axis=1)
 
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

    model = meta_model
    model.fit(X_train,Y_train.to_numpy().reshape(-1))
    score = model.score(X_test,Y_test.to_numpy().reshape(-1))

    stacked.loc[j]['base'] = base_models
    stacked.loc[j]['meta'] = meta_model
    stacked.loc[j]['score'] = score

    j = j + 1



In [3]:
printdf = stacked.sort_values(by = 'score')

In [9]:
printdf.tail(20) 

Unnamed: 0,base,meta,score
577,"[(est0, LinearSVR(max_iter=100000)), (est1, Ga...",LinearRegression(),0.931331
763,"[(est0, SVR(max_iter=100000)), (est1, Generali...",LinearRegression(),0.931346
2393,"[(est0, KNeighborsRegressor()), (est1, SVR(max...",LinearRegression(),0.931402
373,"[(est0, TweedieRegressor(max_iter=10000)), (es...",LinearRegression(),0.931581
3063,"[(est0, LinearSVR(max_iter=100000)), (est1, Hi...",LinearRegression(),0.931627
1893,"[(est0, ExtraTreesRegressor()), (est1, LinearS...",KernelRidge(),0.931712
4440,"[(est0, SVR(max_iter=100000)), (est1, TweedieR...",LinearRegression(),0.931797
987,"[(est0, SVR(max_iter=100000)), (est1, Generali...",LinearRegression(),0.931929
5,"[(est0, GeneralizedLinearRegressor()), (est1, ...",MLPRegressor(max_iter=10000),0.931975
2668,"[(est0, GradientBoostingRegressor()), (est1, T...",KernelRidge(),0.932239


In [5]:
printdf.to_csv('metadf.csv', index=False)

In [10]:
printdf.iloc[4997]['base']

[('est0', TweedieRegressor(max_iter=10000)),
 ('est1', RandomForestRegressor()),
 ('est2', LinearSVR(max_iter=100000)),
 ('est3', SVR(max_iter=100000)),
 ('est4', DecisionTreeRegressor()),
 ('est5', RandomForestRegressor())]

In [12]:
printdf.iloc[4997]['meta']

LinearRegression()