diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..e0b1416 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Marios Michailidis + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..3a83bdc --- /dev/null +++ b/README.md @@ -0,0 +1,90 @@ +## About + +`pystacknet` is a light python version of [StackNet](https://github.com/kaz-Anova/StackNet) which was originally made in Java. + +It supports many of the original features, with some new elements. + + +## Installation + +``` +git clone https://github.com/h2oai/pystacknet +cd pystacknet +python setup.py install +``` + +## New features + +`pystacknet`'s main object is a 2-dimensional list of sklearn type of models. This list defines the StackNet structure. This is the equivalent of [parameters](https://github.com/kaz-Anova/StackNet#parameters-file) in the Java version. A representative example could be: + +``` +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier +from sklearn.linear_model import LogisticRegression + + models=[ + ######## First level ######## + [RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + ExtraTreesClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + LogisticRegression(random_state=1) + ], + ######## Second level ######## + [RandomForestClassifier (n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)] + ] +``` + +`pystacknet` is not as strict as in the `Java` version and can allow `Regressors`, `Classifiers` or even `Transformers` at any level of StackNet. In other words the following could work just fine: + +``` +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier,GradientBoostingRegressor +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.decomposition import PCA + models=[ + + [RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + LogisticRegression(random_state=1), + PCA(n_components=4,random_state=1) + ], + + [RandomForestClassifier (n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)] + + + ] +``` + +**Note** that not all transformers are meaningful in this context and you should use it at your own risk. + + +## Parameters + +A typical usage for classification could be : + +``` +from pystacknet.pystacknet import StackNetClassifier + +model=StackNetClassifier(models, metric="auc", folds=4, + restacking=False,use_retraining=True, use_proba=True, + random_state=12345,n_jobs=1, verbose=1) + +model.fit(x,y) +preds=model.predict_proba(x_test) + + +``` +Where : + + +Command | Explanation +--- | --- +models | List of models. This should be a 2-dimensional list . The first level hould defice the stacking level and each entry is the model. +metric | Can be "auc","logloss","accuracy","f1","matthews" or your own custom metric as long as it implements (ytrue,ypred,sample_weight=) +folds | This can be either integer to define the number of folds used in `StackNet` or an iterable yielding train/test splits. +restacking | True for [restacking](https://github.com/kaz-Anova/StackNet#restacking-mode) else False +use_proba | When evaluating the metric, it will use probabilities instead of class predictions if `use_proba==True` +use_retraining | If `True` it does one model based on the whole training data in order to score the test data. Otherwise it takes the average of all models used in the folds ( however this takes more memory and there is no guarantee that it will work better.) +random_state | WInteger for randomised procedures +n_jobs | Number of models to run in parallel. This is independent of any extra threads allocated + n_jobs | Number of models to run in parallel. This is independent of any extra threads allocated from the selected algorithms. e.g. it is possible to run 4 models in parallel where one is a randomforest that runs on 10 threads (it selected). + verbose | Integer value higher than zero to allow printing at the console. diff --git a/pystacknet/__init__.py b/pystacknet/__init__.py new file mode 100644 index 0000000..99c4176 --- /dev/null +++ b/pystacknet/__init__.py @@ -0,0 +1 @@ +__version__ = '0.0.1' \ No newline at end of file diff --git a/pystacknet/metrics.py b/pystacknet/metrics.py new file mode 100644 index 0000000..becf04e --- /dev/null +++ b/pystacknet/metrics.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Aug 31 18:33:58 2018 + +@author: Marios Michailidis + +metrics and method to check metrics used within StackNet + +""" + +from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score , mean_squared_log_error #regression metrics +from sklearn.metrics import roc_auc_score, log_loss ,accuracy_score, f1_score ,matthews_corrcoef +import numpy as np + +valid_regression_metrics=["rmse","mae","rmsle","r2","mape","smape"] +valid_classification_metrics=["auc","logloss","accuracy","f1","matthews"] + +############ classification metrics ############ + +def auc(y_true, y_pred, sample_weight=None): + return roc_auc_score(y_true, y_pred, sample_weight=sample_weight) + +def logloss(y_true, y_pred, sample_weight=None): + return log_loss(y_true, y_pred, sample_weight=sample_weight) + +def accuracy(y_true, y_pred, sample_weight=None): + return accuracy_score(y_true, y_pred, sample_weight=sample_weight) + +def f1(y_true, y_pred, sample_weight=None): + return f1_score(y_true, y_pred, sample_weight=sample_weight) + +def matthews(y_true, y_pred, sample_weight=None): + return matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight) + +############ regression metrics ############ + +def rmse(y_true, y_pred, sample_weight=None): + return np.sqrt(mean_squared_error(y_true, y_pred, sample_weight=sample_weight)) + +def mae(y_true, y_pred, sample_weight=None): + return mean_absolute_error(y_true, y_pred, sample_weight=sample_weight) + +def rmsle (y_true, y_pred, sample_weight=None): + return np.sqrt(mean_squared_log_error(y_true, y_pred, sample_weight=sample_weight)) + +def r2(y_true, y_pred, sample_weight=None): + return r2_score(y_true, y_pred, sample_weight=sample_weight) + + +def mape(y_true, y_pred, sample_weight=None): + y_true = y_true.ravel() + y_pred = y_pred.ravel() + if sample_weight is not None: + sample_weight = sample_weight.ravel() + eps = 1E-15 + ape = np.abs((y_true - y_pred) / (y_true + eps)) * 100 + ape[y_true == 0] = 0 + return np.average(ape, weights=sample_weight) + + +def smape(y_true, y_pred, sample_weight=None): + + y_true = y_true.ravel() + y_pred = y_pred.ravel() + if sample_weight is not None: + sample_weight = sample_weight.ravel() + eps = 1E-15 + sape = (np.abs(y_true - y_pred) / (0.5 * (np.abs(y_true) + np.abs(y_pred)) + eps)) * 100 + sape[(y_true == 0) & (y_pred == 0)] = 0 + return np.average(sape, weights=sample_weight) + + +""" +metric: string or class that returns a metric given (y_true, y_pred, sample_weight=None) +Curently supported metrics are "rmse","mae","rmsle","r2","mape","smape" +""" + + +def check_regression_metric(metric): + + if type(metric) is type(None): + raise Exception ("metric cannot be None") + if isinstance(metric, str) : + if metric not in valid_regression_metrics: + raise Exception ("The regression metric has to be one of %s " % (", ".join([str(k) for k in valid_regression_metrics]))) + if metric=="rmse": + return rmse,metric + elif metric=="mae": + return mae,metric + elif metric=="rmsle": + return rmsle,metric + elif metric=="r2": + return r2,metric + elif metric=="mape": + return mape,metric + elif metric=="smape": + return smape,metric + else : + raise Exception ("The metric %s is not recognised " % (metric) ) + else : #customer metrics is given + try: + y_true_temp=[[1],[2],[3]] + y_pred_temp=[[2],[1],[3]] + y_true_temp=np.array(y_true_temp) + y_pred_temp=np.array(y_pred_temp) + sample_weight_temp=[1,0.5,1] + metric(y_true_temp,y_pred_temp, sample_weight=sample_weight_temp ) + return metric,"custom" + + except: + raise Exception ("The custom metric has to implement metric(y_true, y_pred, sample_weight=None)" ) + + +""" +metric: string or class that returns a metric given (y_true, y_pred, sample_weight=None) +Curently supported metrics are "rmse","mae","rmsle","r2","mape","smape" +""" + + +def check_classification_metric(metric): + + if type(metric) is type(None): + raise Exception ("metric cannot be None") + if isinstance(metric, str) : + if metric not in valid_classification_metrics: + raise Exception ("The classification metric has to be one of %s " % (", ".join([str(k) for k in valid_classification_metrics]))) + if metric=="auc": + return auc,metric + elif metric=="logloss": + return logloss,metric + elif metric=="accuracy": + return accuracy,metric + elif metric=="r2": + return r2,metric + elif metric=="f1": + return f1,metric + elif metric=="matthews": + return matthews,metric + else : + raise Exception ("The metric %s is not recognised " % (metric) ) + else : #customer metrics is given + try: + y_true_temp=[[1],[0],[1]] + y_pred_temp=[[0.4],[1],[0.2]] + y_true_temp=np.array(y_true_temp) + y_pred_temp=np.array(y_pred_temp) + sample_weight_temp=[1,0.5,1] + metric(y_true_temp,y_pred_temp, sample_weight=sample_weight_temp ) + return metric,"custom" + + except: + raise Exception ("The custom metric has to implement metric(y_true, y_pred, sample_weight=None)" ) + + + + + \ No newline at end of file diff --git a/pystacknet/pystacknet.py b/pystacknet/pystacknet.py new file mode 100644 index 0000000..d23f357 --- /dev/null +++ b/pystacknet/pystacknet.py @@ -0,0 +1,1602 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Aug 30 23:56:58 2018 + +@author: mimar + + +This module will implement StackNet[https://github.com/kaz-Anova/StackNet] , allowing for both Regression and classification. + + +""" + +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix,hstack,vstack ,csc_matrix +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin +from sklearn.base import clone +from pystacknet.metrics import check_regression_metric, check_classification_metric +from sklearn.model_selection import KFold +from sklearn.utils import check_X_y,check_array,check_consistent_length, column_or_1d +import inspect +from sklearn.externals.joblib import delayed,Parallel +import operator +import time +from sklearn.preprocessing import LabelEncoder + +proba_metrics=["auc","logloss"] +non_proba_metrics=["accuracy","f1","matthews"] + +#(estimator, safe=True) + + + +####### methods for paralellism ############ + +def _parallel_build_estimators(estimator, X, y, sample_weight, index): + """Private function used to build a batch of estimators within a job.""" + # Retrieve settings + n_samples, n_features = X.shape + + + if not type(sample_weight) is type (None): + if "sample_weight" in inspect.getfullargspec(estimator.fit).args: + estimator.fit(X, y, sample_weight=sample_weight) + else : + estimator.fit(X) + else: + estimator.fit(X, y) + + return estimator,index + + +def _parallel_predict_proba(estimator, X, index): + + """Private function used to compute (proba-)predictions within a job.""" + + if hasattr(estimator, 'predict_proba') : + predictions = estimator.predict_proba(X) + elif hasattr(estimator, 'predict') : + predictions = estimator.predict(X) + elif hasattr(estimator, 'transform') : + predictions = estimator.transform(X) + else : + raise Exception ("Each model/algorithm needs to implement at least one of ('predict()','predict_proba()' or 'transform()' ") + + if hasattr(estimator, 'predict_proba') and len(predictions.shape)==2 and predictions.shape[1]==2: + predictions=predictions[:,1] + elif len(predictions.shape)==2 and predictions.shape[1]==1: + predictions=predictions[:,0] + + + return predictions,index + +def _parallel_predict_proba_scoring(estimators, X, index): + preds=None + """Private function used to compute (proba-)predictions within a job.""" + for estimator in estimators: + if hasattr(estimator, 'predict_proba') : + predictions = estimator.predict_proba(X) + elif hasattr(estimator, 'predict') : + predictions = estimator.predict(X) + elif hasattr(estimator, 'transform') : + predictions = estimator.transform(X) + else : + raise Exception ("Each model/algorithm needs to implement at least one of ('predict()','predict_proba()' or 'transform()' ") + + if hasattr(estimator, 'predict_proba') and len(predictions.shape)==2 and predictions.shape[1]==2: + predictions=predictions[:,1] + elif len(predictions.shape)==2 and predictions.shape[1]==1: + predictions=predictions[:,0] + + if type(preds) is type(None): + preds=predictions + else : + if predictions.shape!=preds.shape: + + raise Exception (" predictions' shape not equal among estimators within the batch as %d!=%d " % (predictions.shape[1],preds.shape[1])) + + preds+=predictions + preds/=float(len(estimators)) + + return preds,index + + +def _parallel_predict(estimator, X, index): + + """Private function used to compute (proba-)predictions within a job.""" + + if hasattr(estimator, 'predict') : + predictions = estimator.predict(X) + elif hasattr(estimator, 'transform') : + predictions = estimator.transform(X) + else : + raise Exception ("Each model/algorithm needs to implement at least one of ('predict()' or 'transform()' ") + + + return predictions,index + + +def predict_from_broba(probas): + preds=np.zeros(probas.shape[0]) + + if len(probas.shape)==1: + preds[probas>=0.5]=1. + else : + preds=np.argmax(probas, axis=1) + return preds + + + + +########################### Classifier ######################### + +""" + +models: List of models. This should be a 2-dimensional list . The first level hould defice the stacking level and each entry is the model. +metric: Can be "auc","logloss","accuracy","f1","matthews" or your own custom metric as long as it implements (ytrue,ypred,sample_weight=) +folds: This can be either integer to define the number of folds used in StackNet or an iterable yielding train/test splits. +restacking: True for restacking (https://github.com/kaz-Anova/StackNet#restacking-mode) else False +use_proba : When evaluating the metric, it will use probabilities instead of class predictions if use_proba==True +use_retraining : If True it does one model based on the whole training data in order to score the test data. Otherwise it takes the average of all models used in the folds ( however this takes more memory and there is no guarantee that it will work better.) +random_state : Integer for randomised procedures +n_jobs : Number of models to run in parallel. This is independent of any extra threads allocated from the selected algorithms. e.g. it is possible to run 4 models in parallel where one is a randomforest that runs on 10 threads (it selected). +verbose : Integer value higher than zero to allow printing at the console. + +""" + + +class StackNetClassifier(BaseEstimator, ClassifierMixin): + + def __init__(self, models, metric="logloss", folds=3, restacking=False, use_retraining=True, use_proba=True, random_state=12345, n_jobs=1, verbose=0): + + #check models + if type(models) is type(None): + raise Exception("Models cannot be None. It needs to be a list of sklearn type of models ") + if not isinstance(models, list): + raise Exception("Models has to be a list of sklearn type of models ") + for l in range (len(models)): + if not isinstance(models[l], list): + raise Exception("Each element in the models' list has to be a list . In other words a 2-dimensional list is epected. ") + for m in range (len(models[l])): + if not hasattr(models[l][m], 'fit') : + raise Exception("Each model/algorithm needs to implement a 'fit() method ") + + if not hasattr(models[l][m], 'predict_proba') and not hasattr(models[l][m], 'predict') and not hasattr(models[l][m], 'transform') : + raise Exception("Each model/algorithm needs to implement at least one of ('predict()','predict_proba()' or 'transform()' ") + self.models= models + + #check metrics + self.metric,self.metric_name=check_classification_metric(metric) + + #check kfold + if not isinstance(folds, int): + try: + object_iterator = iter(folds) + except TypeError as te: + raise Exception( 'folds is not int nor iterable') + else: + if folds <2: + raise Exception( 'folds must be 2 or more') + + self.folds=folds + #check use_proba + if use_proba not in [True, False]: + raise Exception("use_proba has to be True or False") + + if self.metric_name in non_proba_metrics and use_proba==True: + self.use_proba=False + else : + self.use_proba=use_proba + + self.layer_legths=[] + + #check restacking + + if restacking not in [True, False]: + raise Exception("restacking has to be True (to include previous inputs/layers to current layers in stacking) or False") + self.restacking= restacking + + #check retraining + + if use_retraining not in [True, False]: + raise Exception("use_retraining has to be True or False") + + self.use_retraining= use_retraining + + #check random state + if not isinstance(random_state, int): + raise Exception("random_state has to be int") + self.random_state= random_state + + #check verbose + if not isinstance(verbose, int): + raise Exception("Cerbose has to be int") + + #check verbose + self.n_jobs= n_jobs + if self.n_jobs<=0: + self.n_jobs=-1 + + if not isinstance(n_jobs, int): + raise Exception("n_jobs has to be int") + + self.verbose= verbose + + + + self.n_classes_=None + self.classes_ = None + self.n_features_=None + self.estimators_=None + self._n_samples=None + self._sparse=None + self._label_encoder=None + self._level_dims=None + + + def fit (self, X, y, sample_weight=None): + + start_time = time.time() + + + # Convert data (X is required to be 2d and indexable) + X, y = check_X_y( + X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, + multi_output=True + ) + + if isinstance(X, list): + X=np.array(X) + + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + self._sparse=True + else : + self._sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + + if type(sample_weight) is not type(None): + sample_weight = check_array(sample_weight, ensure_2d=False) + check_consistent_length(y, sample_weight) + + # Remap output + self._n_samples, self.n_features_ = X.shape + self._validate_y(y) + + self._label_encoder=LabelEncoder() + y=self._label_encoder.fit_transform(y) + + + classes = np.unique(y) + #print (classes) + if len(classes)<=1: + raise Exception ("Number of classes must be at least 2, here only %d was given " %(len(classes))) + + self.classes_=classes + self.n_classes_=len(self.classes_) + + if isinstance(self.folds, int) : + indices=KFold( n_splits=self.folds,shuffle=True, random_state=self.random_state).split(y) + + else : + indices=self.folds + + self._level_dims =[] + + previous_input=None #holds previous data for restackng + current_input=X + + self.estimators_=[] + ##start the level training + for level in range (len(self.models)): + start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if self._sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + if self.verbose>0: + print ("Input dinesionality: %d at Level %d " % (current_input.shape[1], level)) + + this_level_models=self.models[level] + + if self.verbose>0: + print ("%d models included in Level %d " % (len(this_level_models), level)) + + + train_oof=None + metrics=[0.0 for k in range(len(this_level_models))] + + + indices=[t for t in indices] + + iter_count=len(indices) + #print ("iter_count",iter_count) + + i=0 + #print (i) + #print (indices) + for train_index, test_index in indices: + + #print ( i, i, i) + metrics_i=[0.0 for k in range(len(this_level_models))] + + X_train, X_cv = current_input[train_index], current_input[test_index] + y_train, y_cv = y[train_index], y[test_index] + w_train,w_cv=None,None + if not type(sample_weight) is type (None): + w_train, w_cv = sample_weight[train_index], sample_weight[test_index] + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + X_train, + y_train, + w_train, d) + for d in range(len(this_level_models))) + + # Reduce + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + if self.use_retraining==False: + fitted_estimators=[t[0] for t in this_level_estimators_] + if i==0: + self.estimators_.append([fitted_estimators]) #add level + else : + self.estimators_[level].append(fitted_estimators) + + #parallel predict + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_predict_proba)( + this_level_estimators_[d][0], + X_cv,d) + for d in range(len(this_level_models))) + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + for d in range (len(this_level_models)): + this_model=this_level_models[d] + if self.use_proba: + if hasattr(this_model, 'predict_proba') : + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + elif self.n_classes_==2 and hasattr(this_model, 'predict'): + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + + + else : + if hasattr(this_model, 'predict_proba') : + preds_transformed=predict_from_broba(predictions_[d]) + metrics_i[d]=self.metric(y_cv,preds_transformed, sample_weight=w_cv) # + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Level %d, fold %d/%d , model %d , %s===%f " % (level, i+1, iter_count, d, self.metric_name, metrics_i[d])) + elif self.n_classes_==2 and hasattr(this_model, 'predict'): + preds_transformed=predict_from_broba(predictions_[d]) + metrics_i[d]=self.metric(y_cv,preds_transformed, sample_weight=w_cv) # + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Level %d, fold %d/%d , model %d , %s===%f " % (level, i+1, iter_count, d, self.metric_name, metrics_i[d])) + + + #concatenate predictions + preds_concat_=np.column_stack( predictions_) + #print ("preds_concat_.shape", preds_concat_.shape) + if type(train_oof) is type(None): + train_oof=np.zeros ( (current_input.shape[0], preds_concat_.shape[1])) + self._level_dims.append(preds_concat_.shape[1]) + + + if self._level_dims[level]!=preds_concat_.shape[1]: + raise Exception ("Output dimensionality among folds is not consistent as %d!=%d " % ( self._level_dims[level],preds_concat_.shape[1])) + train_oof[test_index] = preds_concat_ + print ("=========== end of fold % in level %d ===========" %(i+1,level)) + i+=1 + + metrics=np.array(metrics) + metrics/=float(iter_count) + + if self.verbose>0: + for d in range(len(this_level_models)): + this_model=this_level_models[d] + if hasattr(this_model, 'predict_proba') : + print ("Level %d, model %d , %s===%f " % (level, d, self.metric_name, metrics[d])) + + + #done cv + + if self.use_retraining: + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + current_input, + y, + sample_weight, d) + for d in range(len(this_level_models))) + + + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + fitted_estimators=[t[0] for t in this_level_estimators_] + + self.estimators_.append([fitted_estimators]) #add level + + + previous_input=current_input + current_input=train_oof + if self.verbose>0: + print ("Output dimensionality of level %d is %d " % ( level,current_input.shape[1] )) + + + + end_of_level_time=time.time() + if self.verbose>0: + print ("====================== End of Level %d ======================" % (level)) + print (" level %d lasted %f seconds " % (level,end_of_level_time-start_level_time )) + + end_of_fit_time=time.time() + if self.verbose>0: + + print ("====================== End of fit ======================") + print (" fit() lasted %f seconds " % (end_of_fit_time-start_time )) + + + # fit method that returns all out of fold predictions/outputs for all levels + #each ith entry is a stack of oof predictions for the ith level + + def fit_oof (self, X, y, sample_weight=None): + + start_time = time.time() + + + # Convert data (X is required to be 2d and indexable) + X, y = check_X_y( + X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, + multi_output=True + ) + + if isinstance(X, list): + X=np.array(X) + + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + self._sparse=True + else : + self._sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + + if type(sample_weight) is not type(None): + sample_weight = check_array(sample_weight, ensure_2d=False) + check_consistent_length(y, sample_weight) + + # Remap output + self._n_samples, self.n_features_ = X.shape + self._validate_y(y) + + self._label_encoder=LabelEncoder() + y=self._label_encoder.fit_transform(y) + + out_puts=[] + + classes = np.unique(y) + #print (classes) + if len(classes)<=1: + raise Exception ("Number of classes must be at least 2, here only %d was given " %(len(classes))) + + self.classes_=classes + self.n_classes_=len(self.classes_) + + if isinstance(self.folds, int) : + indices=KFold( n_splits=self.folds,shuffle=True, random_state=self.random_state).split(y) + + else : + indices=self.folds + + self._level_dims =[] + + previous_input=None #holds previous data for restackng + current_input=X + + self.estimators_=[] + ##start the level training + for level in range (len(self.models)): + start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if self._sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + if self.verbose>0: + print ("Input dinesionality: %d at Level %d " % (current_input.shape[1], level)) + + this_level_models=self.models[level] + + if self.verbose>0: + print ("%d models included in Level %d " % (len(this_level_models), level)) + + + train_oof=None + metrics=[0.0 for k in range(len(this_level_models))] + + + indices=[t for t in indices] + + iter_count=len(indices) + #print ("iter_count",iter_count) + + i=0 + #print (i) + #print (indices) + for train_index, test_index in indices: + + #print ( i, i, i) + metrics_i=[0.0 for k in range(len(this_level_models))] + + X_train, X_cv = current_input[train_index], current_input[test_index] + y_train, y_cv = y[train_index], y[test_index] + w_train,w_cv=None,None + if not type(sample_weight) is type (None): + w_train, w_cv = sample_weight[train_index], sample_weight[test_index] + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + X_train, + y_train, + w_train, d) + for d in range(len(this_level_models))) + + # Reduce + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + if self.use_retraining==False: + fitted_estimators=[t[0] for t in this_level_estimators_] + if i==0: + self.estimators_.append([fitted_estimators]) #add level + else : + self.estimators_[level].append(fitted_estimators) + + #parallel predict + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_predict_proba)( + this_level_estimators_[d][0], + X_cv,d) + for d in range(len(this_level_models))) + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + for d in range (len(this_level_models)): + this_model=this_level_models[d] + if self.use_proba: + if hasattr(this_model, 'predict_proba') : + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + elif self.n_classes_==2 and hasattr(this_model, 'predict'): + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + + + else : + if hasattr(this_model, 'predict_proba') : + preds_transformed=predict_from_broba(predictions_[d]) + metrics_i[d]=self.metric(y_cv,preds_transformed, sample_weight=w_cv) # + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Level %d, fold %d/%d , model %d , %s===%f " % (level, i+1, iter_count, d, self.metric_name, metrics_i[d])) + elif self.n_classes_==2 and hasattr(this_model, 'predict'): + preds_transformed=predict_from_broba(predictions_[d]) + metrics_i[d]=self.metric(y_cv,preds_transformed, sample_weight=w_cv) # + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Level %d, fold %d/%d , model %d , %s===%f " % (level, i+1, iter_count, d, self.metric_name, metrics_i[d])) + + + #concatenate predictions + preds_concat_=np.column_stack( predictions_) + + + + #print ("preds_concat_.shape", preds_concat_.shape) + if type(train_oof) is type(None): + train_oof=np.zeros ( (current_input.shape[0], preds_concat_.shape[1])) + self._level_dims.append(preds_concat_.shape[1]) + + + if self._level_dims[level]!=preds_concat_.shape[1]: + raise Exception ("Output dimensionality among folds is not consistent as %d!=%d " % ( self._level_dims[level],preds_concat_.shape[1])) + train_oof[test_index] = preds_concat_ + print ("=========== end of fold % in level %d ===========" %(i+1,level)) + i+=1 + + metrics=np.array(metrics) + metrics/=float(iter_count) + + if self.verbose>0: + for d in range(len(this_level_models)): + this_model=this_level_models[d] + if hasattr(this_model, 'predict_proba') : + print ("Level %d, model %d , %s===%f " % (level, d, self.metric_name, metrics[d])) + + + #done cv + + if self.use_retraining: + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + current_input, + y, + sample_weight, d) + for d in range(len(this_level_models))) + + + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + fitted_estimators=[t[0] for t in this_level_estimators_] + + self.estimators_.append([fitted_estimators]) #add level + + out_puts.append(train_oof) + + previous_input=current_input + current_input=train_oof + if self.verbose>0: + print ("Output dimensionality of level %d is %d " % ( level,current_input.shape[1] )) + + + + end_of_level_time=time.time() + if self.verbose>0: + print ("====================== End of Level %d ======================" % (level)) + print (" level %d lasted %f seconds " % (level,end_of_level_time-start_level_time )) + + end_of_fit_time=time.time() + if self.verbose>0: + + print ("====================== End of fit ======================") + print (" fit() lasted %f seconds " % (end_of_fit_time-start_time )) + + return out_puts + + + def predict_proba (self, X): + + if type(self.n_classes_) is type(None) or self.n_classes_==1: + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.classes_) is type(None) or len(self.classes_)==1: + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.n_features_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.estimators_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._n_samples) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._sparse) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._label_encoder) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._level_dims) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + + if isinstance(X, list): + X=np.array(X) + + predict_sparse=None + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + predict_sparse=True + else : + predict_sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + if X.shape[1]!=self.n_features_: + raise Exception("Input dimensionality of %d is not the same as the trained one with %d " % ( X.shape[1], self.n_features_)) + + + # Remap output + predict_sparse_samples, predict_sparse_n_features_ = X.shape + + previous_input=None #holds previous data for restackng + current_input=X + + ##start the level training + + for level in range (len(self.estimators_)): + #start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if predict_sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + this_level_estimators=self.estimators_[level] + + if self.verbose>0: + print ("%d estimators included in Level %d " % (len(this_level_estimators), level)) + + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_estimators[0])), verbose=0)( + delayed(_parallel_predict_proba_scoring)( + [this_level_estimators[s][d] for s in range (len(this_level_estimators))], + current_input,d) + for d in range(len(this_level_estimators[0]))) + + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + + #concatenate predictions + test_pred=np.column_stack( predictions_) + if test_pred.shape[1]!= self._level_dims[level]: + raise Exception ("Output dimensionality for level %d with %d is not the same as the one during training with %d " %(level,test_pred.shape[1], self._level_dims[level] )) + + previous_input=current_input + current_input=test_pred + + if len(test_pred.shape)==2 and test_pred.shape[1]==1 : + pr=np.zeros( (test_pred.shape[0],2)) + pr[:,1]=test_pred[:,0] + pr[:,0]=1-test_pred[:,0] + test_pred=pr + elif len(test_pred.shape)==1: + pr=np.zeros( (test_pred.shape[0],2)) + pr[:,1]=test_pred + pr[:,0]=1-test_pred + test_pred=pr + return test_pred + + #predicts output up to the specified level + + def predict_up_to(self, X, lev=None): + + if type(self.n_classes_) is type(None) or self.n_classes_==1: + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.classes_) is type(None) or len(self.classes_)==1: + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.n_features_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.estimators_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._n_samples) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._sparse) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._label_encoder) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._level_dims) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + + if isinstance(X, list): + X=np.array(X) + + predict_sparse=None + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + predict_sparse=True + else : + predict_sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + if X.shape[1]!=self.n_features_: + raise Exception("Input dimensionality of %d is not the same as the trained one with %d " % ( X.shape[1], self.n_features_)) + + + # Remap output + predict_sparse_samples, predict_sparse_n_features_ = X.shape + + previous_input=None #holds previous data for restackng + current_input=X + + if type(lev) is type(None): + lev=len(self.estimators_) + + if not isinstance(lev, int): + raise Exception("lev has to be int") + + lev=min(lev,len(self.estimators_) ) + + ##start the level training + + for level in range (len(self.estimators_)): + #start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if predict_sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + this_level_estimators=self.estimators_[level] + + if self.verbose>0: + print ("%d estimators included in Level %d " % (len(this_level_estimators), level)) + + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_estimators[0])), verbose=0)( + delayed(_parallel_predict_proba_scoring)( + [this_level_estimators[s][d] for s in range (len(this_level_estimators))], + current_input,d) + for d in range(len(this_level_estimators[0]))) + + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + + #concatenate predictions + test_pred=np.column_stack( predictions_) + if test_pred.shape[1]!= self._level_dims[level]: + raise Exception ("Output dimensionality for level %d with %d is not the same as the one during training with %d " %(level,test_pred.shape[1], self._level_dims[level] )) + + previous_input=current_input + current_input=test_pred + + if len(test_pred.shape)==2 and test_pred.shape[1]==1 : + pr=np.zeros( (test_pred.shape[0],2)) + pr[:,1]=test_pred[:,0] + pr[:,0]=1-test_pred[:,0] + test_pred=pr + elif len(test_pred.shape)==1: + pr=np.zeros( (test_pred.shape[0],2)) + pr[:,1]=test_pred + pr[:,0]=1-test_pred + test_pred=pr + return test_pred + + + + + def _validate_y(self, y): + if len(y.shape) == 1 or y.shape[1] == 1: + return column_or_1d(y, warn=True) + else: + return y + + + + + + +########################### Regression ######################### + + + +""" + +models: List of models. This should be a 2-dimensional list . The first level hould defice the stacking level and each entry is the model. +metric: Can be "rmse","mae","rmsle","r2","mape","smape" or your own custom metric as long as it implements (ytrue,ypred,sample_weight=) +folds: This can be either integer to define the number of folds used in StackNet or an iterable yielding train/test splits. +restacking: True for restacking (https://github.com/kaz-Anova/StackNet#restacking-mode) else False +use_retraining : If True it does one model based on the whole training data in order to score the test data. Otherwise it takes the average of all models used in the folds ( however this takes more memory and there is no guarantee that it will work better.) +random_state : Integer for randomised procedures +n_jobs : Number of models to run in parallel. This is independent of any extra threads allocated from the selected algorithms. e.g. it is possible to run 4 models in parallel where one is a randomforest that runs on 10 threads (it selected). +verbose : Integer value higher than zero to allow printing at the console. + +""" + + +class StackNetRegressor(BaseEstimator, RegressorMixin): + + def __init__(self, models, metric="rmse", folds=3, restacking=False, use_retraining=True, random_state=12345, n_jobs=1, verbose=0): + + #check models + if type(models) is type(None): + raise Exception("Models cannot be None. It needs to be a list of sklearn type of models ") + if not isinstance(models, list): + raise Exception("Models has to be a list of sklearn type of models ") + for l in range (len(models)): + if not isinstance(models[l], list): + raise Exception("Each element in the models' list has to be a list . In other words a 2-dimensional list is epected. ") + for m in range (len(models[l])): + if not hasattr(models[l][m], 'fit') : + raise Exception("Each model/algorithm needs to implement a 'fit() method ") + + if not hasattr(models[l][m], 'predict_proba') and not hasattr(models[l][m], 'predict') and not hasattr(models[l][m], 'transform') : + raise Exception("Each model/algorithm needs to implement at least one of ('predict()','predict_proba()' or 'transform()' ") + self.models= models + + #check metrics + self.metric,self.metric_name=check_regression_metric(metric) + + #check kfold + if not isinstance(folds, int): + try: + object_iterator = iter(folds) + except TypeError as te: + raise Exception( 'folds is not int nor iterable') + else: + if folds <2: + raise Exception( 'folds must be 2 or more') + + self.folds=folds + + self.layer_legths=[] + + #check restacking + + if restacking not in [True, False]: + raise Exception("restacking has to be True (to include previous inputs/layers to current layers in stacking) or False") + self.restacking= restacking + + #check retraining + + if use_retraining not in [True, False]: + raise Exception("use_retraining has to be True or False") + + self.use_retraining= use_retraining + + #check random state + if not isinstance(random_state, int): + raise Exception("random_state has to be int") + self.random_state= random_state + + #check verbose + if not isinstance(verbose, int): + raise Exception("Cerbose has to be int") + + #check verbose + self.n_jobs= n_jobs + if self.n_jobs<=0: + self.n_jobs=-1 + + if not isinstance(n_jobs, int): + raise Exception("n_jobs has to be int") + + self.verbose= verbose + + self.n_features_=None + self.estimators_=None + self._n_samples=None + self._sparse=None + self._level_dims=None + + + def fit (self, X, y, sample_weight=None): + + start_time = time.time() + + + # Convert data (X is required to be 2d and indexable) + X, y = check_X_y( + X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, + multi_output=True + ) + + if isinstance(X, list): + X=np.array(X) + + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + self._sparse=True + else : + self._sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + + if type(sample_weight) is not type(None): + sample_weight = check_array(sample_weight, ensure_2d=False) + check_consistent_length(y, sample_weight) + + # Remap output + self._n_samples, self.n_features_ = X.shape + self._validate_y(y) + + + + if isinstance(self.folds, int) : + indices=KFold( n_splits=self.folds,shuffle=True, random_state=self.random_state).split(y) + + else : + indices=self.folds + + self._level_dims =[] + + previous_input=None #holds previous data for restackng + current_input=X + + self.estimators_=[] + ##start the level training + for level in range (len(self.models)): + start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if self._sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + if self.verbose>0: + print ("Input dinesionality: %d at Level %d " % (current_input.shape[1], level)) + + this_level_models=self.models[level] + + if self.verbose>0: + print ("%d models included in Level %d " % (len(this_level_models), level)) + + + train_oof=None + metrics=[0.0 for k in range(len(this_level_models))] + + + indices=[t for t in indices] + + iter_count=len(indices) + #print ("iter_count",iter_count) + + i=0 + #print (i) + #print (indices) + for train_index, test_index in indices: + + #print ( i, i, i) + metrics_i=[0.0 for k in range(len(this_level_models))] + + X_train, X_cv = current_input[train_index], current_input[test_index] + y_train, y_cv = y[train_index], y[test_index] + w_train,w_cv=None,None + if not type(sample_weight) is type (None): + w_train, w_cv = sample_weight[train_index], sample_weight[test_index] + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + X_train, + y_train, + w_train, d) + for d in range(len(this_level_models))) + + # Reduce + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + if self.use_retraining==False: + fitted_estimators=[t[0] for t in this_level_estimators_] + if i==0: + self.estimators_.append([fitted_estimators]) #add level + else : + self.estimators_[level].append(fitted_estimators) + + #parallel predict + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_predict_proba)( + this_level_estimators_[d][0], + X_cv,d) + for d in range(len(this_level_models))) + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + for d in range (len(this_level_models)): + this_model=this_level_models[d] + if hasattr(this_model, 'predict') : + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + elif predictions_[d].shape==y_cv.shape : + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + + + #concatenate predictions + preds_concat_=np.column_stack( predictions_) + #print ("preds_concat_.shape", preds_concat_.shape) + if type(train_oof) is type(None): + train_oof=np.zeros ( (current_input.shape[0], preds_concat_.shape[1])) + self._level_dims.append(preds_concat_.shape[1]) + + + if self._level_dims[level]!=preds_concat_.shape[1]: + raise Exception ("Output dimensionality among folds is not consistent as %d!=%d " % ( self._level_dims[level],preds_concat_.shape[1])) + train_oof[test_index] = preds_concat_ + print ("=========== end of fold % in level %d ===========" %(i+1,level)) + i+=1 + + metrics=np.array(metrics) + metrics/=float(iter_count) + + if self.verbose>0: + for d in range(len(this_level_models)): + this_model=this_level_models[d] + if hasattr(this_model, 'predict_proba') : + print ("Level %d, model %d , %s===%f " % (level, d, self.metric_name, metrics[d])) + + + #done cv + + if self.use_retraining: + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + current_input, + y, + sample_weight, d) + for d in range(len(this_level_models))) + + + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + fitted_estimators=[t[0] for t in this_level_estimators_] + + self.estimators_.append([fitted_estimators]) #add level + + + previous_input=current_input + current_input=train_oof + if self.verbose>0: + print ("Output dimensionality of level %d is %d " % ( level,current_input.shape[1] )) + + + + end_of_level_time=time.time() + if self.verbose>0: + print ("====================== End of Level %d ======================" % (level)) + print (" level %d lasted %f seconds " % (level,end_of_level_time-start_level_time )) + + end_of_fit_time=time.time() + if self.verbose>0: + + print ("====================== End of fit ======================") + print (" fit() lasted %f seconds " % (end_of_fit_time-start_time )) + + + # fit method that returns all out of fold predictions/outputs for all levels + #each ith entry is a stack of oof predictions for the ith level + + def fit_oof (self, X, y, sample_weight=None): + + start_time = time.time() + + + # Convert data (X is required to be 2d and indexable) + X, y = check_X_y( + X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, + multi_output=True + ) + + if isinstance(X, list): + X=np.array(X) + + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + self._sparse=True + else : + self._sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + + if type(sample_weight) is not type(None): + sample_weight = check_array(sample_weight, ensure_2d=False) + check_consistent_length(y, sample_weight) + + # Remap output + self._n_samples, self.n_features_ = X.shape + self._validate_y(y) + + + out_puts=[] + + if isinstance(self.folds, int) : + indices=KFold( n_splits=self.folds,shuffle=True, random_state=self.random_state).split(y) + + else : + indices=self.folds + + self._level_dims =[] + + previous_input=None #holds previous data for restackng + current_input=X + + self.estimators_=[] + ##start the level training + for level in range (len(self.models)): + start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if self._sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + if self.verbose>0: + print ("Input dinesionality: %d at Level %d " % (current_input.shape[1], level)) + + this_level_models=self.models[level] + + if self.verbose>0: + print ("%d models included in Level %d " % (len(this_level_models), level)) + + + train_oof=None + metrics=[0.0 for k in range(len(this_level_models))] + + + indices=[t for t in indices] + + iter_count=len(indices) + #print ("iter_count",iter_count) + + i=0 + #print (i) + #print (indices) + for train_index, test_index in indices: + + #print ( i, i, i) + metrics_i=[0.0 for k in range(len(this_level_models))] + + X_train, X_cv = current_input[train_index], current_input[test_index] + y_train, y_cv = y[train_index], y[test_index] + w_train,w_cv=None,None + if not type(sample_weight) is type (None): + w_train, w_cv = sample_weight[train_index], sample_weight[test_index] + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + X_train, + y_train, + w_train, d) + for d in range(len(this_level_models))) + + # Reduce + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + if self.use_retraining==False: + fitted_estimators=[t[0] for t in this_level_estimators_] + if i==0: + self.estimators_.append([fitted_estimators]) #add level + else : + self.estimators_[level].append(fitted_estimators) + + #parallel predict + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_predict_proba)( + this_level_estimators_[d][0], + X_cv,d) + for d in range(len(this_level_models))) + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + for d in range (len(this_level_models)): + this_model=this_level_models[d] + if hasattr(this_model, 'predict') : + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + elif predictions_[d].shape==y_cv.shape : + metrics_i[d]=self.metric(y_cv,predictions_[d], sample_weight=w_cv) + metrics[d]+=metrics_i[d] + if self.verbose>0: + print ("Fold %d/%d , model %d , %s===%f " % (i+1, iter_count, d, self.metric_name, metrics_i[d])) + + + #concatenate predictions + preds_concat_=np.column_stack( predictions_) + #print ("preds_concat_.shape", preds_concat_.shape) + if type(train_oof) is type(None): + train_oof=np.zeros ( (current_input.shape[0], preds_concat_.shape[1])) + self._level_dims.append(preds_concat_.shape[1]) + + + if self._level_dims[level]!=preds_concat_.shape[1]: + raise Exception ("Output dimensionality among folds is not consistent as %d!=%d " % ( self._level_dims[level],preds_concat_.shape[1])) + train_oof[test_index] = preds_concat_ + print ("=========== end of fold % in level %d ===========" %(i+1,level)) + i+=1 + + metrics=np.array(metrics) + metrics/=float(iter_count) + + if self.verbose>0: + for d in range(len(this_level_models)): + this_model=this_level_models[d] + if hasattr(this_model, 'predict_proba') : + print ("Level %d, model %d , %s===%f " % (level, d, self.metric_name, metrics[d])) + + + #done cv + + if self.use_retraining: + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_models)), verbose=0)( + delayed(_parallel_build_estimators)( + clone(this_level_models[d]), + current_input, + y, + sample_weight, d) + for d in range(len(this_level_models))) + + + this_level_estimators_ = [ [t[0],t[1]] for t in all_results] + + this_level_estimators_=sorted(this_level_estimators_, key=operator.itemgetter(1), reverse=False) + + fitted_estimators=[t[0] for t in this_level_estimators_] + + self.estimators_.append([fitted_estimators]) #add level + + out_puts.append(train_oof) + + previous_input=current_input + current_input=train_oof + if self.verbose>0: + print ("Output dimensionality of level %d is %d " % ( level,current_input.shape[1] )) + + + + end_of_level_time=time.time() + if self.verbose>0: + print ("====================== End of Level %d ======================" % (level)) + print (" level %d lasted %f seconds " % (level,end_of_level_time-start_level_time )) + + end_of_fit_time=time.time() + if self.verbose>0: + + print ("====================== End of fit ======================") + print (" fit() lasted %f seconds " % (end_of_fit_time-start_time )) + + return out_puts + + + def predict (self, X): + + + if type(self.n_features_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.estimators_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._n_samples) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._sparse) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._level_dims) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + + if isinstance(X, list): + X=np.array(X) + + predict_sparse=None + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + predict_sparse=True + else : + predict_sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + if X.shape[1]!=self.n_features_: + raise Exception("Input dimensionality of %d is not the same as the trained one with %d " % ( X.shape[1], self.n_features_)) + + + # Remap output + predict_sparse_samples, predict_sparse_n_features_ = X.shape + + previous_input=None #holds previous data for restackng + current_input=X + + ##start the level training + + for level in range (len(self.estimators_)): + #start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if predict_sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + this_level_estimators=self.estimators_[level] + + if self.verbose>0: + print ("%d estimators included in Level %d " % (len(this_level_estimators), level)) + + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_estimators[0])), verbose=0)( + delayed(_parallel_predict_proba_scoring)( + [this_level_estimators[s][d] for s in range (len(this_level_estimators))], + current_input,d) + for d in range(len(this_level_estimators[0]))) + + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + + #concatenate predictions + test_pred=np.column_stack( predictions_) + if test_pred.shape[1]!= self._level_dims[level]: + raise Exception ("Output dimensionality for level %d with %d is not the same as the one during training with %d " %(level,test_pred.shape[1], self._level_dims[level] )) + + previous_input=current_input + current_input=test_pred + + + return test_pred + + #predicts output up to the specified level + + def predict_up_to(self, X, lev=None): + + if type(self.n_features_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self.estimators_) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._n_samples) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._sparse) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + if type(self._level_dims) is type(None) : + raise Exception ("fit() must run successfuly to be able to execute the current method. ") + + if isinstance(X, list): + X=np.array(X) + + predict_sparse=None + if isinstance(X, csr_matrix) or isinstance(X, csc_matrix): + predict_sparse=True + else : + predict_sparse=False + if len(X.shape)==1: + X=X.reshape((X.shape[0],1)) + + if X.shape[1]!=self.n_features_: + raise Exception("Input dimensionality of %d is not the same as the trained one with %d " % ( X.shape[1], self.n_features_)) + + + # Remap output + predict_sparse_samples, predict_sparse_n_features_ = X.shape + + previous_input=None #holds previous data for restackng + current_input=X + + if type(lev) is type(None): + lev=len(self.estimators_) + + if not isinstance(lev, int): + raise Exception("lev has to be int") + + lev=min(lev,len(self.estimators_) ) + + ##start the level training + + for level in range (len(self.estimators_)): + #start_level_time = time.time() + + if self.verbose>0: + print ("====================== Start of Level %d ======================" % (level)) + + if not type(previous_input) is type(None) and self.restacking: + if predict_sparse: + + current_input=csr_matrix(hstack( [csr_matrix(previous_input), csr_matrix(current_input)] )) + else : + + current_input=np.column_stack((previous_input,current_input) ) + + this_level_estimators=self.estimators_[level] + + if self.verbose>0: + print ("%d estimators included in Level %d " % (len(this_level_estimators), level)) + + + + all_results = Parallel(n_jobs=min(self.n_jobs,len(this_level_estimators[0])), verbose=0)( + delayed(_parallel_predict_proba_scoring)( + [this_level_estimators[s][d] for s in range (len(this_level_estimators))], + current_input,d) + for d in range(len(this_level_estimators[0]))) + + this_level_predictions_ = [ [t[0],t[1]] for t in all_results] + + this_level_predictions_=sorted(this_level_predictions_, key=operator.itemgetter(1), reverse=False) + predictions_=[t[0] for t in this_level_predictions_] + + + #concatenate predictions + test_pred=np.column_stack( predictions_) + if test_pred.shape[1]!= self._level_dims[level]: + raise Exception ("Output dimensionality for level %d with %d is not the same as the one during training with %d " %(level,test_pred.shape[1], self._level_dims[level] )) + + previous_input=current_input + current_input=test_pred + + + return test_pred + + + + + def _validate_y(self, y): + if len(y.shape) == 1 or y.shape[1] == 1: + return column_or_1d(y, warn=True) + else: + return y + \ No newline at end of file diff --git a/pystacknet/test/__init__.py b/pystacknet/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pystacknet/test/__pycache__/__init__.cpython-36.pyc b/pystacknet/test/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f83e350 Binary files /dev/null and b/pystacknet/test/__pycache__/__init__.cpython-36.pyc differ diff --git a/pystacknet/test/__pycache__/test_pystacknet.cpython-36-PYTEST.pyc b/pystacknet/test/__pycache__/test_pystacknet.cpython-36-PYTEST.pyc new file mode 100644 index 0000000..44bc5b5 Binary files /dev/null and b/pystacknet/test/__pycache__/test_pystacknet.cpython-36-PYTEST.pyc differ diff --git a/pystacknet/test/test_amazon.py b/pystacknet/test/test_amazon.py new file mode 100644 index 0000000..3458edb --- /dev/null +++ b/pystacknet/test/test_amazon.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Sep 2 23:06:17 2018 + +@author: mimar + +uses the dataset from https://www.kaggle.com/c/amazon-employee-access-challenge/data + +""" +import numpy as np +from pystacknet.pystacknet import StackNetClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn import preprocessing +from lightgbm import LGBMClassifier +from xgboost import XGBClassifier + +def load_data(pat, filename, use_labels=True): + """ + Load data from CSV files and return them as numpy arrays + The use_labels parameter indicates whether one should + read the first column (containing class labels). If false, + return all 0s. + """ + + # load column 1 to 8 (ignore last one) + data = np.loadtxt(open(pat+ filename), delimiter=',', + usecols=range(1, 8), skiprows=1) + if use_labels: + labels = np.loadtxt(open(pat + filename), delimiter=',', + usecols=[0], skiprows=1) + else: + labels = np.zeros(data.shape[0]) + return labels, data + +def save_results(predictions, filename): + """Given a vector of predictions, save results in CSV format.""" + with open(filename, 'w') as f: + f.write("id,ACTION\n") + for i, pred in enumerate(predictions): + f.write("%d,%f\n" % (i + 1, pred)) + + + +def test_pystacknet(): + + + path="" + + + + y, X = load_data(path, 'train.csv') + y_test, X_test = load_data(path, 'test.csv', use_labels=False) + + # === one-hot encoding === # + # we want to encode the category IDs encountered both in + # the training and the test set, so we fit the encoder on both + encoder = preprocessing.OneHotEncoder() + encoder.fit(np.vstack((X, X_test))) + X = encoder.transform(X) # Returns a sparse matrix (see numpy.sparse) + X_test = encoder.transform(X_test) + + + + ##################################################################################### + ############################### CLASSIFICATION ##################################### + ##################################################################################### + + + models=[ + + [LogisticRegression(C=1, random_state=1), + LogisticRegression(C=3, random_state=1), + Ridge(alpha=0.1, random_state=1), + LogisticRegression(penalty="l1", C=1, random_state=1), + XGBClassifier(max_depth=5,learning_rate=0.1, n_estimators=300, objective="binary:logistic", n_jobs=1, booster="gbtree", random_state=1, colsample_bytree=0.4 ), + XGBClassifier(max_depth=5,learning_rate=0.3, reg_lambda=0.1, n_estimators=300, objective="binary:logistic", n_jobs=1, booster="gblinear", random_state=1, colsample_bytree=0.4 ), + XGBClassifier(max_depth=5,learning_rate=0.1, n_estimators=300, objective="rank:pairwise", n_jobs=1, booster="gbtree", random_state=1, colsample_bytree=0.4 ), + LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.01, n_estimators=1000, subsample_for_bin=1000, objective="xentropy", min_split_gain=0.0, min_child_weight=0.01, min_child_samples=10, subsample=0.9, subsample_freq=1, colsample_bytree=0.5, reg_alpha=0.0, reg_lambda=0.0, random_state=1, n_jobs=1) + ], + + [RandomForestClassifier (n_estimators=300, criterion="entropy", max_depth=6, max_features=0.5, random_state=1)] + + + ] + + + ################## proba metric ############################### + + model=StackNetClassifier(models, metric="auc", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(X,y ) + preds=model.predict_proba(X_test)[:,1] + + save_results(preds,path+ "pystacknet_pred.csv") + #print ("auc test 2 , auc %f " % (roc_auc_score(y_test,preds))) + + + +if __name__ == '__main__': + test_pystacknet() \ No newline at end of file diff --git a/pystacknet/test/test_pystacknet.py b/pystacknet/test/test_pystacknet.py new file mode 100644 index 0000000..c7358ad --- /dev/null +++ b/pystacknet/test/test_pystacknet.py @@ -0,0 +1,697 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Sep 2 23:06:17 2018 + +@author: mimar +""" +import numpy as np +from pystacknet.pystacknet import StackNetClassifier,StackNetRegressor +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier,GradientBoostingRegressor +from sklearn.linear_model import LogisticRegression, Ridge +from sklearn.metrics import roc_auc_score, log_loss +from scipy.sparse import csr_matrix +from sklearn.cross_validation import StratifiedKFold +from sklearn.decomposition import PCA +from pystacknet.metrics import rmse,mae + + +X=[ [20000,2,2,1,24,-2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0], +[120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000], +[90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000], +[50000,2,2,1,37,1,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000], +[50000,1,2,1,57,2,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679], +[50000,1,1,2,37,3,0,0,0,0,0,64400,57069,57608,19394,19619,20024,2500,1815,657,1000,1000,800], +[500000,1,1,2,29,4,0,0,0,0,0,367965,412023,445007,542653,483003,473944,55000,40000,38000,20239,13750,13770], +[100000,2,2,2,23,5,-1,-1,0,0,-1,11876,380,601,221,-159,567,380,601,0,581,1687,1542], +[140000,2,3,1,28,6,0,2,0,0,0,11285,14096,12108,12211,11793,3719,3329,0,432,1000,1000,1000], +[20000,1,3,2,35,7,-2,-2,-2,-1,-1,0,0,0,0,13007,13912,0,0,0,13007,1122,0], +[200000,2,3,2,34,8,0,2,0,0,-1,11073,9787,5535,2513,1828,3731,2306,12,50,300,3738,66], +[260000,2,1,2,51,-1,-1,-1,-1,-1,2,12261,21670,9966,8517,22287,13668,21818,9966,8583,22301,0,3640], +[630000,2,2,2,41,-1,0,-1,-1,-1,-1,12137,6500,6500,6500,6500,2870,1000,6500,6500,6500,2870,0], +[70000,1,2,2,30,1,2,2,0,0,2,65802,67369,65701,66782,36137,36894,3200,0,3000,3000,1500,0], +[250000,1,1,2,29,0,0,0,0,0,0,70887,67060,63561,59696,56875,55512,3000,3000,3000,3000,3000,3000], +[50000,2,3,3,23,1,2,0,0,0,0,50614,29173,28116,28771,29531,30211,0,1500,1100,1200,1300,1100], +[20000,1,1,2,24,0,0,2,2,2,2,15376,18010,17428,18338,17905,19104,3200,0,1500,0,1650,0], +[320000,1,1,1,49,0,0,0,-1,-1,-1,253286,246536,194663,70074,5856,195599,10358,10000,75940,20000,195599,50000], +[360000,2,1,1,49,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[180000,2,1,2,29,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[130000,2,3,2,39,0,0,0,0,0,-1,38358,27688,24489,20616,11802,930,3000,1537,1000,2000,930,33764], +[120000,2,2,1,39,-1,-1,-1,-1,-1,-1,316,316,316,0,632,316,316,316,0,632,316,0], +[70000,2,2,2,26,2,0,0,2,2,2,41087,42445,45020,44006,46905,46012,2007,3582,0,3601,0,1820], +[450000,2,1,1,40,-2,-2,-2,-2,-2,-2,5512,19420,1473,560,0,0,19428,1473,560,0,0,1128], +[90000,1,1,2,23,0,0,0,-1,0,0,4744,7070,0,5398,6360,8292,5757,0,5398,1200,2045,2000], +[50000,1,3,2,23,0,0,0,0,0,0,47620,41810,36023,28967,29829,30046,1973,1426,1001,1432,1062,997], +[60000,1,1,2,27,1,-2,-1,-1,-1,-1,-109,-425,259,-57,127,-189,0,1000,0,500,0,1000], +[50000,2,3,2,30,0,0,0,0,0,0,22541,16138,17163,17878,18931,19617,1300,1300,1000,1500,1000,1012], +[50000,2,3,1,47,-1,-1,-1,-1,-1,-1,650,3415,3416,2040,30430,257,3415,3421,2044,30430,257,0], +[50000,1,1,2,26,0,0,0,0,0,0,15329,16575,17496,17907,18375,11400,1500,1500,1000,1000,1600,0], +[230000,2,1,2,27,-1,-1,-1,-1,-1,-1,16646,17265,13266,15339,14307,36923,17270,13281,15339,14307,37292,0], +[50000,1,2,2,33,2,0,0,0,0,0,30518,29618,22102,22734,23217,23680,1718,1500,1000,1000,1000,716], +[100000,1,1,2,32,0,0,0,0,0,0,93036,84071,82880,80958,78703,75589,3023,3511,3302,3204,3200,2504], +[500000,2,2,1,54,-2,-2,-2,-2,-2,-2,10929,4152,22722,7521,71439,8981,4152,22827,7521,71439,981,51582], +[500000,1,1,1,58,-2,-2,-2,-2,-2,-2,13709,5006,31130,3180,0,5293,5006,31178,3180,0,5293,768], +[160000,1,1,2,30,-1,-1,-2,-2,-2,-1,30265,-131,-527,-923,-1488,-1884,131,396,396,565,792,0], +[280000,1,2,1,40,0,0,0,0,0,0,186503,181328,180422,170410,173901,177413,8026,8060,6300,6400,6400,6737], +[60000,2,2,2,22,0,0,0,0,0,-1,15054,9806,11068,6026,-28335,18660,1500,1518,2043,0,47671,617], +[50000,1,1,2,25,1,-1,-1,-2,-2,-2,0,780,0,0,0,0,780,0,0,0,0,0], +[280000,1,1,2,31,-1,-1,2,-1,0,-1,498,9075,4641,9976,17976,9477,9075,0,9976,8000,9525,781], +[360000,1,1,2,33,0,0,0,0,0,0,218668,221296,206895,628699,195969,179224,10000,7000,6000,188840,28000,4000], +[70000,2,1,2,25,0,0,0,0,0,0,67521,66999,63949,63699,64718,65970,3000,4500,4042,2500,2800,2500], +[10000,1,2,2,22,0,0,0,0,0,0,1877,3184,6003,3576,3670,4451,1500,2927,1000,300,1000,500], +[140000,2,2,1,37,0,0,0,0,0,0,59504,61544,62925,64280,67079,69802,3000,3000,3000,4000,4000,3000], +[40000,2,1,2,30,0,0,0,2,0,0,18927,21295,25921,25209,26636,29197,3000,5000,0,2000,3000,0], +[210000,1,1,2,29,-2,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[20000,2,1,2,22,0,0,2,-1,0,0,14028,16484,15800,16341,16675,0,3000,0,16741,334,0,0], +[150000,2,5,2,46,0,0,-1,0,0,-2,4463,3034,1170,1170,0,0,1013,1170,0,0,0,0], +[380000,1,2,2,32,-1,-1,-1,-1,-1,-1,22401,21540,15134,32018,11849,11873,21540,15138,24677,11851,11875,8251], +[20000,1,1,2,24,0,0,0,0,0,0,17447,18479,19476,19865,20480,20063,1318,1315,704,928,912,1069], +[70000,1,3,2,42,1,2,2,2,2,0,37042,36171,38355,39423,38659,39362,0,3100,2000,0,1500,1500], +[100000,2,3,3,43,0,0,0,0,0,0,61559,51163,43824,39619,35762,33258,2000,1606,1500,2000,1500,1000], +[310000,2,2,1,49,-2,-2,-2,-2,-2,-2,13465,7867,7600,11185,3544,464,7875,7600,11185,3544,464,0], +[180000,2,1,2,25,1,2,0,0,0,0,41402,41742,42758,43510,44420,45319,1300,2010,1762,1762,1790,1622], +[150000,2,1,2,29,2,0,0,0,0,0,46224,34993,31434,26518,21042,16540,1600,1718,1049,1500,2000,5000], +[500000,2,1,1,45,-2,-2,-2,-2,-2,-2,1905,3640,162,0,151,2530,3640,162,0,151,2530,0], +[180000,2,3,1,34,0,0,0,-1,-1,-1,16386,15793,8441,7142,-679,8321,8500,1500,7500,679,9000,2000], +[180000,2,2,1,34,0,0,0,0,0,0,175886,173440,172308,168608,132202,129918,8083,7296,5253,4814,4816,3800], +[200000,2,1,2,34,-1,3,2,2,2,2,1587,1098,782,1166,700,1414,0,0,700,0,1200,0], +[400000,2,2,1,29,0,0,0,0,0,0,400134,398857,404205,360199,356656,364089,17000,15029,30000,12000,12000,23000], +[500000,2,3,1,28,0,0,0,0,0,0,22848,23638,18878,14937,13827,15571,1516,1300,1000,1000,2000,2000], +[70000,1,2,1,39,0,0,0,0,0,-1,70800,72060,69938,16518,14096,830,4025,2095,1000,2000,3000,0], +[50000,1,1,2,29,2,2,2,2,2,2,24987,24300,26591,25865,27667,28264,0,2700,0,2225,1200,0], +[50000,2,2,1,46,0,0,0,-2,-2,-2,28718,29166,0,0,0,0,1000,0,0,0,0,0], +[130000,2,2,1,51,-1,-1,-2,-2,-1,-1,99,0,0,0,2353,0,0,0,0,2353,0,0], +[200000,1,1,1,57,-2,-2,-2,-1,2,2,152519,148751,144076,8174,8198,7918,0,0,8222,300,0,1000], +[10000,1,2,1,56,2,2,2,0,0,0,2097,4193,3978,4062,4196,4326,2300,0,150,200,200,160], +[210000,2,1,2,30,2,-1,-1,-1,-1,-1,300,300,1159,2280,300,4250,300,1159,2280,300,4250,909], +[130000,2,3,2,29,1,-2,-2,-1,2,-1,-190,-9850,-9850,10311,10161,7319,0,0,20161,0,7319,13899], +[20000,1,5,2,22,2,0,0,0,0,0,18565,17204,17285,18085,11205,5982,0,1200,1000,500,1000,0], +[80000,1,1,2,31,-1,-1,-1,-1,-1,-1,780,0,390,390,390,390,0,390,390,390,390,390], +[320000,1,2,2,29,2,2,2,2,2,2,58267,59246,60184,58622,62307,63526,2500,2500,0,4800,2400,1600], +[200000,2,2,1,32,-1,-1,-1,-1,2,-1,9076,5787,-684,5247,3848,3151,5818,15,9102,17,3165,1395], +[290000,2,1,2,37,1,-2,-1,-1,-1,-1,0,0,3155,0,2359,0,0,3155,0,2359,0,0], +[340000,1,1,2,32,-1,-1,-1,-1,-1,-1,3048,5550,23337,4291,80153,25820,5713,23453,4314,80552,25949,2016], +[20000,1,2,2,24,0,0,2,0,0,0,14619,17216,16642,16976,17332,18543,2850,0,610,630,1500,0], +[50000,1,3,2,25,-1,0,0,0,0,0,42838,37225,36087,9636,9590,10030,1759,1779,320,500,1000,1000], +[300000,2,1,1,45,-1,-1,-1,-1,-1,-1,291,291,291,291,291,291,291,291,291,291,291,291], +[30000,2,2,2,22,0,0,0,0,0,0,28387,29612,30326,28004,26446,6411,1686,1400,560,3000,1765,0], +[240000,2,2,2,44,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[470000,2,3,3,33,0,0,0,0,0,0,165254,157784,162702,69923,29271,29889,6400,7566,3000,960,1000,3000], +[360000,2,1,2,26,0,0,0,0,0,-1,23411,27796,30400,33100,180000,196,4796,3400,3100,146900,196,2963], +[60000,1,3,2,30,0,0,0,0,0,0,26324,27471,28108,21993,19899,19771,1576,1213,648,768,1140,0], +[400000,2,2,1,44,0,0,2,0,0,0,131595,139060,126819,104430,104990,94058,10700,3,3050,3000,3200,2800], +[50000,2,3,2,49,0,0,0,0,0,0,48909,47863,21489,20414,19342,19482,1676,1302,700,699,849,826], +[160000,1,2,2,33,0,0,0,0,0,0,130028,107808,71934,118418,118407,120418,4400,3547,80000,4500,4800,4500], +[360000,2,1,1,45,-1,-1,2,0,-1,-1,390,1170,780,390,390,390,1170,0,0,390,390,390], +[160000,2,2,2,32,0,0,0,0,0,-1,3826,4751,6604,8604,7072,766,1147,2000,2000,0,766,2303], +[130000,2,1,1,35,0,0,0,-1,-1,-1,81313,117866,17740,1330,7095,1190,40000,5000,1330,7095,1190,2090], +[20000,1,3,2,44,2,2,0,0,0,2,8583,8303,9651,10488,12314,11970,0,1651,1000,2000,0,1500], +[200000,1,1,1,53,2,2,2,2,2,2,138180,140774,142460,144098,147124,149531,6300,5500,5500,5500,5000,5000], +[280000,2,1,2,39,-1,-1,-1,0,0,-2,7524,0,3968,3868,0,0,0,3968,0,0,0,0], +[100000,2,1,2,27,-2,-2,-2,-2,-2,-2,-2000,5555,0,0,0,0,7555,0,0,0,0,0], +[160000,2,2,1,37,-1,-1,-1,-1,-1,-2,880,1602,840,840,0,0,1602,840,840,0,0,7736], +[60000,2,2,2,23,0,0,0,0,0,0,45648,46850,47214,19595,19209,19323,1937,1301,682,690,816,835], +[90000,1,2,2,35,0,0,0,0,0,0,83725,85996,87653,35565,30942,30835,3621,3597,1179,1112,1104,1143], +[360000,1,1,1,43,-1,-1,-1,-1,-1,0,3967,8322,3394,6451,26370,9956,8339,3394,12902,27000,0,68978], +[150000,1,1,2,27,0,0,0,0,0,0,86009,86108,89006,89775,87725,40788,4031,10006,3266,4040,1698,800], +[50000,2,3,1,22,0,0,0,0,0,0,18722,18160,16997,13150,8866,7899,1411,1194,379,281,321,197], +[20000,1,2,1,38,0,0,0,0,0,-1,17973,19367,19559,18240,17928,150,1699,1460,626,1750,150,0], +[140000,1,1,2,32,-2,-2,-2,-2,-2,-2,672,10212,850,415,100,1430,10212,850,415,100,1430,0], +[380000,2,1,2,30,-2,-2,-1,0,0,0,-81,-303,32475,32891,33564,34056,223,33178,1171,1197,1250,5000], +[480000,1,1,1,63,0,0,0,2,2,0,422069,431342,479432,487066,471145,469961,16078,55693,17000,0,18000,24200], +[50000,2,3,2,22,0,0,0,0,0,0,44698,42254,38347,32496,23477,24094,1767,1362,1002,840,995,904], +[60000,2,2,2,26,2,2,2,2,2,0,56685,55208,59175,60218,55447,55305,0,5000,2511,6,3000,3000], +[70000,2,2,2,24,-1,-1,-2,-2,-2,-1,5580,0,0,0,0,26529,0,0,0,0,26529,2000], +[80000,2,2,1,36,-1,-1,-1,-1,-1,-1,6108,2861,3277,3319,1150,1150,2861,3279,3319,1150,1150,1035], +[350000,1,1,2,52,-1,-1,-1,-1,-1,-1,713,2272,722,867,1150,5263,2272,722,867,1150,5263,5011], +[130000,1,2,2,38,0,0,0,-1,-1,-1,171438,178382,39940,120483,44127,126568,10908,0,133657,4566,133841,4796], +[360000,1,2,1,35,1,-2,-2,-2,-2,-2,-103,-103,-103,-103,-103,-103,0,0,0,0,0,0], +[330000,2,1,1,31,0,0,2,0,0,0,105879,108431,105594,105896,106491,107289,9260,0,3593,4100,15794,0], +[50000,1,3,1,47,0,0,2,0,0,0,13244,14722,15181,15928,16671,17393,2000,1000,1000,1000,1000,1000], +[280000,1,2,1,41,2,2,2,2,2,3,135673,138532,134813,144401,152174,149415,6500,0,14254,14850,0,5000], +[100000,2,1,2,24,0,0,0,0,0,0,52128,52692,54477,56076,60100,59713,2000,2677,3076,5080,3000,2033], +[50000,1,2,2,41,0,0,0,0,0,0,19015,19294,20259,20274,20311,19957,1340,1305,700,718,724,684], +[30000,1,1,2,24,-1,2,0,0,3,2,18199,17618,18631,21319,20692,21201,0,1312,3000,0,1000,1000], +[240000,1,1,2,28,-1,-1,-1,-1,-1,-1,326,326,326,5676,476,326,326,326,5676,476,326,526], +[80000,1,2,2,26,2,0,0,0,0,0,14029,15493,16630,17055,17629,18186,2000,1700,1000,1000,1000,1000], +[400000,1,2,1,34,-1,-1,-1,-1,-1,-1,19660,9666,11867,7839,14837,7959,9677,11867,7839,14837,7959,5712], +[240000,2,2,2,38,0,0,0,0,-1,-1,50254,51445,53015,52479,1307,1203,2000,3000,3000,1307,1203,563], +[50000,1,3,2,37,2,2,2,3,2,2,46004,45976,48953,48851,49318,51143,1000,4035,1000,1400,2800,0], +[450000,1,1,1,40,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[110000,2,1,1,48,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[310000,2,2,1,35,2,0,0,0,0,0,304991,311243,306314,258610,246491,198889,13019,11128,8407,8599,6833,5987], +[20000,1,1,2,27,0,0,0,0,0,0,19115,18962,19298,19378,19717,15630,1404,1130,600,861,313,0], +[20000,1,2,2,23,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[200000,1,3,2,52,0,0,0,0,0,0,110151,99530,98951,100914,103146,104993,3568,3585,3602,3848,3669,3784], +[180000,1,1,1,36,0,0,0,0,0,0,163736,116422,99278,95766,97753,95927,4655,2690,2067,2142,2217,1000], +[50000,1,2,1,51,0,0,0,0,0,0,3347,3899,4503,5347,6375,7077,1000,1066,1300,1500,1200,134], +[60000,1,3,1,55,3,2,2,0,0,0,60521,61450,57244,28853,29510,26547,2504,7,1200,1200,1100,1500], +[30000,2,1,2,23,1,-2,-2,-2,-1,-1,4000,5645,3508,-27,13744,5906,5645,3508,27,13771,5911,3024], +[240000,1,1,2,41,1,-1,-1,0,0,-1,95,2622,3301,3164,360,1737,2622,3301,0,360,1737,924], +[420000,1,2,1,34,0,0,0,0,0,0,253454,247743,229049,220951,210606,188108,9744,9553,7603,7830,7253,11326], +[330000,1,3,1,46,0,0,0,0,0,0,227389,228719,229644,227587,227775,228203,8210,8095,8025,8175,8391,8200], +[30000,2,2,2,22,0,0,0,0,0,0,28452,26145,26712,25350,17603,-780,2000,1400,0,500,0,1560], +[240000,1,2,1,34,0,0,0,2,2,2,10674,12035,13681,13269,14158,13891,1500,1800,0,1000,0,327], +[150000,1,1,2,27,0,0,0,0,0,0,17444,19342,22000,24614,27200,30229,2500,3000,3000,3000,3500,5000], +[210000,2,2,1,33,0,0,0,0,0,0,7166,7997,8792,9189,4404,5708,1500,1500,1000,500,2000,546], +[50000,2,3,1,51,-1,-1,-1,-1,-2,-2,752,300,5880,0,0,0,300,5880,0,0,0,0], +[50000,1,1,2,24,0,0,0,0,0,0,50801,50143,49586,19430,19375,18995,2360,1700,1000,900,870,2130], +[240000,1,1,2,47,1,-2,-2,-2,-2,-2,0,0,0,0,0,0,0,0,0,0,0,0], +[180000,1,2,2,28,-1,-1,-1,-1,-1,-1,1832,0,832,332,416,416,0,416,332,500,3500,832], +[50000,1,2,2,23,1,2,2,2,0,0,10131,10833,20583,19996,19879,18065,1000,10000,400,700,800,600], +[170000,1,2,2,29,-2,-2,-2,-2,-2,-2,12159,10000,10000,10000,9983,15846,10000,10000,10000,9983,15863,10000], +[20000,1,1,2,29,-1,-1,-1,-1,0,-1,1199,15586,344,2340,6702,339,15586,344,2340,4702,339,330], +[50000,1,1,2,28,0,0,0,0,0,3,4999,5913,7315,9195,10624,10138,1000,1500,2000,1583,1100,0], +[170000,2,2,2,27,0,0,0,0,0,0,19269,20313,20852,17560,17918,9100,1661,1200,351,358,182,0], +[200000,1,1,2,34,1,2,0,0,0,0,197236,176192,93069,135668,132233,59875,8000,5000,55500,5000,5000,8500], +[80000,2,2,1,23,1,2,3,2,0,0,9168,10522,10205,9898,10123,12034,1650,0,0,379,2091,1], +[260000,2,1,1,60,1,-2,-1,-1,-1,-1,-1100,-1100,21400,0,969,869,0,22500,0,969,1000,0], +[140000,1,2,1,32,0,0,0,0,2,0,86627,78142,68336,64648,58319,55251,3455,3110,5000,0,2100,2602], +[80000,1,1,2,25,0,0,0,0,0,0,42444,55744,43476,41087,41951,31826,30000,3000,6000,8000,2000,14000], +[350000,1,1,2,41,1,-1,-1,-1,-1,-2,208,2906,1000,630,0,0,2906,1000,630,0,0,0], +[280000,2,2,1,56,0,0,0,0,0,0,208775,182350,132257,101783,177145,169311,8042,6700,5137,100000,7000,6321], +[30000,2,3,2,26,0,0,0,0,0,0,9014,10406,11427,11935,13084,14206,1700,1500,1000,1500,1500,1500], +[140000,1,1,1,34,0,0,0,0,0,0,23944,28049,32073,43129,47086,48699,5000,5000,11885,5000,3000,5504], +[200000,2,1,2,37,0,0,0,0,0,0,105420,102870,89643,90938,92505,94031,4000,3250,3250,3500,3560,5000], +[200000,2,3,2,30,0,0,0,0,0,0,196031,196143,189524,167163,146975,122324,7300,7108,7680,6200,5000,4500], +[210000,1,3,1,45,2,3,4,4,5,6,115785,122904,129847,137277,145533,154105,10478,10478,11078,11078,11678,10478], +[50000,1,3,1,57,3,2,0,0,0,0,12854,12362,13447,13427,13711,14083,0,1600,500,500,600,600], +[30000,1,1,2,41,2,2,2,2,2,0,24357,27453,26718,28168,27579,28321,3500,0,2200,0,1200,1250], +[50000,1,2,2,27,2,-1,-1,-1,-1,2,390,390,780,216,1080,540,390,780,216,864,0,390], +[290000,1,3,1,47,-1,-1,-1,-1,0,-1,1234,396,396,792,396,423,396,396,792,0,423,369], +[250000,2,1,1,34,0,0,2,0,0,0,141223,156858,151841,152803,155997,160220,17994,0,5469,5656,6811,3920], +[60000,2,2,1,46,0,0,0,0,0,0,21148,23803,24908,26034,26655,27756,3000,1500,1500,1000,1500,1500], +[110000,2,1,2,27,0,0,0,0,0,0,101640,104795,104855,74737,76058,77254,5500,3900,3000,2900,3000,2800], +[370000,1,1,2,50,-2,-2,-2,-2,-2,-2,6093,15130,8204,15398,4792,13453,15383,8204,15413,4792,13453,4699], +[100000,1,2,1,27,-1,2,2,0,0,0,102349,96847,58824,29336,22979,-246,3166,0,1330,1398,12,50000], +[90000,2,2,1,35,0,0,0,0,0,2,72112,73854,75526,77317,85852,88290,3500,3500,3652,10000,4000,0], +[50000,2,2,2,22,0,0,0,0,0,0,28040,29092,29366,27737,28318,28806,1510,1442,982,1017,1277,567], +[270000,1,2,2,37,0,0,0,0,0,0,37695,33397,30534,27598,26344,24641,5000,2000,3000,4000,3000,2000], +[300000,2,1,2,30,-1,-1,-1,-1,-1,-1,688,3280,0,4340,2672,800,3288,0,4340,2672,800,746], +[50000,2,2,2,22,-1,0,0,0,0,0,8567,15273,11650,7457,3115,7725,15000,1000,149,0,5000,10000], +[50000,2,1,2,24,1,-2,-2,-2,-2,-2,-709,-709,-709,-2898,-3272,-3272,0,0,0,0,0,0], +[360000,1,1,2,29,1,-2,-1,-1,-2,-2,0,0,77,0,0,0,0,77,0,0,0,0], +[130000,1,3,1,56,1,2,2,2,2,3,64617,65978,67282,68557,72796,71345,3000,3000,3000,5500,0,0], +[80000,1,1,2,30,-2,-1,0,0,0,0,6187,100,600,1438,1919,5380,504,500,1000,500,3500,0], +[50000,1,2,2,30,1,2,0,0,0,2,48860,47801,48363,30221,22877,22361,0,1500,1000,2000,0,2000], +[20000,2,2,2,22,0,0,0,0,0,0,16001,12622,13221,13130,14034,14906,1212,1201,500,1500,1500,1000], +[80000,2,2,1,29,0,0,2,0,0,0,77883,81811,80250,61467,10662,11486,5800,1000,600,400,1000,0], +[240000,1,1,2,37,-1,-1,2,0,0,-1,12212,26578,25331,26605,26279,1256,15000,0,2000,0,1256,65935], +[80000,2,3,2,35,0,-1,0,0,0,0,49608,12412,14873,17364,17770,17460,12500,6500,3000,2000,3000,2000], +[500000,2,1,1,47,0,0,0,0,0,0,56422,110616,110340,122967,108834,70064,70010,30357,30000,20000,52183,20000], +[60000,2,2,1,24,0,0,0,0,0,0,58024,57891,48839,18971,19323,19395,2500,1600,3000,1000,737,2000], +[20000,1,2,2,25,0,0,0,0,0,-1,10642,11677,13070,12280,1615,1620,1200,1593,601,135,1824,0], +[100000,2,2,1,38,1,2,0,0,2,0,14483,13961,15323,16268,15868,16448,0,1600,1500,0,1000,1500], +[360000,2,1,2,32,1,-1,-1,-1,-1,-1,2616,57077,5287,68445,13881,16240,57087,5295,68454,13889,16250,38313], +[200000,2,3,2,47,2,2,2,2,2,2,199436,202947,193936,196186,200162,189915,8214,7000,6800,7134,0,6836], +[130000,2,2,1,34,1,-1,0,0,0,0,0,5396,10270,13576,13864,14636,5396,5000,3500,501,1000,2000], +[20000,2,2,2,31,1,5,4,4,3,2,21703,21087,21461,20835,20219,20487,0,1000,0,0,760,0], +[310000,1,1,2,32,0,0,0,0,0,0,59901,62147,62102,65875,60387,43328,10020,6031,10057,5028,5060,4223], +[60000,2,1,2,27,2,0,0,0,2,0,19625,20347,21669,23005,22499,22873,1342,1664,2000,0,900,846], +[180000,2,1,2,29,-1,-1,-1,-2,-1,0,11386,199,0,0,17227,17042,199,0,0,17227,341,5114], +[180000,2,1,2,24,-1,-1,2,0,0,-2,14670,22087,21282,10200,0,0,37867,0,200,0,0,0], +[50000,1,2,1,36,0,0,0,0,-1,-1,47790,18114,18250,-14,72,658,2000,1000,2000,500,1000,20011], +[50000,2,1,2,24,1,2,2,2,2,2,36166,37188,37680,38462,39228,40035,1900,1400,1700,1532,1600,0], +[150000,2,2,1,34,-2,-2,-2,-2,-2,-2,0,0,0,116,0,1500,0,0,116,0,1500,0], +[20000,2,1,2,22,0,0,0,0,-1,0,18553,19446,19065,8332,18868,19247,1500,1032,541,20000,693,1000], +[500000,2,1,1,34,-2,-2,-2,-1,-1,-1,412,138,2299,1251,1206,1151,138,2299,1251,1206,1151,15816], +[30000,2,3,2,22,1,2,2,0,0,0,29010,29256,28122,29836,1630,0,1000,85,1714,104,0,0], +[180000,2,1,1,38,-2,-2,-2,-2,-2,-2,750,0,0,0,0,0,0,0,0,0,0,0] +] + +y=[1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0, + 0,0,0,0,1,0,1,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0, + 0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0] + +y2d=[1,1,0,2,0,2,0,0,2,0,0,0,0,1,0,0,1,0,0,0,0,1,1,1,0,2,1,0,0,0,0,1,0,0,2,0,2,0,1,0,0,0,2,0,0,1,1,1,0,0,1,0,0,2,0,0, + 0,0,0,0,1,0,1,1,0,1,1,0,0,2,0,1,0,2,2,0,2,0,1,1,0,0,1,0,0,0,1,0,2,2,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,2,0,0,0,0,2,0, + 0,0,0,1,0,1,0,0,1,1,0,1,0,2,0,1,1,0,0,2,0,0,0,0,0,0,1,0,1,0,2,0,0,0,1,0,2,0,0,2,0,0,2,0,0,0,1,0,0,0,0,0,0,2,0,0, + 0,0,0,2,0,1,0,1,0,0,0,2,0,0,0,1,0,1,0,1,0,2,0,1,0,0,0,1,0,2,0,1,0] + + +weight=[1. if d%2==0 else 2. for d in range (len(y))] + +x_train=X[:100] +y_train=y[:100] +w_train=weight[:100] +x_test=X[100:] +y_test=y[100:] +w_test=weight[100:] + +def gini(y_true, y_pred, sample_weight=None): + return (roc_auc_score(y_true, y_pred, sample_weight=sample_weight)*2 ) -1. + + + +def R(y_true ,y_pred , sample_weight=None): + + sx=0.0 + sy=0.0 + sx_2=0.0 + sy_2=0.0 + sxy=0.0 + n=0.0 + if sample_weight==None: + for i in range(0,len(y_pred)): + sx=sx+y_pred[i] + sy=sy+y_true[i] + sx_2=sx_2+(y_pred[i]*y_pred[i]) + sy_2=sy_2+(y_true[i]*y_true[i]) + sxy=sxy+(y_pred[i]*y_true[i]) + n+=1.0 + else : + for i in range(0,len(y_pred)): + sx=sx+y_pred[i]*sample_weight[i] + sy=sy+y_true[i]*sample_weight[i] + sx_2=sx_2+(y_pred[i]*y_pred[i])*sample_weight[i] + sy_2=sy_2+(y_true[i]*y_true[i])*sample_weight[i] + sxy=sxy+(y_pred[i]*y_true[i])*sample_weight[i] + n+=sample_weight[i] + cor=(n*sxy - sx*sy)/(np.sqrt(np.abs(n*sx_2-(sx*sx))*np.abs(n*sy_2-(sy*sy)))) + return cor + +def test_pystacknet(): + + Xn=np.array(x_train) + yn=np.array(y_train) + print (Xn.shape, yn.shape) + + + ##################################################################################### + ############################### CLASSIFICATION ##################################### + ##################################################################################### + + + models=[ + + [RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + ExtraTreesClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + LogisticRegression(random_state=1) + ], + + [RandomForestClassifier (n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)] + + + ] + + ################## no proba metric ############################### + model=StackNetClassifier(models, metric="accuracy", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict_proba(x_test)[:,1] + print ("accuracy test 1 , auc %f " % (roc_auc_score(y_test,preds))) + + ################## proba metric ############################### + + model=StackNetClassifier(models, metric="auc", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict_proba(x_test)[:,1] + print ("auc test 2 , auc %f " % (roc_auc_score(y_test,preds))) + + ################## custom metric ############################### + + model=StackNetClassifier(models, metric=gini, folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict_proba(x_test)[:,1] + print ("custom metric gini test 3 , auc %f " % (gini(y_test,preds))) + + ################## numpy input ############################### + + model=StackNetClassifier(models, metric="auc", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(Xn,yn ) + preds=model.predict_proba(np.array(x_test))[:,1] + print ("numpy auc test 4 , auc %f " % (roc_auc_score(y_test,preds))) + + ################## csr_matrix input ############################### + + model=StackNetClassifier(models, metric="auc", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict_proba(csr_matrix(x_test))[:,1] + print ("csr auc test 5 , auc %f " % (roc_auc_score(y_test,preds))) + + ################## restacking ############################### + + model=StackNetClassifier(models, metric="auc", folds=4, restacking=True, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict_proba(csr_matrix(x_test))[:,1] + print ("restacking auc test 6 , auc %f " % (roc_auc_score(y_test,preds))) + + ################## without retraining ############################### + + model=StackNetClassifier(models, metric="auc", folds=4, restacking=True, + use_retraining=False, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict_proba(csr_matrix(x_test))[:,1] + print ("no retraining auc test 7 , auc %f " % (roc_auc_score(y_test,preds))) + + ################## custom k folder object ############################### + + + k=StratifiedKFold(yn, n_folds=4, shuffle=True, random_state=1251) + + model=StackNetClassifier(models, metric="auc", folds=k, restacking=True, + use_retraining=False, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict_proba(csr_matrix(x_test))[:,1] + print ("custom kfold auc test 8 , auc %f " % (roc_auc_score(y_test,preds))) + + + + ################## regressor in base level ############################### + models_reg=[ + + [RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + LogisticRegression(random_state=1) + ], + + [RandomForestClassifier (n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)] + + + ] + + model=StackNetClassifier(models_reg, metric="auc", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict_proba(x_test)[:,1] + print ("with regressor test 9 , auc %f " % (roc_auc_score(y_test,preds))) + + + ################## transformer in base level ############################### + models_pca=[ + + [RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + LogisticRegression(random_state=1), + PCA(n_components=4,random_state=1) + ], + + [RandomForestClassifier (n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)] + + + ] + + model=StackNetClassifier(models_pca, metric="auc", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict_proba(x_test)[:,1] + print ("with PCA test 10 , auc %f " % (roc_auc_score(y_test,preds))) + + + ################## multiclass metric ############################### + + model=StackNetClassifier(models, metric="logloss", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y2d[:100] ) + preds=model.predict_proba(x_test) + print ("logloss test 11 , auc %f " % (log_loss(y2d[100:],preds))) + + + + ################## 3 levels ############################### + + models3=[ + + [RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + ExtraTreesClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1), + GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + LogisticRegression(random_state=1) + ], + + [GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + LogisticRegression(random_state=1) + ], + + [RandomForestClassifier (n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)] + + + ] + + + model=StackNetClassifier(models3, metric="logloss", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y2d[:100] ) + preds=model.predict_proba(x_test) + print ("3 levels test 12 , auc %f " % (log_loss(y2d[100:],preds))) + + + ################## with sample_weight ############################### + + model=StackNetClassifier(models, metric="auc", folds=4, restacking=False, + use_retraining=True, use_proba=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train , sample_weight=w_train) + preds=model.predict_proba(x_test)[:,1] + print ("auc weighted test 13 , auc %f " % (roc_auc_score(y_test,preds, sample_weight=w_test))) + + + ##################################################################################### + ############################### REGRESSION ######################################### + ##################################################################################### + + + + models=[ + + [RandomForestRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + Ridge(random_state=1) + ], + + [RandomForestRegressor (n_estimators=200, max_depth=5, max_features=0.5, random_state=1)] + + + ] + + ################## rmse metric ############################### + model=StackNetRegressor(models, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict(x_test) + print ("rmse test 1 , %f " % (rmse(y_test,preds))) + + ################## mae metric ############################### + + model=StackNetRegressor(models, metric="mae", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict(x_test) + print ("mae test 2 , %f " % (mae(y_test,preds))) + + ################## custom metric ############################### + + model=StackNetRegressor(models, metric=R, folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict(x_test) + print ("custom metric R test 3 %f " % (R(y_test,preds))) + + ################## numpy input ############################### + + model=StackNetRegressor(models, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(Xn,yn ) + preds=model.predict(x_test) + print ("numpy rmse test 4 %f " % (rmse(y_test,preds))) + + ################## csr_matrix input ############################### + + model=StackNetRegressor(models, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict(x_test) + print ("csr test 5 , rmse %f " % (rmse(y_test,preds))) + + ################## restacking ############################### + + model=StackNetRegressor(models, metric="rmse", folds=4, restacking=True, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict(x_test) + print ("restacking rmse test 6 , rmse %f " % (rmse(y_test,preds))) + + ################## without retraining ############################### + + model=StackNetRegressor(models, metric="rmse", folds=4, restacking=True, + use_retraining=False, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict(x_test) + print ("no retraining rmse test 7, rmse %f " % (rmse(y_test,preds))) + + ################## custom k folder object ############################### + + + k=StratifiedKFold(yn, n_folds=4, shuffle=True, random_state=1251) + + model=StackNetRegressor(models, metric="rmse", folds=k, restacking=True, + use_retraining=False,random_state=12345, + n_jobs=1, verbose=1) + + model.fit(csr_matrix( Xn) ,yn ) + preds=model.predict(x_test) + print ("custom kfold rmse test 8, %f " % (rmse(y_test,preds))) + + + + ################## classifier in base level ############################### + models_class=[ + + [RandomForestRegressor(n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + ExtraTreesClassifier (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + Ridge(random_state=1) + ], + + [RandomForestRegressor (n_estimators=200, max_depth=5, max_features=0.5, random_state=1)] + + + ] + + model=StackNetRegressor(models_class, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict(x_test) + print ("with regressor test 9, rmse %f " % (rmse(y_test,preds))) + + + ################## transformer in base level ############################### + models_pca=[ + + [RandomForestRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + Ridge(random_state=1), + PCA(n_components=4,random_state=1) + ], + + [RandomForestRegressor(n_estimators=200, max_depth=5, max_features=0.5, random_state=1)] + + + ] + + model=StackNetRegressor(models_pca, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train ) + preds=model.predict(x_test) + print ("with PCA test 10 , rmse %f " % (rmse(y_test,preds))) + + + ################## 2d target ############################### + models2=[ + + [RandomForestRegressor(n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + #GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + Ridge(random_state=1) + ], + + + [RandomForestRegressor(n_estimators=200, max_depth=5, max_features=0.5, random_state=1)] + + + ] + + + model=StackNetRegressor(models2, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,np.column_stack((y_train,y2d[:100] ))) + preds=model.predict(x_test) + print ("rmse test 11 , rmse %f " % (rmse(np.column_stack((y_test,y2d[100:])),preds))) + + + + ################## 3 levels ############################### + + models3=[ + + [RandomForestRegressor(n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + #GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1), + Ridge(random_state=1) + ], + + [ExtraTreesRegressor (n_estimators=100, max_depth=5, max_features=0.5, random_state=1), + Ridge(random_state=1) + ], + + [RandomForestRegressor(n_estimators=200, max_depth=5, max_features=0.5, random_state=1)] + + + ] + + + model=StackNetRegressor(models3, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y2d[:100] ) + preds=model.predict(x_test) + print ("3 levels test 12 , rmse %f " % (rmse(y2d[100:],preds))) + + + ################## with sample)weight ############################### + model=StackNetRegressor(models, metric="rmse", folds=4, restacking=False, + use_retraining=True, random_state=12345, + n_jobs=1, verbose=1) + + model.fit(x_train,y_train,sample_weight=w_train ) + preds=model.predict(x_test) + print ("rmse weighted test 13 , %f " % (rmse(y_test,preds, sample_weight=w_test))) + + +if __name__ == '__main__': + test_pystacknet() \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..743aa59 --- /dev/null +++ b/setup.py @@ -0,0 +1,42 @@ +from setuptools import setup + +try: + from pypandoc import convert + read_md = lambda f: convert(f, 'rst') +except ImportError: + print("warning: pypandoc module not found, could not convert Markdown to RST") + read_md = lambda f: open(f, 'r').read() + +setup( + name='pystacknet', + version='0.0.1', + + author='Marios Michailidis', + author_email='kazanovassoftware@gmail.com', + + packages=['pystacknet', + 'pystacknet.test'], + + url='https://github.com/h2oai/pystacknet', + license='LICENSE.txt', + + description='StackNet framework for python', + long_description=read_md('README.md'), + + install_requires=[ + 'numpy >= 1.15.1', + 'scipy >= 1.1.0', + 'scikit-learn >= 0.19.1' + ], + + classifiers=[ + + + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6' + ], +) \ No newline at end of file