In [0]:
# pip installs go here
!pip install mord

Collecting mord
  Downloading https://files.pythonhosted.org/packages/67/9d/c791c841501d9ff4ecb76b57f208dec6cf9f925109c59c995ddec80f9b32/mord-0.6.tar.gz
Building wheels for collected packages: mord
  Building wheel for mord (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/98/14/b2/244c2cec93a0c6edb29b488bd6b2710ded7e9d457033b86366
Successfully built mord
Installing collected packages: mord
Successfully installed mord-0.6


In [0]:
# The essentials
import pandas as pd
import numpy as np

from collections import defaultdict

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, RandomTreesEmbedding
from sklearn.base import clone


from collections import defaultdict, Counter
from scipy.stats import norm

from sklearn.preprocessing import PowerTransformer, StandardScaler
import glob

In [0]:
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
better_prev_object_id_per_10.csv
dtw_distances_3.p
extended_phase_predictors.csv
final_phase_labels_15_14.csv
final

In [0]:
def custom_mape(approxes, targets):
    return np.mean(np.abs(np.subtract(approxes, targets)) / np.maximum(np.abs(targets), 290000))

def fit_stack(clf, name, X_train, y_train, X_test, train_index, test_index, 
              n_splits=5):  
  train_predictions = np.zeros((len(X_train),))
  test_predictions = np.zeros((len(X_test), n_splits))
  kf = KFold(n_splits=n_splits, shuffle=True)
  for fold_ix, (train_idx, test_idx) in enumerate(kf.split(X_train, y_train)):
    X_cv_train = X_train[train_idx, :]
    X_cv_test = X_train[test_idx, :]
    y_cv_train = y_train[train_idx]
    y_cv_test = y_train[test_idx]
    
    clf_clone = clone(clf)
    clf_clone.fit(X_cv_train, y_cv_train)
    
    print('[{}] Fold #{} MAPE={}'.format(name, fold_ix + 1, custom_mape(np.exp(y_cv_test), np.exp(clf_clone.predict(X_cv_test)))))
    
    train_predictions[test_idx] = np.minimum(np.max(y_cv_train), np.maximum(0, clf_clone.predict(X_cv_test)))
    test_predictions[:, fold_ix] = np.minimum(np.max(y_cv_train), np.maximum(0, clf_clone.predict(X_test)))
    
  train_predictions_df = pd.DataFrame(train_predictions, index=train_index, columns=['{}_pred'.format(name)])
  
  # Taking min instead of mean, since undershooting is better than overshooting for MAPE
  # TODO: This changed to mean for now
  test_predictions_df = pd.DataFrame(np.mean(test_predictions, axis=1), index=test_index, columns=['{}_pred'.format(name)])
    
  return train_predictions_df, test_predictions_df

def get_corr_features(X):
  row_idx, col_idx = np.where(X.corr() == 1)
  self_corr = set([(i, i) for i in range(X.shape[1])])
  return set(list(zip(row_idx, col_idx))) - self_corr 

def get_uncorr_features(data):
  X_train_corr = data.copy()
  correlated_features = get_corr_features(X_train_corr)
  
  corr_cols = set()
  for row_idx, col_idx in correlated_features:
    corr_cols.add(row_idx)
    corr_cols.add(col_idx)
  
  uncorr_cols = list(set(X_train_corr.columns) - set(X_train_corr.columns[list(corr_cols)]))
   
  col_mask = [False]*X_train_corr.shape[1]
  for col in corr_cols:
    col_mask[col] = True
  X_train_corr = X_train_corr.loc[:, col_mask]
  
  correlated_features = get_corr_features(X_train_corr)
  
  while correlated_features:
    print('{} correlated feature pairs left...'.format(len(correlated_features)))
    corr_row, corr_col = correlated_features.pop()
    col_mask = [True]*X_train_corr.shape[1]
    col_mask[corr_row] = False
    X_train_corr = X_train_corr.loc[:, col_mask]
    correlated_features = get_corr_features(X_train_corr)
  return list(set(list(X_train_corr.columns) + uncorr_cols))

def remove_features(data, target, p_val=0.25):
  single_cols = list(data.columns[data.nunique() == 1])
  
  uncorr_cols = get_uncorr_features(data)
  corr_cols = list(set(data.columns) - set(uncorr_cols))
  
  return list(set(single_cols + corr_cols))

In [0]:

from sklearn.feature_extraction import FeatureHasher
from sklearn.svm import SVR, LinearSVR, SVC
from sklearn.linear_model import BayesianRidge, LogisticRegression
from sklearn.kernel_approximation import RBFSampler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from mord import OrdinalRidge, LAD

combinations_per_recipe = {
    3: [3, 1, 2],  # 1, 2, 
    9: [8],
    15: [15]#, 3, 7, 1, 2, 6, 14]
}

prediction_df = None
import warnings; warnings.filterwarnings('ignore')
for recipe in [15]:
  for process_combination in combinations_per_recipe[recipe]:
    train_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_features_{}_{}.csv'.format(recipe, process_combination), index_col=0)
    test_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_features_{}_{}.csv'.format(recipe, process_combination), index_col=0)
    
    X_train = train_features.drop('target', axis=1)
    y_train = np.log(train_features['target'])
    
    X_test = test_features
    
    #to_drop = remove_features(X_train, y_train)
    to_drop = ["real_('flow_diff', '<lambda>')", 'target_value__symmetry_looking__r_0.45', 'return_turbidity__large_standard_deviation__r_0.5', "real_('tank_level_caustic', 'count')", "flow_('target_value', 'mean')", 'target_value__symmetry_looking__r_0.9', 'supply_flow__symmetry_looking__r_0.45', 'flow_diff__large_standard_deviation__r_0.9500000000000001', 'flow_diff__symmetry_looking__r_0.6000000000000001', 'supply_flow__symmetry_looking__r_0.7000000000000001', "real_('tank_concentration_acid', 'count')", 'flow_diff__symmetry_looking__r_0.4', 'flow_diff__symmetry_looking__r_0.45', "last_real_('tank_temperature_pre_rinse', 'count_zeros')", 'target_value__maximum', "last_flow_('return_flow', 'sum')", 'supply_flow__large_standard_deviation__r_0.75', "real_('return_conductivity', 'count')", "last_real_('tank_temperature_pre_rinse', '<lambda>')", "real_('tank_temperature_pre_rinse', 'count_zeros')", 'return_turbidity__symmetry_looking__r_0.4', 'target_value__symmetry_looking__r_0.9500000000000001', "last_real_('return_flow', 'max')", "real_('tank_level_pre_rinse', 'count_zeros')", 'target_value__autocorrelation__lag_1', 'target_value__large_standard_deviation__r_0.5', 'return_turbidity__large_standard_deviation__r_0.6000000000000001', 'supply_flow__symmetry_looking__r_0.65', "last_flow_('supply_flow', 'sum')", 'return_turbidity__large_standard_deviation__r_0.55', 'return_flow__symmetry_looking__r_0.45', 'return_turbidity__symmetry_looking__r_0.8', "last_bin_('tank_lsh_pre_rinse', 'std')", 'flow_diff__number_crossing_m__m_0', "real_('target_value', 'count')", "last_real_('tank_level_caustic', '<lambda>')", "last_real_('supply_flow', 'count')", 'supply_flow__large_standard_deviation__r_0.5', 'flow_diff__large_standard_deviation__r_0.55', 'return_flow__symmetry_looking__r_0.65', "last_real_('return_turbidity', '<lambda>')", 'target_value__large_standard_deviation__r_0.6000000000000001', "bin_('return_caustic', '<lambda>')", "real_('tank_temperature_acid', '<lambda>')", "last_real_('tank_level_acid', 'count')", 'flow_diff__large_standard_deviation__r_0.8500000000000001', 'flow_diff__large_standard_deviation__r_0.8', "last_real_('tank_temperature_acid', 'count')", 'flow_diff__symmetry_looking__r_0.30000000000000004', "real_('tank_level_acid', '<lambda>')", 'return_flow__symmetry_looking__r_0.9', "real_('tank_temperature_caustic', 'count')", "last_real_('tank_temperature_caustic', 'count')", "bin_('supply_pre_rinse', '<lambda>')", 'flow_diff__maximum', 'return_turbidity__symmetry_looking__r_0.9500000000000001', 'return_turbidity__number_crossing_m__m_-1', 'target_value__large_standard_deviation__r_0.9', "real_('tank_concentration_caustic', 'count')", "last_real_('supply_flow', 'max')", "bin_('tank_lsh_clean_water', '<lambda>')", 'flow_diff__large_standard_deviation__r_0.9', 'return_flow__symmetry_looking__r_0.0', 'return_flow__symmetry_looking__r_0.35000000000000003', 'target_value__large_standard_deviation__r_0.8', 'return_flow__large_standard_deviation__r_0.9', 'return_turbidity__symmetry_looking__r_0.6000000000000001', 'return_turbidity__symmetry_looking__r_0.65', 'target_value__large_standard_deviation__r_0.8500000000000001', "real_('return_conductivity', '<lambda>')", "real_('return_flow', '<lambda>')", 'flow_diff__symmetry_looking__r_0.7000000000000001', "real_('target_value', '<lambda>')", "last_real_('target_value', 'count')", "flow_('target_value', 'sum')", 'return_flow__large_standard_deviation__r_0.55', 'return_turbidity__large_standard_deviation__r_0.65', 'target_value__symmetry_looking__r_0.6000000000000001', "last_bin_('supply_caustic', 'mean')", 'return_flow__large_standard_deviation__r_0.65', "last_real_('flow_diff', 'count')", "real_('return_turbidity', 'count')", "bin_('tank_lsh_acid', 'mean')", "bin_('supply_pump', '<lambda>')", 'return_flow__symmetry_looking__r_0.8', "last_real_('supply_flow', '<lambda>')", 'flow_diff__large_standard_deviation__r_0.7000000000000001', 'return_turbidity__partial_autocorrelation__lag_0', 'target_value__partial_autocorrelation__lag_0', "last_real_('return_turbidity', 'count')", 'supply_flow__large_standard_deviation__r_0.8', "real_('tank_concentration_caustic', 'count_zeros')", "real_('supply_flow', 'count')", 'return_turbidity__symmetry_looking__r_0.5', 'target_value__large_standard_deviation__r_0.45', 'return_turbidity__large_standard_deviation__r_0.8', "bin_('tank_lsh_pre_rinse', 'count_zeros')", "last_real_('tank_level_pre_rinse', 'count')", "last_bin_('tank_lsh_caustic', '<lambda>')", "bin_('tank_lsh_pre_rinse', '<lambda>')", 'return_flow__symmetry_looking__r_0.9500000000000001', "last_bin_('tank_lsh_acid', 'std')", 'target_value__symmetry_looking__r_0.35000000000000003', "last_bin_('return_caustic', '<lambda>')", "last_real_('tank_temperature_caustic', '<lambda>')", 'return_turbidity__large_standard_deviation__r_0.7000000000000001', "bin_('tank_lsh_acid', 'std')", 'flow_diff__large_standard_deviation__r_0.65', 'supply_flow__symmetry_looking__r_0.0', "bin_('return_drain', '<lambda>')", 'flow_diff__symmetry_looking__r_0.8500000000000001', 'flow_diff__large_standard_deviation__r_0.6000000000000001', "last_bin_('return_drain', '<lambda>')", "real_('tank_level_clean_water', 'count_zeros')", 'return_flow__maximum', "last_real_('tank_temperature_acid', 'count_zeros')", "last_bin_('supply_caustic', 'count_zeros')", 'supply_flow__large_standard_deviation__r_0.6000000000000001', "last_real_('return_temperature', 'count')", 'supply_flow__symmetry_looking__r_0.55', 'return_turbidity__symmetry_looking__r_0.8500000000000001', "last_real_('tank_level_pre_rinse', '<lambda>')", 'return_turbidity__symmetry_looking__r_0.75', 'flow_diff__symmetry_looking__r_0.75', 'target_value__symmetry_looking__r_0.0', "real_('return_flow', 'count')", "last_bin_('tank_lsh_clean_water', '<lambda>')", 'supply_flow__large_standard_deviation__r_0.7000000000000001', "real_('tank_level_acid', 'count')", 'return_flow__large_standard_deviation__r_0.9500000000000001', "last_real_('tank_concentration_caustic', 'count_zeros')", "last_flow_('return_flow', 'mean')", "bin_('return_recovery_water', '<lambda>')", "real_('tank_level_clean_water', 'count')", 'supply_flow__large_standard_deviation__r_0.65', "last_bin_('tank_lsh_acid', '<lambda>')", "real_('return_flow', 'sum')", 'return_flow__symmetry_looking__r_0.75', "bin_('tank_lsh_clean_water', 'std')", "real_('tank_concentration_acid', '<lambda>')", "bin_('tank_lsh_acid', '<lambda>')", "last_bin_('tank_lsh_pre_rinse', '<lambda>')", 'return_flow__symmetry_looking__r_0.7000000000000001', 'return_turbidity__symmetry_looking__r_0.7000000000000001', "bin_('supply_caustic', '<lambda>')", 'return_flow__partial_autocorrelation__lag_0', "real_('tank_level_clean_water', '<lambda>')", 'target_value__large_standard_deviation__r_0.75', "last_bin_('tank_lsh_acid', 'count_zeros')", "real_('target_value', 'max')", 'return_flow__symmetry_looking__r_0.6000000000000001', 'target_value__symmetry_looking__r_0.4', "real_('flow_diff', 'count')", "real_('supply_flow', 'mean')", 'return_flow__large_standard_deviation__r_0.5', 'supply_flow__symmetry_looking__r_0.9', 'return_flow__symmetry_looking__r_0.5', 'flow_diff__large_standard_deviation__r_0.4', 'return_turbidity__symmetry_looking__r_0.55', 'target_value__minimum', "flow_('return_flow', 'max')", 'supply_flow__symmetry_looking__r_0.8', "real_('flow_diff', 'min')", "last_real_('tank_level_clean_water', 'count')", "last_real_('tank_concentration_caustic', '<lambda>')", 'return_flow__large_standard_deviation__r_0.6000000000000001', "real_('return_turbidity', 'min')", 'target_value__large_standard_deviation__r_0.55', "real_('tank_level_pre_rinse', 'count')", "last_real_('tank_level_clean_water', 'count_zeros')", 'flow_diff__partial_autocorrelation__lag_0', 'supply_flow__symmetry_looking__r_0.8500000000000001', "last_flow_('target_value', 'max')", "last_bin_('tank_lsh_clean_water', 'mean')", 'supply_flow__large_standard_deviation__r_0.9500000000000001', 'supply_flow__large_standard_deviation__r_0.55', 'supply_flow__symmetry_looking__r_0.5', "last_bin_('supply_pre_rinse', 'std')", 'flow_diff__symmetry_looking__r_0.9', 'return_turbidity__large_standard_deviation__r_0.45', "bin_('supply_clean_water', '<lambda>')", 'flow_diff__symmetry_looking__r_0.9500000000000001', "last_bin_('tank_lsh_acid', 'mean')", 'return_flow__large_standard_deviation__r_0.8', 'return_flow__symmetry_looking__r_0.55', 'supply_flow__large_standard_deviation__r_0.8500000000000001', "last_bin_('object_low_level', '<lambda>')", "bin_('tank_lsh_clean_water', 'mean')", "last_real_('tank_concentration_caustic', 'count')", "last_bin_('supply_pre_rinse', 'count_zeros')", 'target_value__symmetry_looking__r_0.7000000000000001', 'supply_flow__number_crossing_m__m_-1', "real_('return_flow', 'min')", 'return_flow__large_standard_deviation__r_0.8500000000000001', "last_real_('tank_concentration_acid', 'count')", "real_('supply_flow', 'max')", 'supply_flow__symmetry_looking__r_0.75', "last_real_('return_conductivity', 'count_zeros')", 'flow_diff__symmetry_looking__r_0.5', 'flow_diff__symmetry_looking__r_0.65', "last_real_('tank_level_caustic', 'count')", "last_bin_('supply_caustic', 'std')", 'flow_diff__symmetry_looking__r_0.8', "last_bin_('supply_caustic', '<lambda>')", "last_real_('tank_level_caustic', 'count_zeros')", "last_bin_('supply_acid', '<lambda>')", "bin_('tank_lsh_pre_rinse', 'std')", 'target_value__large_standard_deviation__r_0.65', 'flow_diff__large_standard_deviation__r_0.5', 'supply_flow__symmetry_looking__r_0.9500000000000001', "last_real_('return_conductivity', 'count')", 'flow_diff__symmetry_looking__r_0.2', "last_real_('tank_level_acid', 'count_zeros')", "last_bin_('return_acid', '<lambda>')", "flow_('return_flow', 'mean')", "real_('supply_pressure', 'count')", 'return_flow__number_crossing_m__m_-1', 'supply_flow__large_standard_deviation__r_0.9', "real_('return_turbidity', 'max')", "real_('tank_level_acid', 'count_zeros')", 'return_turbidity__large_standard_deviation__r_0.9', 'return_turbidity__symmetry_looking__r_0.45', "last_bin_('return_recovery_water', '<lambda>')", "real_('tank_temperature_pre_rinse', 'count')", "last_real_('tank_temperature_pre_rinse', 'count')", "last_real_('target_value', 'sum')", 'target_value__symmetry_looking__r_0.5', "last_real_('return_temperature', '<lambda>')", 'target_value__symmetry_looking__r_0.8500000000000001', 'target_value__symmetry_looking__r_0.30000000000000004', 'supply_flow__partial_autocorrelation__lag_0', "flow_('supply_flow', 'max')", "last_bin_('supply_pre_rinse', '<lambda>')", "last_flow_('supply_flow', 'mean')", "last_bin_('supply_pre_rinse', 'mean')", 'target_value__large_standard_deviation__r_0.9500000000000001', 'target_value__symmetry_looking__r_0.75', "real_('tank_temperature_acid', 'count')", "bin_('tank_lsh_pre_rinse', 'mean')", "real_('tank_level_caustic', 'count_zeros')", 'flow_diff__large_standard_deviation__r_0.35000000000000003', 'flow_diff__range_count__max_1__min_-1', "last_real_('tank_concentration_acid', 'count_zeros')", "bin_('tank_lsh_clean_water', 'count_zeros')", "real_('supply_pressure', '<lambda>')", 'return_flow__symmetry_looking__r_0.4', 'supply_flow__symmetry_looking__r_0.6000000000000001', 'target_value__symmetry_looking__r_0.8', "last_real_('return_flow', 'count')", "last_bin_('tank_lsh_pre_rinse', 'count_zeros')", 'flow_diff__large_standard_deviation__r_0.45', 'target_value__large_standard_deviation__r_0.7000000000000001', 'target_value__symmetry_looking__r_0.55', 'supply_flow__minimum', 'return_turbidity__large_standard_deviation__r_0.9500000000000001', "real_('supply_flow', 'sum')", "last_bin_('tank_lsh_pre_rinse', 'mean')", 'return_turbidity__large_standard_deviation__r_0.8500000000000001', 'flow_diff__symmetry_looking__r_0.55', 'target_value__symmetry_looking__r_0.65', 'flow_diff__symmetry_looking__r_0.35000000000000003', 'return_flow__large_standard_deviation__r_0.7000000000000001', "bin_('tank_lsh_acid', 'count_zeros')", 'flow_diff__symmetry_looking__r_0.0', 'flow_diff__large_standard_deviation__r_0.75', "last_flow_('target_value', 'mean')", "real_('tank_temperature_acid', 'count_zeros')", 'return_turbidity__large_standard_deviation__r_0.75', 'return_flow__large_standard_deviation__r_0.45', 'return_turbidity__symmetry_looking__r_0.0', "last_real_('tank_level_pre_rinse', 'count_zeros')", "last_bin_('tank_lsh_clean_water', 'std')", "last_bin_('tank_lsh_clean_water', 'count_zeros')", 'flow_diff__symmetry_looking__r_0.25', 'return_flow__symmetry_looking__r_0.8500000000000001', 'return_flow__large_standard_deviation__r_0.75', "last_real_('tank_temperature_caustic', 'count_zeros')", 'return_turbidity__symmetry_looking__r_0.9', 'return_turbidity__symmetry_looking__r_0.35000000000000003']
    print(len(to_drop), to_drop)

    X_train = X_train.drop(to_drop, axis=1)
    X_test = X_test.drop(to_drop, axis=1)
    
    rand_features = np.random.choice(list(X_train.columns), replace=False, size=int(0.5*len(X_train.columns)))
    X_train = X_train[rand_features]
    X_test = X_test[rand_features]
    
    train_index = X_train.index
    test_index = X_test.index
    
#     train_index = X_train.index
#     test_index = X_test.index
    
#     hasher = FeatureHasher()
#     X_train = hasher.fit_transform(X_train.to_dict(orient='records'))
#     X_test = hasher.fit_transform(X_test.to_dict(orient='records'))
    
    clfs = [
#           ('knn_100', Pipeline(steps=[('scale', StandardScaler()), ('knn_100', KNeighborsRegressor(n_neighbors=100))])),
#           ('knn_10', Pipeline(steps=[('scale', StandardScaler()), ('knn_10', KNeighborsRegressor(n_neighbors=10))])),
#           ('knn_50_3', Pipeline(steps=[('scale', StandardScaler()), ('knn_50', KNeighborsRegressor(n_neighbors=50))])),
#           ('knn_250', Pipeline(steps=[('scale', StandardScaler()), ('knn_250', KNeighborsRegressor(n_neighbors=250))])),
#           ('lr', Pipeline(steps=[('scale', StandardScaler()), ('lr', Lasso(max_iter=10000))])),
#           ('knn_pca', Pipeline(steps=[('scale', StandardScaler()), ('pca', PCA(n_components=5)), ('knn', KNeighborsRegressor(n_neighbors=100))])),
#           ('mlp',  Pipeline(steps=[('scale', StandardScaler()), ('mlp', MLPRegressor(max_iter=1000))])),
#           ('rf_25', RandomForestRegressor(n_estimators=25)),
#           ('rf_100', RandomForestRegressor(n_estimators=100)),
#           ('rf_250', RandomForestRegressor(n_estimators=250)),
          ('et_50_4', ExtraTreesRegressor(n_estimators=50)),
#           ('et_100', ExtraTreesRegressor(n_estimators=100)),
#           ('et_250', ExtraTreesRegressor(n_estimators=250)),
#           ('logreg_l1', Pipeline(steps=[('scale', StandardScaler()), ('lr', LogisticRegression(penalty='l1'))])),
#           ('logreg_l2', Pipeline(steps=[('scale', StandardScaler()), ('lr', LogisticRegression(penalty='l2'))])),
#         ('ada_50', AdaBoostRegressor(n_estimators=50)),
#         ('rt_embedding', Pipeline(steps=[('embed', RandomTreesEmbedding(max_depth=3, n_estimators=100)), ('et',  ExtraTreesRegressor(n_estimators=25))])),
#         ('pca_et', Pipeline(steps=[('scale', StandardScaler()), ('pca', PCA(n_components=25)), ('et',  ExtraTreesRegressor(n_estimators=25))]))
#         ('ada_250', AdaBoostRegressor(n_estimators=250)),
          #('et_10_hash', ExtraTreesRegressor(n_estimators=10, max_depth=5, max_features='log2')),
#           ('et_100_depth_3', ExtraTreesRegressor(n_estimators=100, max_depth=3, max_features='sqrt')),
#           ('et_50', ExtraTreesRegressor(n_estimators=50, min_samples_leaf=10)),
#           ('svr_5', Pipeline(steps=[('scale', StandardScaler()), ('svr', SVR())])),
#           ('linear_svr', Pipeline(steps=[('scale', StandardScaler()), ('svr', LinearSVR())])),
#           ('bay_lr', Pipeline(steps=[('scale', StandardScaler()), ('lr', BayesianRidge())])),
#           ('knn_25_rbf', Pipeline(steps=[('scale', StandardScaler()), ('rbf', RBFSampler()), ('knn', KNeighborsRegressor(n_neighbors=25, weights='distance'))])),
#           ('svr_C_100', Pipeline(steps=[('scale', StandardScaler()), ('svr', SVR(C=100))])),
#           #('gp', Pipeline(steps=[('scale', StandardScaler()), ('gp', GaussianProcessRegressor())]))
#           ('svm_clf', Pipeline(steps=[('scale', StandardScaler()), ('svr', SVC())])),
#           ('et_50_clf', ExtraTreesClassifier(n_estimators=50)),
#           ('rf_50_clf', RandomForestClassifier(n_estimators=50)),
#           ('et_100_clf', ExtraTreesClassifier(n_estimators=100)),
#           ('rf_100_clf', RandomForestClassifier(n_estimators=100)),
#           ('et_250_clf', ExtraTreesClassifier(n_estimators=100)),
          #('ord_ridge', Pipeline(steps=[('scale', StandardScaler()), ('ord', OrdinalRidge())]))
        
        #TODO:
#           ('et_25_mae', ExtraTreesRegressor(n_estimators=25, criterion='mae')),
      ]

    for name, clf in clfs:
        print(name)
        train_pred_df, test_pred_df = fit_stack(clf, name, X_train.values, y_train.values, X_test.values, train_index, test_index, n_splits=10)
        train_pred_df.to_csv('/content/drive/My Drive/Rinse Over Run/train_predictions_stack_{}_{}_{}.csv'.format(name, recipe, process_combination))
        test_pred_df.to_csv('/content/drive/My Drive/Rinse Over Run/test_predictions_stack_{}_{}_{}.csv'.format(name, recipe, process_combination))

279 ["real_('flow_diff', '<lambda>')", 'target_value__symmetry_looking__r_0.45', 'return_turbidity__large_standard_deviation__r_0.5', "real_('tank_level_caustic', 'count')", "flow_('target_value', 'mean')", 'target_value__symmetry_looking__r_0.9', 'supply_flow__symmetry_looking__r_0.45', 'flow_diff__large_standard_deviation__r_0.9500000000000001', 'flow_diff__symmetry_looking__r_0.6000000000000001', 'supply_flow__symmetry_looking__r_0.7000000000000001', "real_('tank_concentration_acid', 'count')", 'flow_diff__symmetry_looking__r_0.4', 'flow_diff__symmetry_looking__r_0.45', "last_real_('tank_temperature_pre_rinse', 'count_zeros')", 'target_value__maximum', "last_flow_('return_flow', 'sum')", 'supply_flow__large_standard_deviation__r_0.75', "real_('return_conductivity', 'count')", "last_real_('tank_temperature_pre_rinse', '<lambda>')", "real_('tank_temperature_pre_rinse', 'count_zeros')", 'return_turbidity__symmetry_looking__r_0.4', 'target_value__symmetry_looking__r_0.9500000000000001',

In [0]:
train_pred_df, test_pred_df = fit_stack(RRForestRegressor(), name, X_train.values, y_train.values, X_test.values, n_splits=10)

[rot_et_50] Fold #1 MAPE=1.2176318312591492


KeyboardInterrupt: ignored

In [0]:
# X_train[[0, 5, 10]]

<3x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 2734 stored elements in Compressed Sparse Row format>

In [0]:
# combinations_per_recipe = {
#     3: [3, 1, 2], 
#     9: [8],
#     15: [1, 2, 3, 6, 7, 14, 15]
# }

# prediction_df = None
# import warnings; warnings.filterwarnings('ignore')
# for recipe in [3, 9, 15]:
#   for process_combination in combinations_per_recipe[recipe]:
#     train_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_features_{}_{}.csv'.format(recipe, process_combination), index_col=0)
#     test_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_features_{}_{}.csv'.format(recipe, process_combination), index_col=0)
#     labels = pd.read_csv('/content/drive/My Drive/Rinse Over Run/final_phase_labels_{}_{}.csv'.format(recipe, process_combination), index_col=0)
    
#     X_train = train_features.drop('target', axis=1)
#     y_train = np.log(labels.loc[X_train.index]['return_flow_sum'])
    
#     X_test = test_features
    
#     to_drop = remove_features(X_train, y_train)
#     print(len(to_drop), to_drop)

#     X_train = X_train.drop(to_drop, axis=1)
#     X_test = X_test.drop(to_drop, axis=1)
    
#     clfs = [
#           ('rf_25_return_flow_sum', RandomForestRegressor(n_estimators=25)),
#           ('et_25_return_flow_sum', ExtraTreesRegressor(n_estimators=25)),
#       ]

#     for name, clf in clfs:
#         train_pred_df, test_pred_df = fit_stack(clf, name, X_train, y_train, X_test, n_splits=10)
#         train_pred_df.to_csv('/content/drive/My Drive/Rinse Over Run/train_predictions_stack_flow_{}_{}_{}.csv'.format(name, recipe, process_combination))
#         test_pred_df.to_csv('/content/drive/My Drive/Rinse Over Run/test_predictions_stack_flow_{}_{}_{}.csv'.format(name, recipe, process_combination))

1104 correlated feature pairs left...
1062 correlated feature pairs left...
1022 correlated feature pairs left...
984 correlated feature pairs left...
942 correlated feature pairs left...
902 correlated feature pairs left...
864 correlated feature pairs left...
862 correlated feature pairs left...
826 correlated feature pairs left...
824 correlated feature pairs left...
788 correlated feature pairs left...
754 correlated feature pairs left...
720 correlated feature pairs left...
688 correlated feature pairs left...
658 correlated feature pairs left...
630 correlated feature pairs left...
628 correlated feature pairs left...
602 correlated feature pairs left...
578 correlated feature pairs left...
556 correlated feature pairs left...
554 correlated feature pairs left...
552 correlated feature pairs left...
550 correlated feature pairs left...
518 correlated feature pairs left...
498 correlated feature pairs left...
468 correlated feature pairs left...
440 correlated feature pairs left..

KeyboardInterrupt: ignored

In [0]:
# combinations_per_recipe = {
#     3: [1, 2, 3], 
#     9: [8],
#     15: [1, 2, 3, 6, 7, 14, 15]
# }

# prediction_df = None
# import warnings; warnings.filterwarnings('ignore')
# for recipe in [9, 15, 3]:
#   for process_combination in combinations_per_recipe[recipe]:
#     train_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_features_{}_{}.csv'.format(recipe, process_combination), index_col=0)
#     test_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_features_{}_{}.csv'.format(recipe, process_combination), index_col=0)
#       labels = pd.read_csv('/content/drive/My Drive/Rinse Over Run/final_phase_labels_{}_{}.csv'.format(recipe, process_combination), index_col=0)
    
#     X_train = train_features.drop('target', axis=1)
#     y_train = np.log(np.maximum(0, labels.loc[X_train.index]['return_turbidity_sum']) + 1)
    
#     X_test = test_features
    
#     to_drop = remove_features(X_train, y_train)
#     print(len(to_drop), to_drop)

#     X_train = X_train.drop(to_drop, axis=1)
#     X_test = X_test.drop(to_drop, axis=1)
    
    
#     clfs = [
#           ('rf_25_return_turbidity_sum', RandomForestRegressor(n_estimators=25)),
#           ('et_25_return_turbidity_sum', ExtraTreesRegressor(n_estimators=25)),
#       ]

#     for name, clf in clfs:
#         train_pred_df, test_pred_df = fit_stack(clf, name, X_train, y_train, X_test, n_splits=10)
#         train_pred_df.to_csv('/content/drive/My Drive/Rinse Over Run/train_predictions_stack_turbidity_{}_{}_{}.csv'.format(name, recipe, process_combination))
#         test_pred_df.to_csv('/content/drive/My Drive/Rinse Over Run/test_predictions_stack_turbidity_{}_{}_{}.csv'.format(name, recipe, process_combination))