In [0]:
!pip install catboost
!pip install tsfresh
!pip install xgboost
!pip install shap

In [0]:
# The essentials
import pandas as pd
import numpy as np

from collections import defaultdict

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

# Gradient Boosting
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor

# TSFRESH Feature Extraction
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

from collections import defaultdict, Counter
from scipy.stats import norm

from scipy.stats import boxcox, boxcox_normmax
from scipy.special import inv_boxcox

from sklearn.preprocessing import PowerTransformer, StandardScaler

import shap

In [0]:
# Mount Google Drive at /content/drive (forcing a remount) and list the
# competition folder so the expected CSV files can be checked by eye.
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

In [0]:
# Per-timestamp readings for train/test (first CSV column becomes the index)
# and the per-process training labels.
train_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_values.csv', index_col=0, parse_dates=['timestamp'])
test_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_values.csv', index_col=0, parse_dates=['timestamp'])
label_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col='process_id')

# NOTE: all_data is built *before* the final_rinse rows are removed from
# train_df, so it still contains them.
all_data = pd.concat([train_df, test_df], axis=0)

# Drop the final_rinse rows. The explicit .copy() makes train_df an
# independent frame, so the column assignment below does not write into a
# slice of the raw frame (which triggers SettingWithCopyWarning).
train_df = train_df[train_df['phase'] != 'final_rinse'].copy()

# Each phase gets its own bit so that the sum of the distinct values observed
# in a process identifies exactly which phases that process contains.
PHASE_BITMASK = {'pre_rinse': 1,
                 'caustic': 2,
                 'intermediate_rinse': 4,
                 'acid': 8}
train_df['phase_int'] = train_df['phase'].map(PHASE_BITMASK)
test_df['phase_int'] = test_df['phase'].map(PHASE_BITMASK)

# One row per process: the sum of the distinct phase bits seen for it.
train_process_combinations = pd.DataFrame(train_df.groupby('process_id')['phase_int'].unique().apply(lambda x: sum(x)))
test_process_combinations = pd.DataFrame(test_df.groupby('process_id')['phase_int'].unique().apply(lambda x: sum(x)))
process_combinations = pd.concat([train_process_combinations, test_process_combinations], axis=0)

# Recipe metadata: the per-phase columns are weighted with the same bit values
# as above and summed into a single 'recipe' code (final_rinse is dropped).
recipe_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/recipe_metadata.csv', index_col='process_id')
recipe_df = recipe_df.drop('final_rinse', axis=1)
recipe_df['pre_rinse_num'] = recipe_df['pre_rinse'] * 1
recipe_df['caustic_num'] = recipe_df['caustic'] * 2
recipe_df['intermediate_rinse_num'] = recipe_df['intermediate_rinse'] * 4
recipe_df['acid_num'] = recipe_df['acid'] * 8
recipe_df['recipe'] = recipe_df['pre_rinse_num'] + recipe_df['caustic_num'] + recipe_df['intermediate_rinse_num'] + recipe_df['acid_num']

In [0]:
# Real-valued time-series columns, including derived ones. 'target_value' and
# 'flow_diff' are created in create_feature_matrix; the '*_log' columns are not
# created anywhere in the visible cells -- NOTE(review): ts_real itself is not
# referenced by the visible code, confirm whether it is still needed.
ts_real = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value',
    'flow_diff',
    'supply_flow_log',
    'return_flow_log'
]

# variables we'll use to create our time series features
# (aggregated per process in encode_real_timeseries)
ts_cols = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value',
    'flow_diff',
    #'supply_flow_log',
    #'return_flow_log'
]

# variables for binary time series features
# (aggregated per process in encode_binary_timeseries)
bin_cols = [
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain',
    'object_low_level',
    'tank_lsh_caustic',
    'tank_lsh_acid',
    'tank_lsh_clean_water',
    'tank_lsh_pre_rinse'
]

# Phase-combination code -> list of phases it stands for. The code is the sum
# of the per-phase values used for 'phase_int' when the data is loaded
# (pre_rinse=1, caustic=2, intermediate_rinse=4, acid=8).
process_comb_to_phases = {
    15: ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid'],
    3:  ['pre_rinse', 'caustic'],
    7:  ['pre_rinse', 'caustic', 'intermediate_rinse'],
    1:  ['pre_rinse'],
    8:  ['acid'],
    2:  ['caustic'],
    6:  ['caustic', 'intermediate_rinse'],
    14: ['caustic', 'intermediate_rinse', 'acid'],
}

# phases, ordered from earliest to latest
phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

def encode_categorical(df):
    """One-hot encode the per-process categorical metadata.

    Adapted from http://drivendata.co/blog/rinse-over-run-benchmark/

    Takes the distinct (process_id, pipeline, object_id) rows of ``df``,
    indexes them by ``process_id``, integer-divides ``object_id`` by 10 and
    returns dummy columns for ``pipeline`` and the coarsened ``object_id``.
    The ``pipeline_L12`` dummy is removed when present.
    """
    id_columns = ['process_id', 'pipeline', 'object_id']
    per_process = df[id_columns].drop_duplicates().set_index('process_id')

    # Coarsen the object id: keep everything but its last decimal digit
    per_process = per_process.assign(object_id=per_process['object_id'] // 10)

    dummies = pd.get_dummies(per_process, columns=['pipeline', 'object_id'])

    # pipeline L12 not in test data (so useless feature)
    if 'pipeline_L12' in dummies.columns:
        dummies = dummies.drop(columns=['pipeline_L12'])

    return dummies
  
def count_zeros(x):
  """Return how many entries of ``x`` compare equal to zero."""
  is_zero = x == 0
  return np.sum(is_zero)
  
def encode_real_timeseries(df):
    """Aggregate the real-valued sensor columns (``ts_cols``) per process.

    For every column the following statistics are computed per ``process_id``:
    min, max, mean, std, count, median, sum, the mean of the last five rows
    and the number of zero readings. Every resulting column label is rendered
    with ``'real_{}'.format(label)``.
    """
    per_process = (df[['process_id'] + ts_cols]
                   .set_index('process_id')
                   .groupby('process_id'))

    aggregations = ['min', 'max', 'mean', 'std',
                    'count', 'median', 'sum',
                    lambda x: x.tail(5).mean(),  # mean of the last 5 readings
                    count_zeros]
    ts_features = per_process.agg(aggregations)

    # Flatten the column labels into plain prefixed strings
    ts_features.columns = ['real_{}'.format(label) for label in ts_features.columns]

    return ts_features

def encode_binary_timeseries(df):
    """Aggregate the binary signal columns (``bin_cols``) per process.

    For every column the mean, std, mean of the last five rows and the number
    of zero readings are computed per ``process_id``. Every resulting column
    label is rendered with ``'bin_{}'.format(label)``.
    """
    per_process = (df[['process_id'] + bin_cols]
                   .set_index('process_id')
                   .groupby('process_id'))

    aggregations = ['mean', 'std',
                    lambda x: x.tail(5).mean(),  # mean of the last 5 readings
                    count_zeros]
    ts_features = per_process.agg(aggregations)

    # Flatten the column labels into plain prefixed strings
    ts_features.columns = ['bin_{}'.format(label) for label in ts_features.columns]

    return ts_features
  
def get_tsfresh_features(df):
    """Extract a hand-picked subset of tsfresh features per process.

    Only the calculators named in ``filtered_funcs`` are used, each with the
    parameters it has in ``EfficientFCParameters``. Features are extracted for
    return_turbidity, return_flow, supply_flow, target_value and flow_diff,
    grouped by ``process_id`` and ordered by ``timestamp``; the result is
    passed through tsfresh's ``impute``.
    """
    filtered_funcs = ['abs_energy', 'mean_abs_change', 'mean_change', 
                      'skewness', 'kurtosis', 'absolute_sum_of_changes', 
                      'longest_strike_below_mean', 'longest_strike_above_mean', 
                      'count_above_mean', 'count_below_mean', 'last_location_of_maximum', 
                      'first_location_of_maximum', 'last_location_of_minimum', 
                      'first_location_of_minimum', 
                      'percentage_of_reoccurring_datapoints_to_all_datapoints', 
                      'percentage_of_reoccurring_values_to_all_values', 
                      'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points', 
                      'ratio_value_number_to_time_series_length', 'maximum', 'minimum', 
                      'cid_ce', 'symmetry_looking', 'large_standard_deviation', 'quantile', 
                      'autocorrelation', 'number_peaks', 'binned_entropy', 'index_mass_quantile', 
                      'linear_trend',  'number_crossing_m']
    # Further calculators that are currently switched off:
#     new_funcs = ['augmented_dickey_fuller', 'number_cwt_peaks', 'agg_autocorrelation',
#                'spkt_welch_density', 'friedrich_coefficients', 'max_langevin_fixed_point',
#                'c3', 'ar_coefficient', 'mean_second_derivative_central', 'ratio_beyond_r_sigma',
#                'energy_ratio_by_chunks', 'partial_autocorrelation',
#                'fft_aggregated', 'time_reversal_asymmetry_statistic', 'range_count']
#     filtered_funcs += new_funcs

    # Look up the parameter grid of each selected calculator
    all_settings = EfficientFCParameters()
    filtered_settings = {func: all_settings[func] for func in filtered_funcs}

    value_columns = ['return_turbidity', 'return_flow', 'supply_flow', 'target_value', 'flow_diff']
    ts_features = extract_features(df[['process_id', 'timestamp'] + value_columns], 
                                   column_id='process_id', column_sort="timestamp", 
                                   column_kind=None, column_value=None,
                                   impute_function=impute, 
                                   default_fc_parameters=filtered_settings,
                                   show_warnings=False)

    return ts_features
                                       

def create_feature_matrix(df, processes, phases):
    """Build one feature row per process from its raw time series.

    Parameters
    ----------
    df : DataFrame of per-timestamp readings; must contain 'process_id',
        'phase', 'timestamp', 'pipeline', 'object_id' and the sensor columns
        used by the encoders. It is NOT modified.
    processes : iterable of process ids to include.
    phases : ordered list of phase names to include; the last entry is
        treated as the "last phase" for the extra ``last_*`` features.

    Returns
    -------
    DataFrame indexed by process_id containing, joined on the index:
    categorical dummies, real-valued aggregates, binary aggregates, tsfresh
    features and -- when more than one phase is requested -- the real/binary
    aggregates of the last phase only, prefixed with ``last_``.
    """
    # Select the relevant rows first and work on a private copy. Previously
    # the derived columns were written into the caller's frame (mutating
    # train_df / all_data) and computed for every row of the full data set.
    phase_data = df[df['process_id'].isin(processes) & df['phase'].isin(phases)].copy()

    # Negative flow readings are clamped to zero (NaN stays NaN)
    phase_data['return_flow'] = phase_data['return_flow'].clip(lower=0)
    phase_data['supply_flow'] = phase_data['supply_flow'].clip(lower=0)
    phase_data['target_value'] = phase_data['return_flow'] * phase_data['return_turbidity']
    phase_data['flow_diff'] = phase_data['supply_flow'] - phase_data['return_flow']

    metadata = encode_categorical(phase_data)
    feature_blocks = [
        encode_real_timeseries(phase_data),
        encode_binary_timeseries(phase_data),
        get_tsfresh_features(phase_data),
    ]

    if len(phases) > 1:
        # Same aggregates again, restricted to the most recent phase
        last_phase_data = phase_data[phase_data['phase'] == phases[-1]]
        feature_blocks.append(encode_real_timeseries(last_phase_data).add_prefix('last_'))
        feature_blocks.append(encode_binary_timeseries(last_phase_data).add_prefix('last_'))

    # join metadata and time series features into a single dataframe
    feature_matrix = metadata
    for block in feature_blocks:
        feature_matrix = feature_matrix.merge(block, left_index=True, right_index=True)

    return feature_matrix
    
  
def get_processes(data, phases, train=True):
    """Select the process ids whose recorded phases match ``phases``.

    Parameters
    ----------
    data : DataFrame with 'process_id' and 'phase' columns.
    phases : iterable of phase names.
    train : if True, keep processes that contain *at least* the given phases;
        if False, keep processes that contain *exactly* the given phases.

    Returns
    -------
    list of matching process ids.
    """
    wanted = set(phases)

    # Collect the phases of every process in a single pass instead of
    # re-filtering the whole frame once per process.
    phases_by_process = {
        process: set(seen)
        for process, seen in data.groupby('process_id')['phase'].unique().items()
    }

    filtered_processes = []
    for process in set(data['process_id']):
        process_phases = phases_by_process.get(process, set())
        if train:
            if wanted.issubset(process_phases):
                filtered_processes.append(process)
        elif wanted == process_phases:
            filtered_processes.append(process)
    return filtered_processes

In [0]:
def custom_mape(approxes, targets, threshold=290000):
    """Mean absolute percentage error with a floor on the denominator.

    Each absolute error ``|approx - target|`` is divided by
    ``max(|target|, threshold)`` and the mean of those ratios is returned.

    Parameters
    ----------
    approxes : predicted values.
    targets : true values (these determine the denominator).
    threshold : lower bound applied to the denominator; defaults to the
        290000 that was previously hard-coded.
    """
    abs_error = np.abs(np.subtract(approxes, targets))
    denominator = np.maximum(np.abs(targets), threshold)
    return np.mean(abs_error / denominator)

class MAPEMetric(object):
    """Metric object wrapping ``custom_mape`` for log-transformed targets.

    NOTE(review): the method names look like CatBoost's custom eval-metric
    protocol, but the visible CatBoostRegressor calls use the built-in 'MAPE'
    string instead of this class -- confirm whether it is still used.
    """

    def is_max_optimal(self):
        """Lower values of this metric are better."""
        return False

    def evaluate(self, approxes, targets, weight):
        """Return ``(custom_mape on the exp-transformed inputs, len(targets))``.

        ``weight`` is accepted but not used.
        """
        score = custom_mape(np.exp(approxes), np.exp(targets))
        return score, len(targets)

    def get_final_error(self, error, weight):
        """Return ``error`` unchanged; ``weight`` is ignored."""
        return error

In [0]:
from sklearn.base import clone

def fit_stack(clf, name, X_train, y_train, X_test, n_splits=5, random_state=None):
  """Produce out-of-fold predictions of ``clf`` to use as a stacking feature.

  Both feature frames are standardised with a scaler fitted on ``X_train``.
  ``clf`` is then cloned and fitted once per fold of a shuffled K-fold split:
  the hold-out rows receive that fold's prediction, and ``X_test`` is
  predicted by every fold model. Predictions are clamped to
  ``[0, max(y of the fold's fitting rows)]``.

  Parameters
  ----------
  clf : unfitted scikit-learn style estimator (cloned per fold).
  name : label used for log lines and the output column ``'<name>_pred'``.
  X_train, y_train : training features / targets. The targets are passed
      through ``np.exp`` for the logged MAPE, i.e. they are treated as logs.
  X_test : features to predict; must have the same columns as ``X_train``.
  n_splits : number of folds.
  random_state : optional seed for the fold shuffling. The default (None)
      keeps the previous unseeded behaviour.

  Returns
  -------
  (train_predictions_df, test_predictions_df), indexed like ``X_train`` and
  ``X_test`` respectively, each with the single column ``'<name>_pred'``.
  """
  scaler = StandardScaler()
  cols = X_train.columns
  X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=X_train.index)
  X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index)

  pred_col = '{}_pred'.format(name)
  train_predictions = np.zeros((len(X_train),))
  test_predictions = np.zeros((len(X_test), n_splits))
  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  for fold_ix, (fit_rows, holdout_rows) in enumerate(kf.split(X_train, y_train)):
    X_cv_train = X_train.iloc[fit_rows, :]
    X_cv_test = X_train.iloc[holdout_rows, :]
    y_cv_train = y_train.iloc[fit_rows]
    y_cv_test = y_train.iloc[holdout_rows]

    clf_clone = clone(clf)
    clf_clone.fit(X_cv_train, y_cv_train)

    # Predict the hold-out rows once and reuse the result for logging and
    # for the out-of-fold feature.
    holdout_pred = clf_clone.predict(X_cv_test)

    # custom_mape expects (predictions, truth): the denominator must come
    # from the true values. The arguments used to be passed the other way
    # round, which skewed the logged score.
    fold_mape = custom_mape(np.exp(holdout_pred), np.exp(y_cv_test))
    print('[{}] Fold #{} MAPE={}'.format(name, fold_ix + 1, fold_mape))

    upper_bound = np.max(y_cv_train)
    train_predictions[holdout_rows] = np.minimum(upper_bound, np.maximum(0, holdout_pred))
    test_predictions[:, fold_ix] = np.minimum(upper_bound, np.maximum(0, clf_clone.predict(X_test)))

  train_predictions_df = pd.DataFrame(train_predictions, index=X_train.index, columns=[pred_col])
  # Taking min instead of mean, since undershooting is better than overshooting for MAPE
  test_predictions_df = pd.DataFrame(np.min(test_predictions, axis=1), index=X_test.index, columns=[pred_col])

  return train_predictions_df, test_predictions_df

In [0]:
#from tsfresh.feature_selection.relevance import calculate_relevance_table

def get_corr_features(X):
  """Return the positional (row, col) pairs of distinct columns of ``X``
  whose correlation is exactly 1. Each pair shows up in both orientations."""
  perfect_rows, perfect_cols = np.where(X.corr() == 1)
  diagonal = set([(k, k) for k in range(X.shape[1])])
  all_pairs = set(list(zip(perfect_rows, perfect_cols)))
  return all_pairs - diagonal

def get_uncorr_features(data):
  """Return column names of ``data`` with perfectly correlated duplicates removed.

  Columns that are not part of any correlation-equals-1 pair are always kept.
  Among the remaining columns, one column of a pair is dropped at a time and
  the correlations are recomputed, until no perfectly correlated pair is left.
  The returned list has no defined order (it is built from a set).
  """
  X_train_corr = data.copy()
  correlated_features = get_corr_features(X_train_corr)
  
  # Positional indices of every column involved in at least one perfect pair
  corr_cols = set()
  for row_idx, col_idx in correlated_features:
    corr_cols.add(row_idx)
    corr_cols.add(col_idx)
  
  # Names of the columns that are never perfectly correlated with another one
  uncorr_cols = list(set(X_train_corr.columns) - set(X_train_corr.columns[list(corr_cols)]))
   
  # Keep only the columns that take part in a perfect pair
  col_mask = [False]*X_train_corr.shape[1]
  for col in corr_cols:
    col_mask[col] = True
  X_train_corr = X_train_corr.loc[:, col_mask]
  
  # Recompute the pairs: positions now refer to the reduced frame
  correlated_features = get_corr_features(X_train_corr)
  
  while correlated_features:
    print('{} correlated feature pairs left...'.format(len(correlated_features)))
    # set.pop() returns an arbitrary pair, so which member of a correlated
    # group survives is not specified.
    corr_row, corr_col = correlated_features.pop()
    col_mask = [True]*X_train_corr.shape[1]
    col_mask[corr_row] = False
    X_train_corr = X_train_corr.loc[:, col_mask]
    # Positions shifted after the drop, so the pairs must be recomputed
    correlated_features = get_corr_features(X_train_corr)
  return list(set(list(X_train_corr.columns) + uncorr_cols))

def remove_features(data, target, p_val=0.25):
  """Return the names of the columns of ``data`` that should be dropped.

  A column is listed when it holds a single distinct value or when
  ``get_uncorr_features`` does not keep it (i.e. it duplicates another column
  with correlation 1). ``target`` and ``p_val`` are accepted but not used.
  The order of the returned list is not defined.
  """
  constant_cols = list(data.columns[data.nunique() == 1])

  kept_cols = get_uncorr_features(data)
  redundant_cols = list(set(data.columns) - set(kept_cols))

  return list(set(constant_cols + redundant_cols))

In [0]:
# Recipe code -> phase-combination codes to evaluate for that recipe
combinations_per_recipe = {
    3: [3], 
    9: [8],
    15: [1]
}

import warnings; warnings.filterwarnings('ignore')

prediction_df = None
# (recipe, combination) -> list of per-fold test MAPEs
all_mapes = defaultdict(list)
for recipe in [15]:
    recipe_process_ids = recipe_df[recipe_df['recipe'] == recipe].index
    recipe_train_data = train_df[train_df['process_id'].isin(recipe_process_ids)]
    recipe_test_data = test_df[test_df['process_id'].isin(recipe_process_ids)]
    for process_combination in combinations_per_recipe[recipe]:
        combination_phases = process_comb_to_phases[process_combination]
        print('Recipe = {} || Combination = {}'.format(recipe, process_combination))
        train_processes = get_processes(recipe_train_data, combination_phases)
        phase_features = create_feature_matrix(train_df, train_processes, combination_phases)

        X = phase_features.loc[train_processes]
        # The model is trained on the log of the label
        y = np.log(label_df.loc[X.index]['final_rinse_total_turbidity_liter'])

        to_drop = remove_features(X, y)
        X = X.drop(to_drop, axis=1)

        # shuffle=True is required for random_state to have any effect;
        # recent scikit-learn versions raise a ValueError when random_state
        # is given without it.
        kf = KFold(n_splits=5, shuffle=True, random_state=2019)
        mapes = []
        for train_idx, test_idx in kf.split(X, y):
            X_train = X.iloc[train_idx, :]
            X_test = X.iloc[test_idx, :]

            y_train = y.iloc[train_idx]
            y_test = y.iloc[test_idx]

            # Level-0 learners; each one contributes an out-of-fold
            # '<name>_pred' column that later learners and CatBoost can use.
            clfs = [
                ('knn', GridSearchCV(KNeighborsRegressor(), {'n_neighbors': [3, 5, 10, 25, 100]})),
                ('lr', GridSearchCV(Lasso(max_iter=10000), {'alpha': [1.0, 10.0, 100.0, 1000.0]})),
                ('knn_pca', GridSearchCV(
                    Pipeline(steps=[('pca', PCA()), ('knn', KNeighborsRegressor())]),
                    {'knn__n_neighbors': [10, 25, 100], 'pca__n_components': [5, 10, 25]}
                  )
                ),
                ('mlp', GridSearchCV(MLPRegressor(max_iter=1000), {'hidden_layer_sizes': [(100,), (250,), (100, 100)]})),
                ('rf_25', RandomForestRegressor(n_estimators=25)),
                ('rf_100', RandomForestRegressor(n_estimators=100)),
                ('rf_250', RandomForestRegressor(n_estimators=250)),
                ('et_25', ExtraTreesRegressor(n_estimators=25)),
                ('et_100', ExtraTreesRegressor(n_estimators=100)),
                ('et_250', ExtraTreesRegressor(n_estimators=250)),
            ]

            for name, clf in clfs:
                train_pred_df, test_pred_df = fit_stack(clf, name, X_train, y_train, X_test)
                X_train = pd.concat([X_train, train_pred_df], axis=1)
                X_test = pd.concat([X_test, test_pred_df], axis=1)

            # Random 90/10 split of the fold's training rows; the 10% is used
            # as CatBoost's eval set (early stopping via od_type/od_wait).
            train_idx = np.random.choice(X_train.index, replace=False, size=int(0.9 * len(X_train)))
            val_idx = list(set(X_train.index) - set(train_idx))

            X_val = X_train.loc[val_idx, :]
            y_val = y_train.loc[val_idx]
            X_train = X_train.loc[train_idx, :]
            y_train = y_train.loc[train_idx]

            if (recipe, process_combination) in [(9, 8)]:
                # Augment the training rows with recipe-15 processes
                recipe_15_train_data = train_df[train_df['process_id'].isin(recipe_df[recipe_df['recipe'] == 15].index)]

                extra_processes = get_processes(recipe_15_train_data, combination_phases)
                extra_phase_data = train_df[(train_df['process_id'].isin(extra_processes)) &
                                            ((train_df['phase'].isin(combination_phases)))]

                extra_phase_features = create_feature_matrix(train_df, extra_processes, combination_phases)
                X_extra = extra_phase_features.loc[list(set(extra_phase_data['process_id']))]
                # WARNING: This does not work!!!! Use log-transform when we augment
                y_extra = np.log(label_df.loc[X_extra.index]['final_rinse_total_turbidity_liter'])

                # Columns the extra rows lack (e.g. the stacked '<name>_pred'
                # columns) are filled with zeros ...
                for col in set(X_train.columns) - set(X_extra.columns):
                    X_extra[col] = 0
                # ... and the frame is aligned to X_train. Selecting X.columns
                # here dropped the columns that had just been filled in.
                X_extra = X_extra[X_train.columns]

                X_train = pd.concat([X_train, X_extra])
                y_train = pd.concat([y_train, y_extra])

            print(list(X_train.columns))

            print('CV TRAIN = {} || CV VAL = {} || CV TEST = {}'.format(X_train.shape, X_val.shape, X_test.shape))

            cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, 
                                    learning_rate=0.33,
                                    loss_function='MAPE', eval_metric='MAPE', task_type='GPU')
            cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)

            # SHAP summary over every row seen in this fold
            X_all = pd.concat([X_train, X_val, X_test])
            explainer = shap.TreeExplainer(cat)
            shap_values = explainer.shap_values(X_all)

            plt.figure()
            shap.summary_plot(shap_values, X_all, max_display=30, 
                              auto_size_plot=True, show=False, color_bar=False)
            plt.show()

            # Back-transform from log space before scoring
            predictions = np.exp(cat.predict(X_test))
            mape = custom_mape(predictions, np.exp(y_test))
            print('TEST MAPE = {}'.format(mape))
            mapes.append(mape)
            all_mapes[(recipe, process_combination)].append(mape)

        print('Combination = {}, MAPE = {}+-{}'.format(process_combination, np.mean(mapes), np.std(mapes)))

    print('Recipe {}: MAPES: {}'.format(recipe, all_mapes))

for k in all_mapes:
    print(k, np.mean(all_mapes[k]), np.std(all_mapes[k]))

In [0]:
# Recipe code -> phase-combination codes to build a model for
combinations_per_recipe = {
    3: [3], #1, 2, 
    9: [8],
    15: [1, 3, 7, 15] # 2, 6, 14
}

prediction_df = None
import warnings; warnings.filterwarnings('ignore')
for recipe in [3]:
    recipe_process_ids = recipe_df[recipe_df['recipe'] == recipe].index
    recipe_train_data = train_df[train_df['process_id'].isin(recipe_process_ids)]
    recipe_test_data = test_df[test_df['process_id'].isin(recipe_process_ids)]
    for process_combination in combinations_per_recipe[recipe]:
        combination_phases = process_comb_to_phases[process_combination]
        print('Recipe = {} || Combination = {}'.format(recipe, process_combination))
        # Train: processes containing at least these phases.
        # Test: processes containing exactly these phases.
        train_processes = get_processes(recipe_train_data, combination_phases)
        test_processes = get_processes(recipe_test_data, combination_phases, train=False)
        all_processes = train_processes + test_processes

        phase_features = create_feature_matrix(all_data, all_processes, combination_phases)

        X_train = phase_features.loc[train_processes]
        X_test = phase_features.loc[test_processes]

        # The model is trained on the log of the label
        y_train = np.log(label_df.loc[X_train.index]['final_rinse_total_turbidity_liter'])

        to_drop = remove_features(X_train, y_train)
        print(len(to_drop), to_drop)

        X_train = X_train.drop(to_drop, axis=1)
        X_test = X_test.drop(to_drop, axis=1)

        if (recipe, process_combination) in [(9, 8)]:
            # Augment with recipe-15 processes. The guard already implies
            # recipe == 9, so the former "else: use recipe 3" branch could
            # never run and has been removed.
            recipe_15_train_data = train_df[train_df['process_id'].isin(recipe_df[recipe_df['recipe'] == 15].index)]

            extra_processes = get_processes(recipe_15_train_data, combination_phases)
            extra_phase_data = train_df[(train_df['process_id'].isin(extra_processes)) &
                                        ((train_df['phase'].isin(combination_phases)))]
            extra_phase_features = create_feature_matrix(all_data, extra_processes, combination_phases)
            X_extra = extra_phase_features.loc[list(set(extra_phase_data['process_id']))]
            y_extra = np.log(label_df.loc[X_extra.index]['final_rinse_total_turbidity_liter'])

            # Zero-fill columns the extra rows lack, then align to X_train
            for col in set(X_train.columns) - set(X_extra.columns):
                X_extra[col] = 0
            X_extra = X_extra[X_train.columns]

            X_train = pd.concat([X_train, X_extra])
            y_train = pd.concat([y_train, y_extra])

        # Level-0 learners; each one contributes an out-of-fold
        # '<name>_pred' column that later learners and CatBoost can use.
        clfs = [
            ('knn', GridSearchCV(KNeighborsRegressor(), {'n_neighbors': [3, 5, 10, 25, 100]})),
            ('lr', GridSearchCV(Lasso(max_iter=10000), {'alpha': [1.0, 10.0, 100.0, 1000.0]})),
            ('knn_pca', GridSearchCV(
                Pipeline(steps=[('pca', PCA()), ('knn', KNeighborsRegressor())]),
                {'knn__n_neighbors': [10, 25, 100], 'pca__n_components': [5, 10, 25]}
              )
            ),
            ('mlp', GridSearchCV(MLPRegressor(max_iter=1000), {'hidden_layer_sizes': [(100,), (250,), (100, 100)]})),
            ('rf_25', RandomForestRegressor(n_estimators=25)),
            ('rf_100', RandomForestRegressor(n_estimators=100)),
            ('rf_250', RandomForestRegressor(n_estimators=250)),
            ('et_25', ExtraTreesRegressor(n_estimators=25)),
            ('et_100', ExtraTreesRegressor(n_estimators=100)),
            ('et_250', ExtraTreesRegressor(n_estimators=250)),
            #('tsne_knn', Pipeline(steps=[('tsne', TSNE(n_components=2)), ('knn', GridSearchCV(KNeighborsRegressor(), {'n_neighbors': [3, 5, 10, 25, 100]}))]))
        ]

        for name, clf in clfs:
            train_pred_df, test_pred_df = fit_stack(clf, name, X_train, y_train, X_test, n_splits=5)
            X_train = pd.concat([X_train, train_pred_df], axis=1)
            X_test = pd.concat([X_test, test_pred_df], axis=1)

        # Random 90/10 split; the 10% is used as CatBoost's eval set
        # (early stopping via od_type/od_wait).
        train_idx = np.random.choice(X_train.index, replace=False, size=int(0.9 * len(X_train)))
        val_idx = list(set(X_train.index) - set(train_idx))

        X_val = X_train.loc[val_idx, :]
        X_train = X_train.loc[train_idx, :]
        y_val = y_train.loc[val_idx]
        y_train = y_train.loc[train_idx]

        print(X_train.shape, X_val.shape, X_test.shape)

        cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, learning_rate=0.33,
                                loss_function='MAPE', eval_metric='MAPE', task_type='GPU')
        cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)

        # Back-transform from log space to the original label scale
        predictions = np.exp(cat.predict(X_test))

        sub_predictions_df = pd.DataFrame(predictions, columns=['final_rinse_total_turbidity_liter'])
        sub_predictions_df.index = X_test.index
        sub_predictions_df.index.name = X_test.index.name

        # Accumulate the predictions of every (recipe, combination) model
        if prediction_df is None:
            prediction_df = sub_predictions_df
        else:
            prediction_df = pd.concat([prediction_df, sub_predictions_df])

        del cat

In [0]:
# Write the accumulated predictions, ordered by index, to Drive.
# X_test is the frame left over from the last iteration of the cell above.
submission_path = '/content/drive/My Drive/Rinse Over Run/stacking_v2_not_all_models.csv'

prediction_df = prediction_df.sort_index()
prediction_df.index.name = X_test.index.name
prediction_df.to_csv(submission_path)

print(len(prediction_df))