In [0]:
!pip install catboost



In [0]:
# The essentials
import pandas as pd
import numpy as np

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

# Gradient Boosting
from catboost import CatBoostRegressor

from collections import defaultdict

In [0]:
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
dtw_distances_3.p
extended_phase_predictors.csv
last_cleaned_test.csv
last_cleaned_train.csv
mds_embeddings_2d_3.csv
mds_embeddings_2d_3.p
more_features_with_preds_per_phase.csv
pca_features_with_preds_per_phase.csv
predictions_machine_405.csv
processes_all_phases.p
test_features_14.csv
test_features_15.csv
test_features_1.csv
test_features_2.csv
test_features_3.csv
test_features_6.csv
test_features_7.csv
test_features_8.csv
test_features_per_phase_14.csv
test_features_per_phase_15.csv
test_features_per_phase_1.csv
test_features_per_phase_2.csv
test_features_per_phase_3.csv
test_features_per_phase_6.csv
test_features_

In [0]:
train_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_values.csv', index_col=0, parse_dates=['timestamp'])
test_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_values.csv', index_col=0, parse_dates=['timestamp'])
label_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col='process_id')

  mask |= (ar1 == a)


In [0]:
# variables we'll use to create our time series features
ts_cols = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value'
]

# variables for binary time series features
bin_cols = [
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain',
    'object_low_level'
]

# phases, ordered from earliest to latest
phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

In [0]:
from tsfresh.feature_extraction.feature_calculators import number_cwt_peaks

def encode_categorical(df):
    # Currently just copy-pasted from http://drivendata.co/blog/rinse-over-run-benchmark/
    
    # select process_id and pipeline
    meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id') 
    
    # convert categorical pipeline data to dummy variables
    meta = pd.get_dummies(meta)
    
    # pipeline L12 not in test data (so useless feature)
    if 'pipeline_L12' in meta:
        meta = meta.drop('pipeline_L12', axis=1)
    
    # calculate number of phases for each process_object
    meta['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())
    
    return meta

def percentile_25(x):
  return np.percentile(x, 0.25)

def percentile_75(x):
  return np.percentile(x, 0.75)

def fft_coeffs_real(x, coef):
  return np.fft.rfft(x)[coef].real

def fft_coeffs_imag(x, coef):
  return np.fft.rfft(x)[coef].imag

def fft_coeffs_abs(x, coef):
  return np.fft.rfft(x)[coef].abs

def fft_coeffs_angle(x, coef):
  return np.fft.rfft(x)[coef].angle

def fft_mean(x):
  return np.mean(np.abs(np.fft.rfft(x)[:250]))

def fft_std(x):
  return np.std(np.abs(np.fft.rfft(x)[:250]))

def cwt_peaks(x):
  return number_cwt_peaks(x, 5)
  
def encode_real_timeseries(df):   
    ts_df = df[['process_id'] + ts_cols].set_index('process_id')
    
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', 
                                                   'count', 'median', 'sum', 
                                                   'mad'])
    
    # Now we will get the mean + variance value of the last K measurements for each phase
    # TODO: This can be moved to extracting features per phase
    all_vals_per_phase = []
    K = 5
    col_names = ['process_id'] 
    for phase in phases:
        for col in ts_cols:
            col_names.extend(['mean_{}_{}_{}'.format(col, K, phase), 
                              'std_{}_{}_{}'.format(col, K, phase)])
    for process in tqdm(ts_features.index, total=len(ts_features)):
        vals_per_phase = [process]
        process_filtered_df = df[df['process_id'] == process]
        for phase in phases:
            filtered_df = process_filtered_df[process_filtered_df['phase'] == phase].tail(K)
            for col in ts_cols:
                vals_per_phase.extend([filtered_df[col].mean(), filtered_df[col].std()])
                
        all_vals_per_phase.append(vals_per_phase)
    values_df = pd.DataFrame(all_vals_per_phase, columns=col_names)
    values_df = values_df.set_index('process_id')
    
    ts_features = ts_features.merge(values_df, left_index=True, right_index=True)
    
    col_map = {}
    for col in ts_features.columns:
        col_map[col] = 'real_{}'.format(col)
    ts_features = ts_features.rename(columns=col_map)
    
    return ts_features

def encode_binary_timeseries(df):
    ts_df = df[['process_id'] + bin_cols].set_index('process_id')
            
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['mean', 'std', 'count', 'sum', 'mad'])
    
    # TODO: Count fraction of True in each phase
    feature_vectors = []
    col_names = ['process_id'] 
    for phase in phases:
        for col in bin_cols:
            col_names.append('fraction_{}_{}'.format(col, phase))
            
    # Get fraction of True values for each binary timeseries
    # TODO: This can be moved to extracting features per phase
    for process in tqdm(set(df['process_id']), total=len(set(df['process_id']))):
        vector = [process]
        process_filtered_df = df[df['process_id'] == process]
        for phase in phases:
            filtered_df = process_filtered_df[process_filtered_df['phase'] == phase]
            for col in bin_cols:
                if len(filtered_df):
                    vector.append(sum(filtered_df[col]) / len(filtered_df))
                else:
                    vector.append(np.NaN)
                
        feature_vectors.append(vector)
                
    feature_df = pd.DataFrame(feature_vectors, columns=col_names)
    feature_df = feature_df.set_index('process_id')
    
    feature_df = feature_df.merge(ts_features, left_index=True, right_index=True)
    col_map = {}
    for col in feature_df.columns:
        col_map[col] = 'bin_{}'.format(col)
    feature_df = feature_df.rename(columns=col_map)
    
    return feature_df

def get_descript(data, functions, cols):
    ts_df = data.set_index('process_id').sort_values(by='timestamp')
    return ts_df.groupby('process_id')[cols].agg(functions)  
  
  
def get_descript_prev_process(data):
    machines = set(data['object_id'])
    all_features = []
    for machine in tqdm(machines):
        machine_data = data[data['object_id'] == machine]
        machine_data = machine_data.sort_values(by='timestamp')
        machine_processes = machine_data['process_id'].unique()
        for process_ix, process in enumerate(machine_processes):
            if process_ix > 0:
                prev_process = machine_data[machine_data['process_id'] == machine_processes[process_ix - 1]]
                this_process = machine_data[machine_data['process_id'] == machine_processes[process_ix]]
                features = get_descript(prev_process, ['mean', 'std', 'min', 'max', 'count'], ts_cols)
                _columns = list(features.columns)
                assert len(features) == 1
                features = features.iloc[0, :].values
                time_delta = (this_process['timestamp'].values[0] - prev_process['timestamp'].values[-1]) / np.timedelta64(1, 'h')
                assert time_delta > 0
                all_features.append([machine, process, time_delta] + list(features))
            else:
                all_features.append([machine, process, np.NaN] + ([np.NaN] * 60))
                
    all_features = pd.DataFrame(all_features, columns=['object_id', 'process_id', 'time_delta'] + _columns)
    all_features = all_features.set_index('process_id', drop=True)
    col_map = {}
    for col in all_features.columns:
        col_map[col] = 'prev_{}'.format(col)
    all_features = all_features.rename(columns=col_map)
    return all_features

def create_feature_matrix(df):
    df['return_flow_relu'] = df['return_flow'].apply(lambda x: max(0, x))
    df['target_value'] = df['return_flow_relu'] * df['return_turbidity']
    
    prev_features = get_descript_prev_process(df)
    metadata = encode_categorical(df)
    time_series = encode_real_timeseries(df)
    binary_features = encode_binary_timeseries(df)
    
    # join metadata and time series features into a single dataframe
    feature_matrix = metadata
    feature_matrix = feature_matrix.merge(time_series, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(binary_features, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(prev_features, left_index=True, right_index=True)
    
    return feature_matrix

In [0]:
train_features_per_phase = {}
test_features_per_phase = {}
for phase in phases:
  print(phase)
  train_phase_data = train_df[train_df['phase'] == phase] 
  test_phase_data = test_df[test_df['phase'] == phase] 
  train_features = create_feature_matrix(train_phase_data)
  test_features = create_feature_matrix(test_phase_data)
  
  train_features_per_phase[phase] = train_features
  test_features_per_phase[phase] = test_features

pre_rinse


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 4746/4746 [01:11<00:00, 66.37it/s]
100%|██████████| 4746/4746 [00:21<00:00, 219.48it/s]
100%|██████████| 2815/2815 [00:38<00:00, 73.73it/s]
100%|██████████| 2815/2815 [00:12<00:00, 228.25it/s]


caustic


100%|██████████| 4803/4803 [01:58<00:00, 40.69it/s]
100%|██████████| 4803/4803 [00:43<00:00, 109.82it/s]
100%|██████████| 2553/2553 [00:43<00:00, 58.87it/s]
100%|██████████| 2553/2553 [00:19<00:00, 134.30it/s]


intermediate_rinse


100%|██████████| 3748/3748 [00:57<00:00, 64.83it/s]
100%|██████████| 3748/3748 [00:19<00:00, 193.30it/s]
100%|██████████| 1348/1348 [00:26<00:00, 51.63it/s]
100%|██████████| 1348/1348 [00:18<00:00, 73.92it/s]


acid


100%|██████████| 3947/3947 [01:06<00:00, 59.56it/s]
100%|██████████| 3947/3947 [00:33<00:00, 118.24it/s]
100%|██████████| 798/798 [00:19<00:00, 41.90it/s]
100%|██████████| 798/798 [00:08<00:00, 89.08it/s]


In [0]:
for phase in phases:
  train_features_per_phase[phase].to_csv('train_features_all_phase_{}.csv'.format(phase))
  test_features_per_phase[phase].to_csv('test_features_all_phase_{}.csv'.format(phase))

In [0]:
def custom_mape(approxes, targets):
    return np.mean(np.abs(np.subtract(approxes, targets)) / np.maximum(np.abs(targets), 290000))

class MAPEMetric(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, targets, weight):
        return custom_mape(np.exp(approxes), np.exp(targets)), len(targets)

In [0]:
all_train_processes = set(train_df['process_id'])
all_test_processes = set(test_df['process_id'])
validation_processes = list(np.random.choice(list(all_train_processes), replace=False, 
                                             size=int(0.1 * len(all_train_processes))))
train_processes = list(all_train_processes - set(validation_processes))

val_prediction_vectors = defaultdict(lambda: defaultdict(lambda: np.NaN))
train_prediction_vectors = defaultdict(lambda: defaultdict(lambda: np.NaN))
test_prediction_vectors = defaultdict(lambda: defaultdict(lambda: np.NaN))

ordered_phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

for phase_ix, phase in enumerate(phases):
  print('\nFitting models for {} phase'.format(phase))
  phase_train_processes = set(train_processes).intersection(train_features_per_phase[phase].index)
  phase_validation_processes = set(validation_processes).intersection(train_features_per_phase[phase].index)
  X_train = train_features_per_phase[phase].loc[phase_train_processes]
  X_val = train_features_per_phase[phase].loc[phase_validation_processes]
  X_test = test_features_per_phase[phase]
  
  if phase_ix > 0:
    train_predictions, val_predictions, test_predictions = [], [], []
    for process in X_train.index:
      preds = []
      for prev_phase in ordered_phases[:phase_ix]:
        pred = train_prediction_vectors[process].get(prev_phase, np.NaN)
        if pred > 0:
          preds.append(pred)
        else:
          preds.append(np.NaN)
      train_predictions.append(preds)
    for process in X_val.index:
      preds = []
      for prev_phase in ordered_phases[:phase_ix]:
        pred = val_prediction_vectors[process].get(prev_phase, np.NaN)
        if pred > 0:
          preds.append(pred)
        else:
          preds.append(np.NaN)
      val_predictions.append(preds)
    for process in X_test.index:
      preds = []
      for prev_phase in ordered_phases[:phase_ix]:
        pred = test_prediction_vectors[process].get(prev_phase, np.NaN)
        if pred > 0:
          preds.append(pred)
        else:
          preds.append(np.NaN)
      test_predictions.append(preds)
  
    train_pred_df = pd.DataFrame(train_predictions, columns=['pred_{}'.format(prev_phase) for prev_phase in ordered_phases[:phase_ix]], index=X_train.index)
    val_pred_df = pd.DataFrame(val_predictions, columns=['pred_{}'.format(prev_phase) for prev_phase in ordered_phases[:phase_ix]], index=X_val.index)
    test_pred_df = pd.DataFrame(test_predictions, columns=['pred_{}'.format(prev_phase) for prev_phase in ordered_phases[:phase_ix]], index=X_test.index)
  
    X_train = X_train.merge(train_pred_df, left_index=True, right_index=True)
    X_val = X_val.merge(val_pred_df, left_index=True, right_index=True)
    X_test = X_test.merge(test_pred_df, left_index=True, right_index=True)
  
  y_train = np.log(label_df.loc[phase_train_processes])
  y_val = np.log(label_df.loc[phase_validation_processes])
  
  print('\nFitting model on ALL training data')
  cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, learning_rate=0.33,
                      loss_function='MAPE', eval_metric=MAPEMetric())
  cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)
  
  print('\nGenerating predictions for validation set')
  for process in tqdm(X_val.index):
    val_prediction_vectors[process][phase] = cat.predict([X_val.loc[process]])[0]
  
  print('\nGenerating predictions for test set')
  for process in tqdm(X_test.index):
    test_prediction_vectors[process][phase] = cat.predict([X_test.loc[process]])[0]
      
  print('\nGenerating out-of-bag predictions for training set')
  # Apply CV to the training set to create out-of-sample predictions for this phase
  FOLDS = 5
  chunk_size = len(phase_train_processes) // FOLDS
  for i in range(FOLDS):
      cv_test_processes = list(phase_train_processes)[i*chunk_size:(i+1)*chunk_size]
      cv_train_processes = phase_train_processes - set(cv_test_processes)
      cv_val_processes = list(np.random.choice(list(cv_train_processes), 
                                              replace=False, 
                                              size=int(0.1 * len(cv_train_processes))))
      cv_train_processes -= set(cv_val_processes)
      
      X_cv_train = X_train.loc[cv_train_processes]
      y_cv_train = y_train.loc[cv_train_processes]
      X_cv_val = X_train.loc[cv_val_processes]
      y_cv_val = y_train.loc[cv_val_processes]
      X_cv_test = X_train.loc[cv_test_processes]
      y_cv_test = y_train.loc[cv_test_processes]
      
      print('\nFitting model on {} training samples'.format(len(cv_train_processes)))
      cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, learning_rate=0.33,
                          loss_function='MAPE', eval_metric=MAPEMetric())
      cat.fit(X_cv_train, y_cv_train, eval_set=(X_cv_val, y_cv_val), verbose=50)
      
      print('\nGenerating out-of-sample predictions for {} train samples'.format(len(cv_test_processes)))
      for process in tqdm(X_cv_test.index):
        train_prediction_vectors[process][phase] = cat.predict([X_cv_test.loc[process]])[0]
    


Fitting models for pre_rinse phase

Fitting model on ALL training data
0:	learn: 0.8751636	test: 0.8746878	best: 0.8746878 (0)	total: 173ms	remaining: 4h 47m 35s
50:	learn: 0.8751594	test: 0.8746836	best: 0.8746836 (50)	total: 11.9s	remaining: 6h 30m 9s
100:	learn: 0.8751440	test: 0.8746681	best: 0.8746681 (100)	total: 20.7s	remaining: 5h 41m 53s
150:	learn: 0.8750871	test: 0.8746106	best: 0.8746106 (150)	total: 29.4s	remaining: 5h 23m 30s
200:	learn: 0.8748746	test: 0.8743952	best: 0.8743952 (200)	total: 38.6s	remaining: 5h 19m 44s
250:	learn: 0.8740713	test: 0.8735790	best: 0.8735790 (250)	total: 48.3s	remaining: 5h 20m 6s
300:	learn: 0.8710137	test: 0.8704575	best: 0.8704575 (300)	total: 59.2s	remaining: 5h 26m 43s
350:	learn: 0.8595034	test: 0.8586545	best: 0.8586545 (350)	total: 1m 10s	remaining: 5h 31m 18s
400:	learn: 0.8292533	test: 0.8292636	best: 0.8292636 (400)	total: 1m 46s	remaining: 7h 20m 56s
450:	learn: 0.7790700	test: 0.7806789	best: 0.7806789 (450)	total: 2m 23s	remai

  3%|▎         | 15/473 [00:00<00:03, 146.44it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3214634116
bestIteration = 778

Shrink model to first 779 iterations.

Generating predictions for validation set


100%|██████████| 473/473 [00:02<00:00, 165.78it/s]
  0%|          | 14/2815 [00:00<00:20, 139.16it/s]


Generating predictions for test set


100%|██████████| 2815/2815 [00:17<00:00, 158.96it/s]



Generating out-of-bag predictions for training set

Fitting model on 3078 training samples
0:	learn: 0.8736680	test: 0.8699566	best: 0.8699566 (0)	total: 44.8ms	remaining: 1h 14m 44s
50:	learn: 0.8736638	test: 0.8699527	best: 0.8699527 (50)	total: 4.61s	remaining: 2h 30m 34s
100:	learn: 0.8736484	test: 0.8699380	best: 0.8699380 (100)	total: 7.29s	remaining: 2h 14s
150:	learn: 0.8735914	test: 0.8698834	best: 0.8698834 (150)	total: 10s	remaining: 1h 50m 26s
200:	learn: 0.8733789	test: 0.8696780	best: 0.8696780 (200)	total: 12.7s	remaining: 1h 45m 27s
250:	learn: 0.8725763	test: 0.8688968	best: 0.8688968 (250)	total: 15.4s	remaining: 1h 42m 5s
300:	learn: 0.8695219	test: 0.8659034	best: 0.8659034 (300)	total: 18.1s	remaining: 1h 40m 3s
350:	learn: 0.8581113	test: 0.8546707	best: 0.8546707 (350)	total: 22.6s	remaining: 1h 46m 45s
400:	learn: 0.8278779	test: 0.8263025	best: 0.8263025 (400)	total: 36.8s	remaining: 2h 32m 18s
450:	learn: 0.7785176	test: 0.7826278	best: 0.7826278 (450)	total:

  0%|          | 0/854 [00:00<?, ?it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3436642432
bestIteration = 758

Shrink model to first 759 iterations.

Generating out-of-sample predictions for 854 train samples


100%|██████████| 854/854 [00:05<00:00, 167.60it/s]



Fitting model on 3078 training samples
0:	learn: 0.8761872	test: 0.8638548	best: 0.8638548 (0)	total: 34.9ms	remaining: 58m 13s
50:	learn: 0.8761831	test: 0.8638504	best: 0.8638504 (50)	total: 3.92s	remaining: 2h 8m 10s
100:	learn: 0.8761679	test: 0.8638344	best: 0.8638344 (100)	total: 6.78s	remaining: 1h 51m 46s
150:	learn: 0.8761117	test: 0.8637756	best: 0.8637756 (150)	total: 9.59s	remaining: 1h 45m 39s
200:	learn: 0.8759021	test: 0.8635568	best: 0.8635568 (200)	total: 12.4s	remaining: 1h 42m 27s
250:	learn: 0.8751117	test: 0.8627340	best: 0.8627340 (250)	total: 15.8s	remaining: 1h 44m 20s
300:	learn: 0.8721084	test: 0.8596116	best: 0.8596116 (300)	total: 19.2s	remaining: 1h 46m 6s
350:	learn: 0.8608468	test: 0.8479850	best: 0.8479850 (350)	total: 23.6s	remaining: 1h 51m 43s
400:	learn: 0.8311846	test: 0.8205861	best: 0.8205861 (400)	total: 37.2s	remaining: 2h 33m 53s
450:	learn: 0.7813620	test: 0.7678254	best: 0.7678254 (450)	total: 53s	remaining: 3h 14m 49s
500:	learn: 0.6736968	

  2%|▏         | 16/854 [00:00<00:05, 153.96it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3226854999
bestIteration = 711

Shrink model to first 712 iterations.

Generating out-of-sample predictions for 854 train samples


100%|██████████| 854/854 [00:05<00:00, 143.45it/s]



Fitting model on 3078 training samples
0:	learn: 0.8780702	test: 0.8488776	best: 0.8488776 (0)	total: 44.7ms	remaining: 1h 14m 31s
50:	learn: 0.8780660	test: 0.8488732	best: 0.8488732 (50)	total: 3.94s	remaining: 2h 8m 38s
100:	learn: 0.8780509	test: 0.8488569	best: 0.8488569 (100)	total: 7.36s	remaining: 2h 1m 17s
150:	learn: 0.8779949	test: 0.8487967	best: 0.8487967 (150)	total: 10.7s	remaining: 1h 58m 13s
200:	learn: 0.8777866	test: 0.8485717	best: 0.8485717 (200)	total: 13.9s	remaining: 1h 55m 16s
250:	learn: 0.8770019	test: 0.8477214	best: 0.8477214 (250)	total: 17.5s	remaining: 1h 55m 43s
300:	learn: 0.8740217	test: 0.8444815	best: 0.8444815 (300)	total: 20.9s	remaining: 1h 55m 11s
350:	learn: 0.8628221	test: 0.8322636	best: 0.8322636 (350)	total: 26.1s	remaining: 2h 3m 23s
400:	learn: 0.8329250	test: 0.7989222	best: 0.7989222 (400)	total: 39.5s	remaining: 2h 43m 33s
450:	learn: 0.7839182	test: 0.7519162	best: 0.7519162 (450)	total: 55.6s	remaining: 3h 24m 38s
500:	learn: 0.6742

  2%|▏         | 17/854 [00:00<00:05, 165.35it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3336700763
bestIteration = 780

Shrink model to first 781 iterations.

Generating out-of-sample predictions for 854 train samples


100%|██████████| 854/854 [00:05<00:00, 166.02it/s]



Fitting model on 3078 training samples
0:	learn: 0.8762608	test: 0.8785393	best: 0.8785393 (0)	total: 34.6ms	remaining: 57m 36s
50:	learn: 0.8762566	test: 0.8785351	best: 0.8785351 (50)	total: 3.5s	remaining: 1h 54m 27s
100:	learn: 0.8762413	test: 0.8785196	best: 0.8785196 (100)	total: 6.82s	remaining: 1h 52m 22s
150:	learn: 0.8761850	test: 0.8784628	best: 0.8784628 (150)	total: 10s	remaining: 1h 50m 11s
200:	learn: 0.8759749	test: 0.8782516	best: 0.8782516 (200)	total: 13.1s	remaining: 1h 48m 22s
250:	learn: 0.8751825	test: 0.8774577	best: 0.8774577 (250)	total: 16.3s	remaining: 1h 48m 12s
300:	learn: 0.8721698	test: 0.8744445	best: 0.8744445 (300)	total: 19.5s	remaining: 1h 47m 55s
350:	learn: 0.8608768	test: 0.8631138	best: 0.8631138 (350)	total: 23.5s	remaining: 1h 51m 12s
400:	learn: 0.8314605	test: 0.8332114	best: 0.8332114 (400)	total: 37.3s	remaining: 2h 34m 24s
450:	learn: 0.7825480	test: 0.7862911	best: 0.7862911 (450)	total: 52.1s	remaining: 3h 11m 48s
500:	learn: 0.6728658

  2%|▏         | 17/854 [00:00<00:05, 165.71it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3183159748
bestIteration = 714

Shrink model to first 715 iterations.

Generating out-of-sample predictions for 854 train samples


100%|██████████| 854/854 [00:05<00:00, 162.12it/s]



Fitting model on 3078 training samples
0:	learn: 0.8776911	test: 0.8609937	best: 0.8609937 (0)	total: 38.9ms	remaining: 1h 4m 54s
50:	learn: 0.8776869	test: 0.8609896	best: 0.8609896 (50)	total: 3.89s	remaining: 2h 7m 3s
100:	learn: 0.8776716	test: 0.8609741	best: 0.8609741 (100)	total: 6.73s	remaining: 1h 50m 59s
150:	learn: 0.8776155	test: 0.8609163	best: 0.8609163 (150)	total: 9.57s	remaining: 1h 45m 29s
200:	learn: 0.8774067	test: 0.8606978	best: 0.8606978 (200)	total: 12.4s	remaining: 1h 42m 12s
250:	learn: 0.8766221	test: 0.8598638	best: 0.8598638 (250)	total: 15.1s	remaining: 1h 40m
300:	learn: 0.8736427	test: 0.8566506	best: 0.8566506 (300)	total: 17.9s	remaining: 1h 38m 34s
350:	learn: 0.8623404	test: 0.8444408	best: 0.8444408 (350)	total: 21.2s	remaining: 1h 40m 10s
400:	learn: 0.8316983	test: 0.8153571	best: 0.8153571 (400)	total: 34.7s	remaining: 2h 23m 49s
450:	learn: 0.7806979	test: 0.7718475	best: 0.7718475 (450)	total: 51.6s	remaining: 3h 9m 41s
500:	learn: 0.6703238	t

  1%|          | 5/854 [00:00<00:18, 46.53it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.323692395
bestIteration = 1098

Shrink model to first 1099 iterations.

Generating out-of-sample predictions for 854 train samples


100%|██████████| 854/854 [00:19<00:00, 44.90it/s]



Fitting models for caustic phase

Fitting model on ALL training data
0:	learn: 0.8756250	test: 0.8757443	best: 0.8757443 (0)	total: 105ms	remaining: 2h 54m 56s
50:	learn: 0.8756208	test: 0.8757401	best: 0.8757401 (50)	total: 11.2s	remaining: 6h 6m 50s
100:	learn: 0.8756051	test: 0.8757243	best: 0.8757243 (100)	total: 22.1s	remaining: 6h 4m 41s
150:	learn: 0.8755473	test: 0.8756659	best: 0.8756659 (150)	total: 33.1s	remaining: 6h 5m 9s
200:	learn: 0.8753304	test: 0.8754464	best: 0.8754464 (200)	total: 43.1s	remaining: 5h 57m 2s
250:	learn: 0.8745099	test: 0.8746141	best: 0.8746141 (250)	total: 53.1s	remaining: 5h 51m 52s
300:	learn: 0.8713821	test: 0.8714294	best: 0.8714294 (300)	total: 1m 5s	remaining: 6h 1m 40s
350:	learn: 0.8595908	test: 0.8593668	best: 0.8593668 (350)	total: 1m 29s	remaining: 7h 4m 54s
400:	learn: 0.8270982	test: 0.8275495	best: 0.8275495 (400)	total: 2m 9s	remaining: 8h 56m 38s
450:	learn: 0.7718694	test: 0.7730515	best: 0.7730515 (450)	total: 2m 48s	remaining: 10

  3%|▎         | 13/482 [00:00<00:03, 129.50it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.299295027
bestIteration = 1131

Shrink model to first 1132 iterations.

Generating predictions for validation set


100%|██████████| 482/482 [00:03<00:00, 131.33it/s]
  0%|          | 12/2553 [00:00<00:22, 114.85it/s]


Generating predictions for test set


100%|██████████| 2553/2553 [00:19<00:00, 128.20it/s]



Generating out-of-bag predictions for training set

Fitting model on 3112 training samples
0:	learn: 0.8736697	test: 0.8760927	best: 0.8760927 (0)	total: 46ms	remaining: 1h 16m 43s
50:	learn: 0.8736655	test: 0.8760886	best: 0.8760886 (50)	total: 3.92s	remaining: 2h 8m 9s
100:	learn: 0.8736498	test: 0.8760733	best: 0.8760733 (100)	total: 7.57s	remaining: 2h 4m 46s
150:	learn: 0.8735917	test: 0.8760166	best: 0.8760166 (150)	total: 11.3s	remaining: 2h 4m 12s
200:	learn: 0.8733741	test: 0.8758038	best: 0.8758038 (200)	total: 14.9s	remaining: 2h 3m 30s
250:	learn: 0.8725513	test: 0.8749980	best: 0.8749980 (250)	total: 18.6s	remaining: 2h 3m 23s
300:	learn: 0.8694170	test: 0.8719268	best: 0.8719268 (300)	total: 22.8s	remaining: 2h 6m 4s
350:	learn: 0.8576523	test: 0.8605308	best: 0.8605308 (350)	total: 29.2s	remaining: 2h 18m 23s
400:	learn: 0.8257181	test: 0.8296063	best: 0.8296063 (400)	total: 47.2s	remaining: 3h 15m 23s
450:	learn: 0.7715208	test: 0.7801708	best: 0.7801708 (450)	total: 1

  1%|▏         | 12/864 [00:00<00:07, 119.57it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2910699143
bestIteration = 966

Shrink model to first 967 iterations.

Generating out-of-sample predictions for 864 train samples


100%|██████████| 864/864 [00:06<00:00, 123.86it/s]



Fitting model on 3112 training samples
0:	learn: 0.8771961	test: 0.8605722	best: 0.8605722 (0)	total: 46.6ms	remaining: 1h 17m 36s
50:	learn: 0.8771919	test: 0.8605677	best: 0.8605677 (50)	total: 3.41s	remaining: 1h 51m 18s
100:	learn: 0.8771764	test: 0.8605509	best: 0.8605509 (100)	total: 7.06s	remaining: 1h 56m 27s
150:	learn: 0.8771190	test: 0.8604886	best: 0.8604886 (150)	total: 10.7s	remaining: 1h 57m 58s
200:	learn: 0.8769041	test: 0.8602548	best: 0.8602548 (200)	total: 14.4s	remaining: 1h 59m 18s
250:	learn: 0.8760915	test: 0.8593698	best: 0.8593698 (250)	total: 18.1s	remaining: 1h 59m 37s
300:	learn: 0.8729988	test: 0.8559955	best: 0.8559955 (300)	total: 21.9s	remaining: 2h 56s
350:	learn: 0.8613798	test: 0.8434071	best: 0.8434071 (350)	total: 27.8s	remaining: 2h 11m 44s
400:	learn: 0.8299670	test: 0.8081548	best: 0.8081548 (400)	total: 45.5s	remaining: 3h 8m 18s
450:	learn: 0.7751335	test: 0.7504191	best: 0.7504191 (450)	total: 1m 4s	remaining: 3h 58m 12s
500:	learn: 0.653412

  2%|▏         | 14/864 [00:00<00:06, 134.01it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3082049045
bestIteration = 744

Shrink model to first 745 iterations.

Generating out-of-sample predictions for 864 train samples


100%|██████████| 864/864 [00:06<00:00, 129.95it/s]



Fitting model on 3112 training samples
0:	learn: 0.8770884	test: 0.8643282	best: 0.8643282 (0)	total: 44.7ms	remaining: 1h 14m 32s
50:	learn: 0.8770842	test: 0.8643238	best: 0.8643238 (50)	total: 3.9s	remaining: 2h 7m 29s
100:	learn: 0.8770687	test: 0.8643073	best: 0.8643073 (100)	total: 7.58s	remaining: 2h 5m 2s
150:	learn: 0.8770115	test: 0.8642456	best: 0.8642456 (150)	total: 11.4s	remaining: 2h 5m 31s
200:	learn: 0.8767976	test: 0.8640129	best: 0.8640129 (200)	total: 15s	remaining: 2h 4m 15s
250:	learn: 0.8759903	test: 0.8631268	best: 0.8631268 (250)	total: 18.7s	remaining: 2h 3m 36s
300:	learn: 0.8729211	test: 0.8597292	best: 0.8597292 (300)	total: 22.7s	remaining: 2h 5m 10s
350:	learn: 0.8613506	test: 0.8467654	best: 0.8467654 (350)	total: 28.7s	remaining: 2h 15m 54s
400:	learn: 0.8292881	test: 0.8105647	best: 0.8105647 (400)	total: 46.1s	remaining: 3h 10m 55s
450:	learn: 0.7761376	test: 0.7539963	best: 0.7539963 (450)	total: 1m 5s	remaining: 3h 59m 18s
500:	learn: 0.6526923	tes

  2%|▏         | 14/864 [00:00<00:06, 137.25it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3173352898
bestIteration = 789

Shrink model to first 790 iterations.

Generating out-of-sample predictions for 864 train samples


100%|██████████| 864/864 [00:06<00:00, 138.26it/s]



Fitting model on 3112 training samples
0:	learn: 0.8775749	test: 0.8647860	best: 0.8647860 (0)	total: 41.9ms	remaining: 1h 9m 45s
50:	learn: 0.8775707	test: 0.8647818	best: 0.8647818 (50)	total: 3.33s	remaining: 1h 48m 57s
100:	learn: 0.8775551	test: 0.8647663	best: 0.8647663 (100)	total: 6.9s	remaining: 1h 53m 42s
150:	learn: 0.8774974	test: 0.8647081	best: 0.8647081 (150)	total: 10.6s	remaining: 1h 56m 42s
200:	learn: 0.8772818	test: 0.8644886	best: 0.8644886 (200)	total: 14.2s	remaining: 1h 57m 26s
250:	learn: 0.8764672	test: 0.8636522	best: 0.8636522 (250)	total: 17.8s	remaining: 1h 57m 58s
300:	learn: 0.8733717	test: 0.8604441	best: 0.8604441 (300)	total: 21.7s	remaining: 1h 59m 38s
350:	learn: 0.8617756	test: 0.8482765	best: 0.8482765 (350)	total: 28.2s	remaining: 2h 13m 22s
400:	learn: 0.8294462	test: 0.8161095	best: 0.8161095 (400)	total: 45.3s	remaining: 3h 7m 38s
450:	learn: 0.7748787	test: 0.7662162	best: 0.7662162 (450)	total: 1m 4s	remaining: 3h 56m 33s
500:	learn: 0.6490

  2%|▏         | 14/864 [00:00<00:06, 139.29it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2959057465
bestIteration = 1210

Shrink model to first 1211 iterations.

Generating out-of-sample predictions for 864 train samples


100%|██████████| 864/864 [00:05<00:00, 148.38it/s]



Fitting model on 3112 training samples
0:	learn: 0.8735911	test: 0.9037281	best: 0.9037281 (0)	total: 42.1ms	remaining: 1h 10m 5s
50:	learn: 0.8735868	test: 0.9037242	best: 0.9037242 (50)	total: 4.12s	remaining: 2h 14m 31s
100:	learn: 0.8735711	test: 0.9037101	best: 0.9037101 (100)	total: 7.45s	remaining: 2h 2m 47s
150:	learn: 0.8735128	test: 0.9036580	best: 0.9036580 (150)	total: 11s	remaining: 2h 1m 6s
200:	learn: 0.8732949	test: 0.9034644	best: 0.9034644 (200)	total: 14.5s	remaining: 1h 59m 59s
250:	learn: 0.8724721	test: 0.9027385	best: 0.9027385 (250)	total: 17.9s	remaining: 1h 58m 35s
300:	learn: 0.8693372	test: 0.8999905	best: 0.8999905 (300)	total: 21.3s	remaining: 1h 57m 31s
350:	learn: 0.8574171	test: 0.8897947	best: 0.8897947 (350)	total: 25.1s	remaining: 1h 58m 55s
400:	learn: 0.8250209	test: 0.8635736	best: 0.8635736 (400)	total: 42.1s	remaining: 2h 54m 8s
450:	learn: 0.7705425	test: 0.8114520	best: 0.8114520 (450)	total: 59.8s	remaining: 3h 40m 5s
500:	learn: 0.6453443	t

  2%|▏         | 16/864 [00:00<00:05, 155.82it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3070066375
bestIteration = 1539

Shrink model to first 1540 iterations.

Generating out-of-sample predictions for 864 train samples


100%|██████████| 864/864 [00:05<00:00, 152.28it/s]



Fitting models for intermediate_rinse phase

Fitting model on ALL training data
0:	learn: 0.8463352	test: 0.8456088	best: 0.8456088 (0)	total: 408ms	remaining: 11h 20m
50:	learn: 0.8463304	test: 0.8456040	best: 0.8456040 (50)	total: 4.62s	remaining: 2h 31m 2s
100:	learn: 0.8463123	test: 0.8455860	best: 0.8455860 (100)	total: 9.53s	remaining: 2h 37m 1s
150:	learn: 0.8462438	test: 0.8455174	best: 0.8455174 (150)	total: 15.3s	remaining: 2h 48m 34s
200:	learn: 0.8459825	test: 0.8452542	best: 0.8452542 (200)	total: 20.4s	remaining: 2h 48m 24s
250:	learn: 0.8449732	test: 0.8442298	best: 0.8442298 (250)	total: 25.4s	remaining: 2h 48m 33s
300:	learn: 0.8410380	test: 0.8402084	best: 0.8402084 (300)	total: 30s	remaining: 2h 45m 22s
350:	learn: 0.8267102	test: 0.8256987	best: 0.8256987 (350)	total: 38.3s	remaining: 3h 1m 5s
400:	learn: 0.7909226	test: 0.7914810	best: 0.7914810 (400)	total: 54.6s	remaining: 3h 45m 51s
450:	learn: 0.7280331	test: 0.7339616	best: 0.7339616 (450)	total: 1m 12s	remai

  5%|▍         | 17/370 [00:00<00:02, 162.84it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2831223954
bestIteration = 704

Shrink model to first 705 iterations.

Generating predictions for validation set


100%|██████████| 370/370 [00:02<00:00, 165.40it/s]
  1%|          | 16/1348 [00:00<00:08, 156.33it/s]


Generating predictions for test set


100%|██████████| 1348/1348 [00:08<00:00, 162.24it/s]



Generating out-of-bag predictions for training set

Fitting model on 2433 training samples
0:	learn: 0.8468227	test: 0.8069035	best: 0.8069035 (0)	total: 71.1ms	remaining: 1h 58m 32s
50:	learn: 0.8468179	test: 0.8068984	best: 0.8068984 (50)	total: 3.43s	remaining: 1h 52m 5s
100:	learn: 0.8467999	test: 0.8068790	best: 0.8068790 (100)	total: 7.23s	remaining: 1h 59m 8s
150:	learn: 0.8467321	test: 0.8068051	best: 0.8068051 (150)	total: 11.1s	remaining: 2h 1m 48s
200:	learn: 0.8464748	test: 0.8065211	best: 0.8065211 (200)	total: 14.7s	remaining: 2h 1m 30s
250:	learn: 0.8454881	test: 0.8054196	best: 0.8054196 (250)	total: 18.2s	remaining: 2h 31s
300:	learn: 0.8416671	test: 0.8011056	best: 0.8011056 (300)	total: 21.4s	remaining: 1h 58m 22s
350:	learn: 0.8274752	test: 0.7853251	best: 0.7853251 (350)	total: 28s	remaining: 2h 12m 22s
400:	learn: 0.7907311	test: 0.7498741	best: 0.7498741 (400)	total: 43s	remaining: 2h 58m 4s
450:	learn: 0.7291081	test: 0.6944772	best: 0.6944772 (450)	total: 59.9

  1%|          | 7/675 [00:00<00:11, 59.79it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2926697938
bestIteration = 717

Shrink model to first 718 iterations.

Generating out-of-sample predictions for 675 train samples


100%|██████████| 675/675 [00:09<00:00, 73.52it/s]



Fitting model on 2433 training samples
0:	learn: 0.8479616	test: 0.8305414	best: 0.8305414 (0)	total: 495ms	remaining: 13h 44m 59s
50:	learn: 0.8479568	test: 0.8305364	best: 0.8305364 (50)	total: 6.46s	remaining: 3h 30m 52s
100:	learn: 0.8479389	test: 0.8305175	best: 0.8305175 (100)	total: 12.4s	remaining: 3h 25m 15s
150:	learn: 0.8478715	test: 0.8304461	best: 0.8304461 (150)	total: 18.2s	remaining: 3h 20m 36s
200:	learn: 0.8476159	test: 0.8301727	best: 0.8301727 (200)	total: 24.3s	remaining: 3h 20m 53s
250:	learn: 0.8466359	test: 0.8291162	best: 0.8291162 (250)	total: 30.2s	remaining: 3h 19m 57s
300:	learn: 0.8428438	test: 0.8249976	best: 0.8249976 (300)	total: 36.4s	remaining: 3h 20m 42s
350:	learn: 0.8290249	test: 0.8096381	best: 0.8096381 (350)	total: 47.4s	remaining: 3h 44m 14s
400:	learn: 0.7944743	test: 0.7740523	best: 0.7740523 (400)	total: 1m 4s	remaining: 4h 26m 11s
450:	learn: 0.7292863	test: 0.7164148	best: 0.7164148 (450)	total: 1m 22s	remaining: 5h 3m 55s
500:	learn: 0.5

  1%|          | 6/675 [00:00<00:11, 59.03it/s]

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2732064886
bestIteration = 903

Shrink model to first 904 iterations.

Generating out-of-sample predictions for 675 train samples


100%|██████████| 675/675 [00:09<00:00, 71.64it/s]



Fitting model on 2433 training samples
0:	learn: 0.8455384	test: 0.8505327	best: 0.8505327 (0)	total: 589ms	remaining: 16h 21m 58s
50:	learn: 0.8455336	test: 0.8505279	best: 0.8505279 (50)	total: 7s	remaining: 3h 48m 38s
100:	learn: 0.8455155	test: 0.8505097	best: 0.8505097 (100)	total: 13s	remaining: 3h 35m 1s
150:	learn: 0.8454477	test: 0.8504412	best: 0.8504412 (150)	total: 19.3s	remaining: 3h 32m 15s
200:	learn: 0.8451904	test: 0.8501815	best: 0.8501815 (200)	total: 25s	remaining: 3h 27m 15s
250:	learn: 0.8442054	test: 0.8491871	best: 0.8491871 (250)	total: 30.8s	remaining: 3h 23m 45s
300:	learn: 0.8403982	test: 0.8453431	best: 0.8453431 (300)	total: 37s	remaining: 3h 24m 16s
350:	learn: 0.8260854	test: 0.8308758	best: 0.8308758 (350)	total: 45.3s	remaining: 3h 34m 11s
400:	learn: 0.7885739	test: 0.7945653	best: 0.7945653 (400)	total: 1m 8s	remaining: 4h 45m 37s
450:	learn: 0.7263981	test: 0.7304496	best: 0.7304496 (450)	total: 1m 35s	remaining: 5h 51m 58s
500:	learn: 0.5911352	te

In [0]:
# Pickle the predictions
import pickle

pickle.dump(dict(train_prediction_vectors), open('/content/drive/My Drive/Rinse Over Run/train_preds_per_phase.p', 'wb+'))
pickle.dump(dict(val_prediction_vectors), open('/content/drive/My Drive/Rinse Over Run/val_preds_per_phase.p', 'wb+'))
pickle.dump(dict(test_prediction_vectors), open('/content/drive/My Drive/Rinse Over Run/test_preds_per_phase.p', 'wb+'))

all_train_prediction_vectors = defaultdict(lambda: defaultdict(float))
for phase in phases:
  for k in train_prediction_vectors:
    all_train_prediction_vectors[k][phase] = train_prediction_vectors[k][phase]
  for k in val_prediction_vectors:
    all_train_prediction_vectors[k][phase] = val_prediction_vectors[k][phase]
pickle.dump(dict(all_train_prediction_vectors), open('/content/drive/My Drive/Rinse Over Run/all_train_preds_per_phase.p', 'wb+'))

AttributeError: ignored

In [0]:
# Evaluate model trained on 4 features
train_vectors = []
for process in train_processes:
  train_vector = []
  for phase in phases:
    if process in train_prediction_vectors and phase in train_prediction_vectors[process]:
      train_vector.append(train_prediction_vectors[process][phase])
    else:
      train_vector.append(np.NaN)
  train_vectors.append([process] + train_vector)
  
train_predictions_df = pd.DataFrame(train_vectors, columns=['process_id'] + ['pred_{}'.format(phase) for phase in phases])

val_vectors = []
for process in validation_processes:
  val_vector = []
  for phase in phases:
    if process in val_prediction_vectors and phase in val_prediction_vectors[process]:
      val_vector.append(val_prediction_vectors[process][phase])
    else:
      val_vector.append(np.NaN)
  val_vectors.append([process] + val_vector)
  
val_predictions_df = pd.DataFrame(val_vectors, columns=['process_id'] + ['pred_{}'.format(phase) for phase in phases])

In [0]:
train_predictions_df = train_predictions_df.set_index('process_id')
val_predictions_df = val_predictions_df.set_index('process_id')

X_train = train_predictions_df.values
X_val = val_predictions_df.values
y_train = np.log(label_df.loc[train_predictions_df.index])
y_val = np.log(label_df.loc[val_predictions_df.index])

print(X_train.shape, X_val.shape)


cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, learning_rate=0.33,
                    loss_function='MAPE', eval_metric=MAPEMetric())
cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)

In [0]:
import pickle
test_prediction_vectors = pickle.load(open('/content/drive/My Drive/Rinse Over Run/test_preds_per_phase.p', 'rb'))
all_train_prediction_vectors = pickle.load(open('/content/drive/My Drive/Rinse Over Run/all_train_preds_per_phase.p', 'rb'))

In [0]:
all_train_prediction_vectors[25433]

defaultdict(float,
            {'acid': 0.0,
             'caustic': 15.208683417749851,
             'intermediate_rinse': 0.0,
             'pre_rinse': 14.655230900784572})

In [0]:
from tsfresh.feature_selection.relevance import calculate_relevance_table

def get_corr_features(X):
  row_idx, col_idx = np.where(X.corr() == 1)
  self_corr = set([(i, i) for i in range(X_train.shape[1])])
  return set(list(zip(row_idx, col_idx))) - self_corr 

def get_uncorr_features(data):
  X_train_corr = data.copy()
  correlated_features = get_corr_features(X_train_corr)
  
  corr_cols = set()
  for row_idx, col_idx in correlated_features:
    corr_cols.add(row_idx)
    corr_cols.add(col_idx)
  
  uncorr_cols = list(set(X_train.columns) - set(X_train.columns[list(corr_cols)]))
   
  col_mask = [False]*X_train_corr.shape[1]
  for col in corr_cols:
    col_mask[col] = True
  X_train_corr = X_train_corr.loc[:, col_mask]
  
  correlated_features = get_corr_features(X_train_corr)
  
  while correlated_features:
    print('{} correlated feature pairs left...'.format(len(correlated_features)))
    corr_row, corr_col = correlated_features.pop()
    col_mask = [True]*X_train_corr.shape[1]
    col_mask[corr_row] = False
    X_train_corr = X_train_corr.loc[:, col_mask]
    correlated_features = get_corr_features(X_train_corr)
  return list(set(list(X_train_corr.columns) + uncorr_cols))

def remove_features(data, target, p_val=0.25):
  single_cols = list(data.columns[data.nunique() == 1])
  
  uncorr_cols = get_uncorr_features(data)
  corr_cols = list(set(data.columns) - set(uncorr_cols))
  
  rel_table = calculate_relevance_table(data.dropna(axis=1), target, 
                                        ml_task='regression')
  irrel_cols = list(rel_table[rel_table['p_value'] < p_val]['feature'])
  
  na_cols = data.columns[data.isnull().sum() > 1]
  data_na = data[na_cols].dropna(axis=0)
  rel_table_na = calculate_relevance_table(data_na, target.loc[data_na.index], ml_task='regression')
  irrel_na_cols = list(rel_table_na[rel_table_na['p_value'] < p_val]['feature'])
  
  return single_cols + corr_cols + irrel_cols + irrel_na_cols

In [0]:
import os
from sklearn.manifold import TSNE

def get_predictions(process, phases, train=True):
  preds = []
  if train:
    for phase in phases:
      if process in all_train_prediction_vectors and phase in all_train_prediction_vectors[process]:
        pred = all_train_prediction_vectors[process][phase]
        if pred > 0:
          preds.append(pred)
        else:
          preds.append(np.NaN)
      else:
        print('Did not find phase {} for train process {}'.format(phase, process))
        preds.append(np.NaN)
  else:
    for phase in phases:
      if process in test_prediction_vectors and phase in test_prediction_vectors[process]:
        pred = test_prediction_vectors[process][phase]
        if pred > 0:
          preds.append(pred)
        else:
          preds.append(np.NaN)
      else:
        preds.append(np.NaN)
        print('Did not find phase {} for test process {}'.format(phase, process))
  return preds
  
process_comb_to_phases = {
    15: ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid'],
    3:  ['pre_rinse', 'caustic'],
    7:  ['pre_rinse', 'caustic', 'intermediate_rinse'],
    1:  ['pre_rinse'],
    8:  ['acid'],
    2:  ['caustic'],
    6:  ['caustic', 'intermediate_rinse'],
    14: ['caustic', 'intermediate_rinse', 'acid'],
}

mapes = {}
prediction_df = None
for process_comb, augment in zip([15, 3, 7, 1, 8, 2, 6, 14], [True]*8):
    test_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_features_{}.csv'.format(process_comb), index_col=['process_id']).index
    train_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_features_adv_{}.csv'.format(process_comb), index_col=['process_id']).index
    val_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/val_features_adv_{}.csv'.format(process_comb), index_col=['process_id']).index
    
    all_train_features_index = list(train_features_index) + list(val_features_index)
    val_idx = list(np.random.choice(list(all_train_features_index), replace=False, 
                                         size=int(0.05 * len(all_train_features_index))))
    train_idx = set(all_train_features_index) - set(val_idx)
    
    #val_idx = val_features_index
    #train_idx = train_features_index
    
    
    train_features = train_df[(train_df['phase'].isin(process_comb_to_phases[process_comb])) & 
                              (train_df['process_id'].isin(train_idx))]
    val_features = train_df[(train_df['phase'].isin(process_comb_to_phases[process_comb])) & 
                             (train_df['process_id'].isin(val_idx))]
    test_features = test_df[(test_df['phase'].isin(process_comb_to_phases[process_comb])) & 
                            (test_df['process_id'].isin(test_features_index))]
    
    all_features = create_feature_matrix(pd.concat([train_features, val_features, test_features]))
    train_features = all_features.loc[train_idx]
    val_features = all_features.loc[val_idx]
    test_features = all_features.loc[test_features_index]
    
    
    
    # Generate PCA embeddings based on features
    X_test_null = test_features.columns[test_features.isnull().sum() > 0]
    X_train_null = train_features.columns[train_features.isnull().sum() > 0]
    X_val_null = val_features.columns[val_features.isnull().sum() > 0]
    all_null = set(X_test_null).union(X_train_null).union(X_val_null)
    
    train_features_no_nan = train_features.drop(all_null, axis=1)
    val_features_no_nan = val_features.drop(all_null, axis=1)
    test_features_no_nan = test_features.drop(all_null, axis=1)
    
    all_df = pd.concat([train_features_no_nan, val_features_no_nan, test_features_no_nan])
    all_df = all_df[all_df.columns[all_df.isnull().sum() == 0]]
    
    embeddings = TSNE(n_components=3).fit_transform(all_df)
    X_tsne_train = pd.DataFrame(embeddings[:len(train_features), :], columns=['tsne_0', 'tsne_1', 'tsne_2'], index=train_features.index)
    X_tsne_val = pd.DataFrame(embeddings[len(train_features):(len(train_features)+len(val_features)), :], columns=['tsne_0', 'tsne_1', 'tsne_2'], index=val_features.index)
    X_tsne_test = pd.DataFrame(embeddings[(len(train_features)+len(val_features)):, :], columns=['tsne_0', 'tsne_1', 'tsne_2'], index=test_features.index)
    
    train_features = train_features.merge(X_tsne_train, left_index=True, right_index=True)    
    test_features = test_features.merge(X_tsne_test, left_index=True, right_index=True)    
    val_features = val_features.merge(X_tsne_val, left_index=True, right_index=True)
    
    if process_comb not in [1, 2, 4]:
      _columns = ['process_id'] +  ['pred_{}'.format(phase) for phase in process_comb_to_phases[process_comb]]
      
      train_predictions_df = pd.DataFrame([[process] + get_predictions(process, process_comb_to_phases[process_comb]) for process in train_features.index], columns=_columns)
      val_predictions_df = pd.DataFrame([[process] + get_predictions(process, process_comb_to_phases[process_comb]) for process in val_features.index], columns=_columns)
      test_predictions_df = pd.DataFrame([[process] + get_predictions(process, process_comb_to_phases[process_comb], train=False) for process in test_features.index], columns=_columns)
      
      train_predictions_df = train_predictions_df.set_index('process_id', drop=True)
      val_predictions_df = val_predictions_df.set_index('process_id', drop=True)
      test_predictions_df = test_predictions_df.set_index('process_id', drop=True)
      
      train_features = train_features.merge(train_predictions_df, left_index=True, right_index=True)
      val_features = val_features.merge(val_predictions_df, left_index=True, right_index=True)
      test_features = test_features.merge(test_predictions_df, left_index=True, right_index=True)
      
    if augment and os.path.exists('/content/drive/My Drive/Rinse Over Run/tsfresh_features_{}.csv'.format(process_comb)):
      ts_fresh_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/tsfresh_features_{}.csv'.format(process_comb), index_col=['id'])
      
      ts_fresh_cols = list(set(ts_fresh_features.columns) - set(train_features.columns))
      ts_fresh_features = ts_fresh_features[ts_fresh_cols]
      
      train_features = train_features.merge(ts_fresh_features, left_index=True, right_index=True)
      val_features = val_features.merge(ts_fresh_features, left_index=True, right_index=True)
      test_features = test_features.merge(ts_fresh_features, left_index=True, right_index=True)
    
    
    for col in set(train_features.columns) - set(test_features.columns):
        train_features = train_features.drop(col, axis=1)
        
    for col in set(train_features.columns) - set(val_features.columns):
        train_features = train_features.drop(col, axis=1)
        
    for col in train_features.columns:
        if 'process_id' in col:
            train_features = train_features.drop(col, axis=1)
        
    test_features = test_features[train_features.columns]
    val_features = val_features[train_features.columns]
            
    X_train = train_features
    X_val = val_features
    X_test = test_features
    
    print(X_train.shape, X_val.shape, X_test.shape)
    
    all_labels = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col=['process_id'])
    
    y_train = np.log(all_labels.loc[X_train.index])
    y_val = np.log(all_labels.loc[X_val.index])
    
    to_drop = remove_features(X_train, y_train['final_rinse_total_turbidity_liter'])
    print(list(to_drop))
    
    X_train = X_train.drop(to_drop, axis=1)
    X_val = X_val.drop(to_drop, axis=1)
    X_test = X_test.drop(to_drop, axis=1)
    
    cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=250, learning_rate=0.33,
                        loss_function='MAPE', eval_metric=MAPEMetric())
    cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)
    
    val_predictions = np.exp(cat.predict(X_val))
    predictions = np.exp(cat.predict(X_test))
    
    model_mape = custom_mape(val_predictions, np.exp(y_val.values).flatten())
    print(process_comb, model_mape)
    mapes[process_comb] = model_mape

    sub_predictions_df = pd.DataFrame(predictions, columns=['final_rinse_total_turbidity_liter'])
    sub_predictions_df.index = X_test.index
    sub_predictions_df.index.name = X_test.index.name
    
    if prediction_df is None:
        prediction_df = sub_predictions_df
    else:
        prediction_df = pd.concat([prediction_df, sub_predictions_df])


100%|██████████| 84/84 [02:17<00:00,  1.43s/it]
100%|██████████| 4397/4397 [01:28<00:00, 49.85it/s]
100%|██████████| 4397/4397 [00:57<00:00, 76.86it/s]


(3540, 2385) (186, 2385) (671, 2385)
3492 correlated feature pairs left...
3392 correlated feature pairs left...
3294 correlated feature pairs left...
3198 correlated feature pairs left...
3104 correlated feature pairs left...
3012 correlated feature pairs left...
2994 correlated feature pairs left...
2904 correlated feature pairs left...
2898 correlated feature pairs left...
2810 correlated feature pairs left...
2724 correlated feature pairs left...
2640 correlated feature pairs left...
2558 correlated feature pairs left...
2478 correlated feature pairs left...
2400 correlated feature pairs left...
2324 correlated feature pairs left...
2250 correlated feature pairs left...
2234 correlated feature pairs left...
2162 correlated feature pairs left...
2092 correlated feature pairs left...
2074 correlated feature pairs left...
2050 correlated feature pairs left...
1982 correlated feature pairs left...
1916 correlated feature pairs left...
1852 correlated feature pairs left...
1790 correlat

In [0]:
len(set(pd.read_csv('/content/drive/My Drive/Rinse Over Run/last_cleaned_train.csv')['Unnamed: 0']).intersection(set(train_df['process_id'])))

5005

In [0]:
weights = {3: 0.39838220424671383, 15: 0.22615436467812605, 7: 0.22581732389619144, 
           1: 0.09841590832490732, 8: 0.04111897539602292, 2: 0.007751937984496124, 
           14: 0.0016852039096730705, 6: 0.0006740815638692282}
print(mapes)
total_mape = 0
for k in mapes:
    total_mape += weights[k]*mapes[k]
print('Expected LB MAPE = {}'.format(total_mape))

"""
{15: 0.25769775417938057, 3: 0.301130905636298, 7: 0.26783457688511125, 1: 0.3055192670014494, 8: 0.3151284100458177, 2: 0.3267185046048411, 6: 0.2677516727551795, 14: 0.2699025744647523}
Expected LB MAPE = 0.2849200956667515

{15: 0.24010280403309212, 3: 0.2985107914947549, 7: 0.3012968169764717, 1: 0.3634260291955923, 8: 0.21516974271342973, 2: 0.3009410040368695, 6: 0.27725749639154607, 14: 0.20462033067518295}
Expected LB MAPE = 0.28873878439582057

{15: 0.2378025413290688, 3: 0.2971653141489995, 7: 0.24830747912649828, 1: 0.3434213103999385, 8: 0.24797482325956668, 2: 0.30302695244659444, 6: 0.3012732377223895, 14: 0.26598975270473707}
Expected LB MAPE = 0.27523255267622704
--> but LB = 0.3057...
"""

{15: 0.2378025413290688, 3: 0.2971653141489995, 7: 0.24830747912649828, 1: 0.3434213103999385, 8: 0.24797482325956668, 2: 0.30302695244659444, 6: 0.3012732377223895, 14: 0.26598975270473707}
Expected LB MAPE = 0.27523255267622704


'\n{15: 0.25769775417938057, 3: 0.301130905636298, 7: 0.26783457688511125, 1: 0.3055192670014494, 8: 0.3151284100458177, 2: 0.3267185046048411, 6: 0.2677516727551795, 14: 0.2699025744647523}\nExpected LB MAPE = 0.2849200956667515\n\n{15: 0.24010280403309212, 3: 0.2985107914947549, 7: 0.3012968169764717, 1: 0.3634260291955923, 8: 0.21516974271342973, 2: 0.3009410040368695, 6: 0.27725749639154607, 14: 0.20462033067518295}\nExpected LB MAPE = 0.28873878439582057\n'

In [0]:
prediction_df = prediction_df.sort_index()
prediction_df.index.name = X_test.index.name
prediction_df.to_csv('/content/drive/My Drive/Rinse Over Run/extended_phase_predictors.csv')

In [0]:
len(prediction_df)

In [0]:
set(train_features.index) - set(all_train_prediction_vectors.keys())

In [0]:

all_null = set(X_test_null).union(X_train_null).union(X_val_null)

train_features_no_nan = train_features.drop(all_null, axis=1)
val_features_no_nan = val_features.drop(all_null, axis=1)
test_features_no_nan = test_features.drop(all_null, axis=1)

In [0]:
len(train_features_no_nan.columns[train_features_no_nan.isnull().sum() > 0])

0

In [0]:
len(all_null)

132

In [0]:
PCA(n_components=3).fit_transform(pd.concat([train_features_no_nan, val_features_no_nan, test_features_no_nan]))

ValueError: ignored

In [0]:
all_df = pd.concat([train_features_no_nan, val_features_no_nan, test_features_no_nan])
all_df.columns[all_df.isnull().sum() > 0]

Index(['pipeline_L11'], dtype='object')