In [7]:
!pip install catboost



In [0]:
# The essentials
import pandas as pd
import numpy as np

from collections import defaultdict

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

# Gradient Boosting
from catboost import CatBoostRegressor, Pool

from sklearn.model_selection import KFold

from collections import defaultdict, Counter

In [9]:
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
dtw_distances_3.p
extended_phase_predictors.csv
last_cleaned_test.csv
last_cleaned_train.csv
mds_embeddings_2d_3.csv
mds_embeddings_2d_3.p
model_per_recipe_simple.csv
more_features_with_preds_per_phase.csv
pca_features_with_preds_per_phase.csv
predictions_machine_405.csv
preds_feature_selection.csv
processes_all_phases.p
process_machine_outlier_predicted.csv
recipe_metadata.csv
test_features_14.csv
test_features_15.csv
test_features_1.csv
test_features_2.csv
test_features_3.csv
test_features_6.csv
test_features_7.csv
test_features_8.csv
test_features_per_phase_14.csv
test_features_per_phase_15.csv
test_features_per_ph

In [10]:
train_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_values.csv', index_col=0, parse_dates=['timestamp'])
test_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_values.csv', index_col=0, parse_dates=['timestamp'])
label_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col='process_id')

  mask |= (ar1 == a)


In [0]:
recipe_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/recipe_metadata.csv', index_col='process_id')
recipe_df = recipe_df.drop('final_rinse', axis=1)
recipe_df['pre_rinse_num'] = recipe_df['pre_rinse'] * 1
recipe_df['caustic_num'] = recipe_df['caustic'] * 2
recipe_df['intermediate_rinse_num'] = recipe_df['intermediate_rinse'] * 4
recipe_df['acid_num'] = recipe_df['acid'] * 8
recipe_df['recipe'] = recipe_df['pre_rinse_num'] + recipe_df['caustic_num'] + recipe_df['intermediate_rinse_num'] + recipe_df['acid_num']

In [0]:
ts_real = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value'
]

# variables we'll use to create our time series features
ts_cols = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value'
]

# variables for binary time series features
bin_cols = [
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain',
    'object_low_level'
]

process_comb_to_phases = {
    15: ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid'],
    3:  ['pre_rinse', 'caustic'],
    7:  ['pre_rinse', 'caustic', 'intermediate_rinse'],
    1:  ['pre_rinse'],
    8:  ['acid'],
    2:  ['caustic'],
    6:  ['caustic', 'intermediate_rinse'],
    14: ['caustic', 'intermediate_rinse', 'acid'],
}

# phases, ordered from earliest to latest
phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

def encode_categorical(df):
    # Currently just copy-pasted from http://drivendata.co/blog/rinse-over-run-benchmark/
    
    # select process_id and pipeline
    meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id') 
    #meta = meta.merge(recipe_df[['recipe', 'pre_rinse', 'caustic', 'intermediate_rinse', 'acid']], left_index=True, right_index=True)
    #meta = meta.merge(process_combinations[['phase_int']], left_index=True, right_index=True)
    
    # convert categorical pipeline data to dummy variables
    meta = pd.get_dummies(meta, columns=['pipeline'])
    
    # pipeline L12 not in test data (so useless feature)
    if 'pipeline_L12' in meta:
        meta = meta.drop('pipeline_L12', axis=1)
    
    # calculate number of phases for each process_object
    meta['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())
    
    return meta

def percentile_25(x):
  return np.percentile(x, 0.25)

def percentile_75(x):
  return np.percentile(x, 0.75)
  
def encode_real_timeseries(df):   
    ts_df = df[['process_id'] + ts_cols].set_index('process_id')
    
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', 
                                                   'count', 'median', 'sum', 
                                                   lambda x: x.tail(5).mean()])
    
    col_map = {}
    for col in ts_features.columns:
        col_map[col] = 'real_{}'.format(col)
    ts_features = ts_features.rename(columns=col_map)
    
    return ts_features

def encode_binary_timeseries(df):
    ts_df = df[['process_id'] + bin_cols].set_index('process_id')
            
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['mean', 'std', 
                                                   lambda x: x.tail(5).mean()])
    
    col_map = {}
    for col in ts_features.columns:
        col_map[col] = 'bin_{}'.format(col)
    ts_features = ts_features.rename(columns=col_map)
    
    return ts_features

def get_descript(data, functions, cols):
    ts_df = data.set_index('process_id').sort_values(by='timestamp')
    return ts_df.groupby('process_id')[cols].agg(functions)  
  
  
def get_descript_prev_process(data):
    machines = set(data['object_id'])
    all_features = []
    for machine in tqdm(machines):
        machine_data = data[data['object_id'] == machine]
        machine_data = machine_data.sort_values(by='timestamp')
        machine_processes = machine_data['process_id'].unique()
        for process_ix, process in enumerate(machine_processes):
            if process_ix > 0:
                prev_process = machine_data[machine_data['process_id'] == machine_processes[process_ix - 1]]
                this_process = machine_data[machine_data['process_id'] == machine_processes[process_ix]]
                features = get_descript(prev_process, ['mean', 'std', 'min', 'max', 'count'], ts_cols)
                _columns = list(features.columns)
                assert len(features) == 1
                features = features.iloc[0, :].values
                time_delta = (this_process['timestamp'].values[0] - prev_process['timestamp'].values[-1]) / np.timedelta64(1, 'h')
                assert time_delta > 0
                all_features.append([machine, process, time_delta] + list(features))
            else:
                all_features.append([machine, process, np.NaN] + ([np.NaN] * 60))
                
    all_features = pd.DataFrame(all_features, columns=['object_id', 'process_id', 'time_delta'] + _columns)
    all_features = all_features.set_index('process_id', drop=True)
    col_map = {}
    for col in all_features.columns:
        col_map[col] = 'prev_{}'.format(col)
    all_features = all_features.rename(columns=col_map)
    return all_features

def create_feature_matrix(df):
    df['return_flow_relu'] = df['return_flow'].apply(lambda x: max(0, x))
    df['target_value'] = df['return_flow_relu'] * df['return_turbidity']
    
    prev_features = get_descript_prev_process(df)
    metadata = encode_categorical(df)
    time_series = encode_real_timeseries(df)
    binary_features = encode_binary_timeseries(df)
    
    # join metadata and time series features into a single dataframe
    feature_matrix = metadata
    feature_matrix = feature_matrix.merge(time_series, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(binary_features, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(prev_features, left_index=True, right_index=True)
    
    return feature_matrix
  
def get_processes(data, phases, train=True):
    filtered_processes = []
    phases = set(phases)
    processes = set(data['process_id'])
    for process in processes:
        process_phases = set(data[data['process_id'] == process]['phase'])
        if train:
            if phases.issubset(process_phases):
                filtered_processes.append(process)
        else:
            if len(phases) == len(process_phases) == len(phases.intersection(process_phases)):
                filtered_processes.append(process)
    return filtered_processes

In [0]:
def custom_mape(approxes, targets):
    return np.mean(np.abs(np.subtract(approxes, targets)) / np.maximum(np.abs(targets), 290000))

def mape_1_row(x):
  return abs(x['prediction'] - x['target']) / max(290000, x['target'])

class MAPEMetric(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, targets, weight):
        return custom_mape(np.exp(approxes), np.exp(targets)), len(targets)

In [0]:
def get_corr_features(X):
  row_idx, col_idx = np.where(X.corr() == 1)
  self_corr = set([(i, i) for i in range(X.shape[1])])
  return set(list(zip(row_idx, col_idx))) - self_corr 

def get_uncorr_features(data):
  X_train_corr = data.copy()
  correlated_features = get_corr_features(X_train_corr)
  
  corr_cols = set()
  for row_idx, col_idx in correlated_features:
    corr_cols.add(row_idx)
    corr_cols.add(col_idx)
  
  uncorr_cols = list(set(data.columns) - set(data.columns[list(corr_cols)]))
   
  col_mask = [False]*X_train_corr.shape[1]
  for col in corr_cols:
    col_mask[col] = True
  X_train_corr = X_train_corr.loc[:, col_mask]
  
  correlated_features = get_corr_features(X_train_corr)
  
  while correlated_features:
    print('{} correlated feature pairs left...'.format(len(correlated_features)))
    corr_row, corr_col = correlated_features.pop()
    col_mask = [True]*X_train_corr.shape[1]
    col_mask[corr_row] = False
    X_train_corr = X_train_corr.loc[:, col_mask]
    correlated_features = get_corr_features(X_train_corr)
  return list(set(list(X_train_corr.columns) + uncorr_cols))

def remove_features(data, target, p_val=0.25):
  single_cols = list(data.columns[data.nunique() == 1])
  
  uncorr_cols = get_uncorr_features(data)
  corr_cols = list(set(data.columns) - set(uncorr_cols))
  
  return list(set(single_cols + corr_cols))

In [0]:
from sklearn.model_selection import KFold

def create_predictions_per_phase(train_data, test_data, labels, phase, n_folds=5):
    filtered_train_data = train_data[train_data['phase'] == phase]
    filtered_test_data = test_data[test_data['phase'] == phase]
    train_procs = list(set(filtered_train_data['process_id']))
    test_procs = list(set(filtered_test_data['process_id']))
    
    all_data = pd.concat([filtered_train_data, filtered_test_data], axis=0)
    features = create_feature_matrix(all_data)
    
    X = features.loc[train_procs, :]
    y = labels.loc[train_procs]
    X_test_lb = features.loc[test_procs, :]
    
    kf = KFold(n_splits=n_folds)
    
    train_predictions = np.zeros((len(X), 1))
    test_predictions = np.zeros((len(X_test_lb), n_folds))
    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
      X_train = X.iloc[train_idx, :]
      X_test = X.iloc[test_idx, :]
      
      y_train = np.log(y.iloc[train_idx])
      y_test = np.log(y.iloc[test_idx])
      
      train_idx = np.random.choice(X_train.index, replace=False, 
                                   size=int(0.9 * len(X_train)))
      val_idx = list(set(X_train.index) - set(train_idx))

      X_val = X_train.loc[val_idx, :]
      y_val = y_train.loc[val_idx]
      X_train = X_train.loc[train_idx, :]
      y_train = y_train.loc[train_idx]
      
      to_drop = remove_features(X_train, y_train)
      print(len(to_drop), to_drop)

      X_train = X_train.drop(to_drop, axis=1)
      X_test = X_test.drop(to_drop, axis=1)
      X_val = X_val.drop(to_drop, axis=1)
      
      X_test_lb_cv = X_test_lb.drop(to_drop, axis=1)
      
      cat = CatBoostRegressor(iterations=10000, od_type='Iter', od_wait=100, 
                              learning_rate=0.33,
                              loss_function='MAPE', eval_metric=MAPEMetric())
      cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)
      
      
      oos_predictions = np.exp(cat.predict(X_test))
      oos_error = custom_mape(oos_predictions, np.exp(y_test))
      print('Generating out-of-sample predictions. Fold #{}. Out-of-sample error = {}'.format(i + 1, oos_error))
      train_predictions[test_idx, :] = np.reshape(oos_predictions, (-1, 1))
      print(cat.predict(X_test_lb_cv).shape)
      test_predictions[:, i] = np.exp(cat.predict(X_test_lb_cv))
      
    return train_predictions, np.mean(test_predictions, axis=0)

In [32]:
import pickle
for recipe in [3, 15]:
    recipe_train_data = train_df[train_df['process_id'].isin(recipe_df[recipe_df['recipe'] == recipe].index)]
    recipe_test_data = test_df[test_df['process_id'].isin(recipe_df[recipe_df['recipe'] == recipe].index)]
    labels = label_df.loc[recipe_df[recipe_df['recipe'] == recipe].index, :]['final_rinse_total_turbidity_liter']
    for phase in process_comb_to_phases[recipe]:
        train_preds, test_preds = create_predictions_per_phase(recipe_train_data, 
                                                               recipe_test_data, 
                                                               labels, phase)
        pickle.dump(train_preds, open('/train_pred_{}_{}.p'.format()))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  after removing the cwd from sys.path.
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)

  0%|          | 0/41 [00:00<?, ?it/s][A
  2%|▏         | 1/41 [00:01<01:17,  1.94s/it][A
  5%|▍         | 2/41 [00:04<01:18,  2.00s/it][A
  7%|▋         | 3/41 [00:05<01:14,  1.95s/it][A
 10%|▉         | 4/41 [00:07<01:12,  1.95s/it][A
 12%|█▏        | 5/41 [00:09<01:10,  1.96s/it][A
 20%|█▉        | 8/41 [00:14<01:00,  1.85s/it][A
 29%|██▉       | 12/41 [00:16<00:42,  1.47s/it][A
 32%|███▏      | 13/41 [00:19<

480 correlated feature pairs left...
450 correlated feature pairs left...
422 correlated feature pairs left...
396 correlated feature pairs left...
372 correlated feature pairs left...
350 correlated feature pairs left...
330 correlated feature pairs left...
312 correlated feature pairs left...
296 correlated feature pairs left...
282 correlated feature pairs left...
252 correlated feature pairs left...
224 correlated feature pairs left...
198 correlated feature pairs left...
174 correlated feature pairs left...
152 correlated feature pairs left...
132 correlated feature pairs left...
114 correlated feature pairs left...
98 correlated feature pairs left...
86 correlated feature pairs left...
72 correlated feature pairs left...
60 correlated feature pairs left...
50 correlated feature pairs left...
42 correlated feature pairs left...
36 correlated feature pairs left...
26 correlated feature pairs left...
18 correlated feature pairs left...
12 correlated feature pairs left...
8 correlate


  0%|          | 0/42 [00:00<?, ?it/s][A
  2%|▏         | 1/42 [00:02<01:26,  2.11s/it][A
  5%|▍         | 2/42 [00:04<01:23,  2.08s/it][A
  7%|▋         | 3/42 [00:05<01:18,  2.00s/it][A
 10%|▉         | 4/42 [00:07<01:14,  1.97s/it][A
 12%|█▏        | 5/42 [00:09<01:12,  1.95s/it][A
 19%|█▉        | 8/42 [00:14<01:02,  1.83s/it][A
 29%|██▊       | 12/42 [00:16<00:43,  1.45s/it][A
 31%|███       | 13/42 [00:19<00:50,  1.73s/it][A
 33%|███▎      | 14/42 [00:21<00:52,  1.88s/it][A
 36%|███▌      | 15/42 [00:21<00:38,  1.43s/it][A
 38%|███▊      | 16/42 [00:22<00:34,  1.32s/it][A
 40%|████      | 17/42 [00:24<00:35,  1.41s/it][A
 43%|████▎     | 18/42 [00:24<00:25,  1.05s/it][A
 45%|████▌     | 19/42 [00:26<00:32,  1.39s/it][A
 48%|████▊     | 20/42 [00:26<00:22,  1.02s/it][A
 50%|█████     | 21/42 [00:27<00:18,  1.16it/s][A
 60%|█████▉    | 25/42 [00:29<00:12,  1.37it/s][A
 67%|██████▋   | 28/42 [00:30<00:08,  1.56it/s][A
 69%|██████▉   | 29/42 [00:31<00:10,  1.25it/

480 correlated feature pairs left...
450 correlated feature pairs left...
422 correlated feature pairs left...
396 correlated feature pairs left...
372 correlated feature pairs left...
350 correlated feature pairs left...
330 correlated feature pairs left...
312 correlated feature pairs left...
296 correlated feature pairs left...
282 correlated feature pairs left...
252 correlated feature pairs left...
224 correlated feature pairs left...
198 correlated feature pairs left...
174 correlated feature pairs left...
152 correlated feature pairs left...
132 correlated feature pairs left...
114 correlated feature pairs left...
98 correlated feature pairs left...
86 correlated feature pairs left...
72 correlated feature pairs left...
60 correlated feature pairs left...
50 correlated feature pairs left...
42 correlated feature pairs left...
36 correlated feature pairs left...
26 correlated feature pairs left...
18 correlated feature pairs left...
12 correlated feature pairs left...
8 correlate


0it [00:00, ?it/s][A
[A

9:	learn: 0.9830935	test: 0.9739011	best: 0.9739011 (9)	total: 2.07s	remaining: 0us

bestTest = 0.9739010966
bestIteration = 9

Generating out-of-sample predictions. Fold #5. Out-of-sample error = 0.9779760953693978
(522,)


UnboundLocalError: ignored