In [1]:
!pip install tsfresh
!pip install tqdm
!pip install catboost



In [2]:
# The essentials
import pandas as pd
import numpy as np

from collections import defaultdict

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive
# TSFRESH Feature Extraction
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table

from sklearn.model_selection import KFold, GridSearchCV

from collections import defaultdict, Counter
from scipy.stats import norm

from catboost import CatBoostRegressor

from sklearn.preprocessing import PowerTransformer, StandardScaler

  from pandas.core import datetools


In [3]:
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
better_prev_object_id_per_10.csv
dtw_distances_3.p
extended_phase_predictors.csv
final_phase_labels_15_14.csv
final_phase_labels_15_15.csv
final_phase_labels_15_1.csv
final_phase_labels_15_2.csv
final_phase_labels_15_3.csv
final_phase_labels_15_6.csv
final_phase_labels_15_7.csv
final_phase_labels_3_1.csv
final_phase_labels_3_2.csv
final_phase_labels_3_3.csv
final_phase_labels_9_8.csv
hcsta_features_3_3.csv
last_cleaned_test.csv
last_cleaned_train.csv
mds_embeddings_2d_3.csv
mds_embeddings_2d_3.p
model_per_recipe_simple.csv
more_features_with_preds_per_phase.csv
pca_features_with_preds_per_phase.csv
predictions_15_15.c

In [4]:
train_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_values.csv', index_col=0, parse_dates=['timestamp'])
test_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_values.csv', index_col=0, parse_dates=['timestamp'])
label_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col='process_id')
all_data = pd.concat([train_df, test_df], axis=0)

train_df['phase_int'] = train_df['phase'].map({'pre_rinse': 1, 
                                               'caustic': 2, 
                                               'intermediate_rinse': 4, 
                                               'acid': 8,
                                               'final_rinse': 0})
test_df['phase_int'] = test_df['phase'].map({'pre_rinse': 1, 
                                             'caustic': 2, 
                                             'intermediate_rinse': 4, 
                                             'acid': 8})
train_process_combinations = pd.DataFrame(train_df.groupby('process_id')['phase_int'].unique().apply(lambda x: sum(x)))
test_process_combinations = pd.DataFrame(test_df.groupby('process_id')['phase_int'].unique().apply(lambda x: sum(x)))
process_combinations = pd.concat([train_process_combinations, test_process_combinations], axis=0)

recipe_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/recipe_metadata.csv', index_col='process_id')
recipe_df = recipe_df.drop('final_rinse', axis=1)
recipe_df['pre_rinse_num'] = recipe_df['pre_rinse'] * 1
recipe_df['caustic_num'] = recipe_df['caustic'] * 2
recipe_df['intermediate_rinse_num'] = recipe_df['intermediate_rinse'] * 4
recipe_df['acid_num'] = recipe_df['acid'] * 8
recipe_df['recipe'] = recipe_df['pre_rinse_num'] + recipe_df['caustic_num'] + recipe_df['intermediate_rinse_num'] + recipe_df['acid_num']

  mask |= (ar1 == a)


In [0]:
ts_real = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value',
    'flow_diff'
]

# variables for binary time series features
bin_cols = [
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain',
    'object_low_level',
    'tank_lsh_caustic',
    'tank_lsh_acid',
    'tank_lsh_clean_water',
    'tank_lsh_pre_rinse'
]

flow_cols = [
    'supply_flow',
    'return_flow',
    'target_value'
]

process_comb_to_phases = {
    15: ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid'],
    3:  ['pre_rinse', 'caustic'],
    7:  ['pre_rinse', 'caustic', 'intermediate_rinse'],
    1:  ['pre_rinse'],
    8:  ['acid'],
    2:  ['caustic'],
    6:  ['caustic', 'intermediate_rinse'],
    14: ['caustic', 'intermediate_rinse', 'acid'],
}

# phases, ordered from earliest to latest
phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

def encode_categorical(df):
    # Currently just copy-pasted from http://drivendata.co/blog/rinse-over-run-benchmark/
    
    # select process_id and pipeline
    meta = df[['process_id', 'pipeline', 'object_id']].drop_duplicates().set_index('process_id') 
    meta['object_id'] = meta['object_id'] // 5
    
    # convert categorical pipeline data to dummy variables
    meta = pd.get_dummies(meta, columns=['pipeline', 'object_id'])
    
    # pipeline L12 not in test data (so useless feature)
    if 'pipeline_L12' in meta:
        meta = meta.drop('pipeline_L12', axis=1)
    
    return meta
  
def count_zeros(x):
  return np.sum(x == 0)
  
def encode_real_timeseries(df):   
    ts_df = df[['process_id'] + ts_real].set_index('process_id')
    
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', 
                                                   'count', 'median', 'sum', 
                                                   lambda x: x.tail(5).mean(),
                                                   count_zeros])
    cols = []
    for col in ts_features.columns:
        cols.append('real_{}'.format(col))
    ts_features.columns = cols
    
    flow_df = df[['process_id', 'object_id'] + flow_cols].reset_index(drop=True)
    for machine in set(flow_df['object_id']):
        machine_data = flow_df[flow_df['object_id'] == machine]
        for col in flow_cols:
            perc = np.percentile(machine_data[col], 99)
            flow_df.loc[machine_data.index, :][col] = machine_data[col].clip(0, perc)
    flow_df = flow_df.set_index('process_id')
    flow_df = flow_df.drop('object_id', axis=1)
    flow_features = flow_df.groupby('process_id').agg(['max', 'mean', 'sum'])
    
    cols = []
    for col in flow_features.columns:
        cols.append('flow_{}'.format(col))
    flow_features.columns = cols
    
    ts_features = ts_features.merge(flow_features, left_index=True, right_index=True)
    
    return ts_features

def encode_binary_timeseries(df):
    ts_df = df[['process_id'] + bin_cols].set_index('process_id')
            
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['mean', 'std', 
                                                   lambda x: x.tail(5).mean(),
                                                   count_zeros])
    
    cols = []
    for col in ts_features.columns:
        cols.append('bin_{}'.format(col))
    ts_features.columns = cols
    
    return ts_features
  
def get_tsfresh_features(df):
    extraction_settings = EfficientFCParameters()
    filtered_funcs = ['abs_energy', 'mean_abs_change', 'mean_change', 
                      'skewness', 'kurtosis', 'absolute_sum_of_changes', 
                      'longest_strike_below_mean', 'longest_strike_above_mean', 
                      'count_above_mean', 'count_below_mean', 'last_location_of_maximum', 
                      'first_location_of_maximum', 'last_location_of_minimum', 
                      'first_location_of_minimum', 
                      'percentage_of_reoccurring_datapoints_to_all_datapoints', 
                      'percentage_of_reoccurring_values_to_all_values', 
                      'sum_of_reoccurring_values', 'sum_of_reoccurring_data_points', 
                      'ratio_value_number_to_time_series_length', 'maximum', 'minimum', 
                      'cid_ce', 'symmetry_looking', 'large_standard_deviation', 'quantile', 
                      'autocorrelation', 'number_peaks', 'binned_entropy', 'index_mass_quantile', 
                      'linear_trend',  'number_crossing_m']
#     new_funcs = ['augmented_dickey_fuller', 'number_cwt_peaks', 'agg_autocorrelation',
#                'spkt_welch_density', 'friedrich_coefficients', 'max_langevin_fixed_point',
#                'c3', 'ar_coefficient', 'mean_second_derivative_central', 'ratio_beyond_r_sigma',
#                'energy_ratio_by_chunks', 'partial_autocorrelation',
#                'fft_aggregated', 'time_reversal_asymmetry_statistic', 'range_count']
#     filtered_funcs += new_funcs
    filtered_settings = {}
    for func in filtered_funcs:
      filtered_settings[func] = extraction_settings[func]

    ts_features = extract_features(df[['process_id', 'timestamp', 'return_turbidity', 'return_flow', 'supply_flow', 'target_value', 'flow_diff']], 
                                   column_id='process_id', column_sort="timestamp", 
                                   column_kind=None, column_value=None,
                                   impute_function=impute, 
                                   default_fc_parameters=filtered_settings,
                                   show_warnings=False)
  
    return ts_features
                                       

def create_feature_matrix(df, processes, phases):
#     df['return_flow'] = df['return_flow'].apply(lambda x: max(x, 0))
#     df['supply_flow'] = df['supply_flow'].apply(lambda x: max(x, 0))
#     df['target_value'] = df['return_flow'] * df['return_turbidity']
#     df['flow_diff'] = df['supply_flow'] - df['return_flow']
    
#     phase_data = df[(df['process_id'].isin(processes)) &
#                     ((df['phase'].isin(phases)))]
    
    metadata = encode_categorical(df)
    time_series = encode_real_timeseries(df)
    binary_features = encode_binary_timeseries(df)
    
    if len(phases) > 1:
      last_phase_data = df[df['phase'] == phases[-1]]
      time_series_last_phase = encode_real_timeseries(last_phase_data)
      new_cols = []
      for col in time_series_last_phase.columns:
        new_cols.append('last_{}'.format(col))
      time_series_last_phase.columns = new_cols
      binary_features_last_phase = encode_binary_timeseries(last_phase_data)
      new_cols = []
      for col in binary_features_last_phase.columns:
        new_cols.append('last_{}'.format(col))
      binary_features_last_phase.columns = new_cols
    
#     tsfresh_features = get_tsfresh_features(phase_data)
    
    # join metadata and time series features into a single dataframe
    feature_matrix = metadata
    feature_matrix = feature_matrix.merge(time_series, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(binary_features, left_index=True, right_index=True)
#     feature_matrix = feature_matrix.merge(tsfresh_features, left_index=True, right_index=True)
    
    if len(phases) > 1:
      feature_matrix = feature_matrix.merge(time_series_last_phase, left_index=True, right_index=True)
      feature_matrix = feature_matrix.merge(binary_features_last_phase, left_index=True, right_index=True)
    
    return feature_matrix
  
def create_augmented_feature_matrix(df, processes, phases, n_augments=5, mask_size=0.2):
    df['return_flow'] = df['return_flow'].apply(lambda x: max(x, 0))
    df['supply_flow'] = df['supply_flow'].apply(lambda x: max(x, 0))
    df['target_value'] = df['return_flow'] * df['return_turbidity']
    df['flow_diff'] = df['supply_flow'] - df['return_flow']
    
    phase_data = df[(df['process_id'].isin(processes)) &
                    ((df['phase'].isin(phases)))]
    
    all_features = []
    for it in range(n_augments):
        print(it)
        masked_phase_data = phase_data.sample(frac=0.8)
        masked_features = create_feature_matrix(masked_phase_data, processes, phases)
        all_features.append(masked_features)
        
    return pd.concat(all_features)
        
def custom_mape(approxes, targets):
  return np.mean(np.abs(np.subtract(approxes, targets)) / np.maximum(np.abs(targets), 0)) 
  
def get_processes(data, phases, train=True):
    filtered_processes = []
    phases = set(phases)
    processes = set(data['process_id'])
    for process in processes:
        process_phases = set(data[data['process_id'] == process]['phase'])
        if train:
            if phases.issubset(process_phases):
                filtered_processes.append(process)
        else:
            if len(phases) == len(process_phases) == len(phases.intersection(process_phases)):
                filtered_processes.append(process)
    return filtered_processes

In [0]:
recipe_train_data = train_df[train_df['process_id'].isin(recipe_df[recipe_df['recipe'] == 15].index)]
train_processes = get_processes(recipe_train_data, process_comb_to_phases[15])

In [7]:

X = create_augmented_feature_matrix(all_data, train_processes, process_comb_to_phases[15])
y = np.log(label_df.loc[X.index]['final_rinse_total_turbidity_liter'])

0
1
2
3
4


In [0]:
processes = list(set(X.index))
N_SPLITS = 5
splits = []
chunk_size = len(processes) // N_SPLITS
for i in range(N_SPLITS):
  if i < N_SPLITS - 1:
    test_processes = processes[i*chunk_size:(i+1)*chunk_size]
  else:
    test_processes = processes[i*chunk_size:]
    
  train_processes = list(set(processes) - set(test_processes))
  splits.append((train_processes, test_processes))

In [0]:
mapes = []
for train_procs, test_procs in splits:
  X_train = X.loc[train_procs, :]
  X_test = X.loc[test_procs, :]
  y_train = y.loc[train_procs]
  y_test = y.loc[test_procs]
  
  train_idx = np.random.choice(list(set(X_train.index)), replace=False, size=int(0.9 * len(set(X_train.index))))
  val_idx = list(set(X_train.index) - set(train_idx))

  X_val = X_train.loc[val_idx, :]
  y_val = y_train.loc[val_idx]
  X_train = X_train.loc[train_idx, :]
  y_train = y_train.loc[train_idx]
  
  cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, 
                          learning_rate=0.33,
                          loss_function='MAPE', eval_metric='MAPE', task_type='GPU')
  cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)
  
  predictions = np.exp(cat.predict(X_test))
  mape = custom_mape(predictions, np.exp(y_test))
  print('TEST MAPE = {}'.format(mape))
  mapes.append(mape)

0:	learn: 0.9980883	test: 0.9980869	best: 0.9980869 (0)	total: 103ms	remaining: 2h 51m 4s
50:	learn: 0.9022187	test: 0.9020816	best: 0.9020816 (50)	total: 2.91s	remaining: 1h 35m 9s
100:	learn: 0.8061344	test: 0.8059704	best: 0.8059704 (100)	total: 5.09s	remaining: 1h 23m 51s
150:	learn: 0.7100493	test: 0.7098734	best: 0.7098734 (150)	total: 7.18s	remaining: 1h 19m 10s
200:	learn: 0.6139628	test: 0.6137619	best: 0.6137619 (200)	total: 9.25s	remaining: 1h 16m 31s
250:	learn: 0.5178755	test: 0.5176676	best: 0.5176676 (250)	total: 11.4s	remaining: 1h 15m 30s
300:	learn: 0.4217879	test: 0.4215533	best: 0.4215533 (300)	total: 13.5s	remaining: 1h 14m 28s
350:	learn: 0.3296354	test: 0.3289841	best: 0.3289841 (350)	total: 16.3s	remaining: 1h 17m 2s
400:	learn: 0.2449080	test: 0.2441490	best: 0.2441490 (400)	total: 19.4s	remaining: 1h 20m 9s
450:	learn: 0.1713986	test: 0.1730618	best: 0.1730618 (450)	total: 22.6s	remaining: 1h 23m 7s
500:	learn: 0.1062286	test: 0.1111042	best: 0.1111042 (500)	t

In [0]:
print(np.mean(mapes), np.std(mapes))

#0.3856723675847643 0.09603416276181018

In [0]:
predictions = np.exp(cat.predict(X_test))
mape = custom_mape(predictions, np.exp(y_test))
print('TEST MAPE = {}'.format(mape))

In [0]:
y_val.isnull().sum()

In [22]:
train_procs

[]

In [19]:
X.shape, y.shape

((18630, 474), (18630,))

In [26]:
y_val

Series([], Name: final_rinse_total_turbidity_liter, dtype: float64)