In [1]:
!pip install tsfresh
!pip install catboost



In [2]:
# The essentials
import pandas as pd
import numpy as np

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

# Gradient Boosting
from catboost import CatBoostRegressor, Pool

from collections import defaultdict

from tsfresh.feature_selection.relevance import calculate_relevance_table

  from pandas.core import datetools


In [3]:
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
dtw_distances_3.p
extended_phase_predictors.csv
last_cleaned_test.csv
last_cleaned_train.csv
mds_embeddings_2d_3.csv
mds_embeddings_2d_3.p
more_features_with_preds_per_phase.csv
pca_features_with_preds_per_phase.csv
predictions_machine_405.csv
preds_feature_selection.csv
processes_all_phases.p
recipe_metadata.csv
test_features_14.csv
test_features_15.csv
test_features_1.csv
test_features_2.csv
test_features_3.csv
test_features_6.csv
test_features_7.csv
test_features_8.csv
test_features_per_phase_14.csv
test_features_per_phase_15.csv
test_features_per_phase_1.csv
test_features_per_phase_2.csv
test_features_per_phase_3.

In [4]:
train_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_values.csv', index_col=0, parse_dates=['timestamp'])
test_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_values.csv', index_col=0, parse_dates=['timestamp'])
label_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col='process_id')

  mask |= (ar1 == a)


In [5]:
train_df = train_df[train_df['phase'] != 'final_rinse']

train_df['phase_int'] = train_df['phase'].map({'pre_rinse': 1, 
                                               'caustic': 2, 
                                               'intermediate_rinse': 4, 
                                               'acid': 8})
test_df['phase_int'] = test_df['phase'].map({'pre_rinse': 1, 
                                             'caustic': 2, 
                                             'intermediate_rinse': 4, 
                                             'acid': 8})
train_process_combinations = pd.DataFrame(train_df.groupby('process_id')['phase_int'].unique().apply(lambda x: sum(x)))
test_process_combinations = pd.DataFrame(test_df.groupby('process_id')['phase_int'].unique().apply(lambda x: sum(x)))
process_combinations = pd.concat([train_process_combinations, test_process_combinations], axis=0)
process_combinations['weight'] = process_combinations['phase_int'].map(
    {3: 0.39838220424671383, 15: 0.22615436467812605, 7: 0.22581732389619144, 
     1: 0.09841590832490732, 8: 0.04111897539602292, 2: 0.007751937984496124, 
     14: 0.0016852039096730705, 6: 0.0006740815638692282}
)
process_combinations.head(10)

Unnamed: 0_level_0,phase_int,weight
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1
20001,15,0.226154
20002,3,0.398382
20003,15,0.226154
20004,15,0.226154
20005,8,0.041119
20008,15,0.226154
20011,15,0.226154
20014,15,0.226154
20016,15,0.226154
20017,15,0.226154


In [0]:
recipe_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/recipe_metadata.csv', index_col='process_id')
recipe_df = recipe_df.drop('final_rinse', axis=1)
recipe_df['pre_rinse_num'] = recipe_df['pre_rinse'] * 1
recipe_df['caustic_num'] = recipe_df['caustic'] * 2
recipe_df['intermediate_rinse_num'] = recipe_df['intermediate_rinse'] * 4
recipe_df['acid_num'] = recipe_df['acid'] * 8
recipe_df['recipe'] = recipe_df['pre_rinse_num'] + recipe_df['caustic_num'] + recipe_df['intermediate_rinse_num'] + recipe_df['acid_num']

In [0]:
ts_real = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value'
]

# variables we'll use to create our time series features
ts_cols = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value'
]

# variables for binary time series features
bin_cols = [
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain',
    'object_low_level'
]

process_comb_to_phases = {
    15: ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid'],
    3:  ['pre_rinse', 'caustic'],
    7:  ['pre_rinse', 'caustic', 'intermediate_rinse'],
    1:  ['pre_rinse'],
    8:  ['acid'],
    2:  ['caustic'],
    6:  ['caustic', 'intermediate_rinse'],
    14: ['caustic', 'intermediate_rinse', 'acid'],
}

# phases, ordered from earliest to latest
phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

def encode_categorical(df):
    # Currently just copy-pasted from http://drivendata.co/blog/rinse-over-run-benchmark/
    
    # select process_id and pipeline
    meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id') 
    meta = meta.merge(recipe_df[['recipe', 'pre_rinse', 'caustic', 'intermediate_rinse', 'acid']], left_index=True, right_index=True)
    meta = meta.merge(process_combinations[['phase_int']], left_index=True, right_index=True)
    
    # convert categorical pipeline data to dummy variables
    meta = pd.get_dummies(meta, columns=['pipeline', 'recipe', 'phase_int'])
    
    # pipeline L12 not in test data (so useless feature)
    if 'pipeline_L12' in meta:
        meta = meta.drop('pipeline_L12', axis=1)
    
    # calculate number of phases for each process_object
    meta['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())
    
    return meta

def percentile_25(x):
  return np.percentile(x, 0.25)

def percentile_75(x):
  return np.percentile(x, 0.75)
  
def encode_real_timeseries(df):   
    ts_df = df[['process_id'] + ts_cols].set_index('process_id')
    
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', 
                                                   'count', 'median', 'sum', 
                                                   lambda x: x.tail(5).mean()])
    
    col_map = {}
    for col in ts_features.columns:
        col_map[col] = 'real_{}'.format(col)
    ts_features = ts_features.rename(columns=col_map)
    
    return ts_features

def encode_binary_timeseries(df):
    ts_df = df[['process_id'] + bin_cols].set_index('process_id')
            
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['mean', 'std', 
                                                   lambda x: x.tail(5).mean()])
    
    col_map = {}
    for col in ts_features.columns:
        col_map[col] = 'bin_{}'.format(col)
    ts_features = ts_features.rename(columns=col_map)
    
    return ts_features

def create_feature_matrix(df):
    df['return_flow_relu'] = df['return_flow'].apply(lambda x: max(0, x))
    df['target_value'] = df['return_flow_relu'] * df['return_turbidity']
    
    metadata = encode_categorical(df)
    time_series = encode_real_timeseries(df)
    binary_features = encode_binary_timeseries(df)
    
    # join metadata and time series features into a single dataframe
    feature_matrix = metadata
    feature_matrix = feature_matrix.merge(time_series, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(binary_features, left_index=True, right_index=True)
    
    return feature_matrix
  
def get_processes(data, phases, train=True):
    filtered_processes = []
    phases = set(phases)
    processes = set(data['process_id'])
    for process in processes:
        process_phases = set(data[data['process_id'] == process]['phase'])
        if train:
            if phases.issubset(process_phases):
                filtered_processes.append(process)
        else:
            if len(phases) == len(process_phases) == len(phases.intersection(process_phases)):
                filtered_processes.append(process)
    return filtered_processes
  
def get_two_phase_processes(data, train=True):
    # Extract two phases of each process
    combinations = [['pre_rinse', 'caustic'], ['caustic', 'intermediate_rinse'], ['intermediate_rinse', 'acid'], ['pre_rinse', 'acid']]
    filtered_data = []
    for comb_nr, combination in zip([3, 6, 12, 9], combinations):
      print('Extracting all data corresponding to processes that have {} as phases'.format(combination))
      processes = get_processes(data, combination, train=train)
      df = data[(data['process_id'].isin(processes)) &
                ((data['phase'].isin(combination)))]
      features = create_feature_matrix(df)
      features['process_comb'] = comb_nr
      print(features.shape)
      filtered_data.append(features)
    return pd.concat(filtered_data)

In [9]:
one_phase_train_chunks = []
one_phase_test_chunks = []

for process_comb in tqdm([3]):
  
  train_procs = get_processes(train_df, process_comb_to_phases[process_comb])
  test_procs = get_processes(test_df, process_comb_to_phases[process_comb])
  
  train_phase_data = train_df[(train_df['process_id'].isin(train_procs)) &
                              ((train_df['phase'].isin(process_comb_to_phases[process_comb])))]
  test_phase_data = test_df[(test_df['process_id'].isin(test_procs)) &
                            ((test_df['phase'].isin(process_comb_to_phases[process_comb])))]
  
  #one_phase_train_chunks.append(get_two_phase_processes(train_df))
  #one_phase_test_chunks.append(get_two_phase_processes(test_df, train=False))
  
  #train_phase_data = get_two_phase_processes(train_df)
  #test_phase_data = get_two_phase_processes(test_df, train=False)
  
  all_phase_data = pd.concat([train_phase_data, test_phase_data], axis=0)
  phase_features = create_feature_matrix(all_phase_data)
  
  one_phase_train_chunks.append(phase_features.loc[list(set(train_phase_data['process_id']))])
  one_phase_test_chunks.append(phase_features.loc[list(set(test_phase_data['process_id']))])

one_phase_train_df = pd.concat(one_phase_train_chunks) 
one_phase_test_df = pd.concat(one_phase_test_chunks)

print(train_df.shape, one_phase_train_df.shape)

100%|██████████| 1/1 [01:20<00:00, 80.34s/it]

(4475493, 36) (4743, 178)





In [10]:
one_phase_train_df.iloc[:5, 10:20]

Unnamed: 0_level_0,pipeline_L6,pipeline_L7,pipeline_L8,pipeline_L9,recipe_3,recipe_15,phase_int_3,phase_int_7,phase_int_15,num_phases
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
24579,0,0,1,0,1,0,1,0,0,2
24581,0,0,1,0,0,1,0,0,1,2
24586,0,0,0,0,0,1,0,0,1,2
24588,0,0,0,0,0,1,0,0,1,2
24589,0,0,1,0,0,1,0,0,1,2


In [0]:
process_data = train_df[train_df['process_id'] == 20966]
phase2_start = process_data[process_data['phase'] == 'caustic'].iloc[0, :]['timestamp']
phase3_start = process_data[process_data['phase'] == 'intermediate_rinse'].iloc[0, :]['timestamp']
phase4_start = process_data[process_data['phase'] == 'acid'].iloc[0, :]['timestamp']
phase5_start = process_data[process_data['phase'] == 'final_rinse'].iloc[0, :]['timestamp']
timestamps = list(process_data['timestamp'])
return_flows = list(process_data['return_flow'].values)
plt.figure()
plt.plot(range(len(timestamps)), return_flows)
plt.axvline(x=timestamps.index(phase2_start), c='k', lw=2)
plt.axvline(x=timestamps.index(phase3_start), c='k', lw=2)
plt.axvline(x=timestamps.index(phase4_start), c='k', lw=2)
plt.axvline(x=timestamps.index(phase5_start), c='k', lw=2)
plt.show()

In [0]:
one_phase_train_df.head(5)

In [0]:
# Split the features per phase
features_per_phase = {}
for phase in phases:
  features_per_phase[phase] = one_phase_train_df[one_phase_train_df['phase'] == phase]

In [0]:
FOLDS = 5
process_ids = list(set(one_phase_train_df.index))
np.random.shuffle(process_ids)
validation_process_ids = []
chunk_size = len(process_ids) // FOLDS
for i in range(FOLDS):
  if i < FOLDS - 1:
    validation_process_ids.append(process_ids[i*chunk_size:(i+1)*chunk_size])
  else:
    validation_process_ids.append(process_ids[i*chunk_size:])

In [0]:
def custom_mape(approxes, targets):
    return np.mean(np.abs(np.subtract(approxes, targets)) / np.maximum(np.abs(targets), 290000))

def mape_1_row(x):
  return abs(x['prediction'] - x['target']) / max(290000, x['target'])

class MAPEMetric(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, targets, weight):
        return custom_mape(np.exp(approxes), np.exp(targets)), len(targets)
      
class MAPEObjective(object):
    def calc_ders_range(self, approxes, targets, weights):
        # approxes, targets, weights are indexed containers of floats (containers with only __len__ and __getitem__ defined).
        # weights parameter can be None.
        # Returns list of pairs (der1, der2)
        
        print(targets, approxes, weights)
        
        result = []
        for index in range(len(targets)):
            y = np.exp(targets[index])
            y_hat = np.exp(approxes[index])
            mape = (100*(y-y_hat)/max(np.log(290000), y), 100/y)
            if weights is not None:
              mape = (-100 * weights[index] * (y-y_hat)/max(np.log(290000), y), -100 * weights[index]/y)
              result.append(mape)
            else:
              result.append((-100 * (y-y_hat)/max(np.log(290000), y), -100/y))
            
        return result

In [15]:
one_phase_train_df = pd.get_dummies(one_phase_train_df, columns=['phase'])

ValueError: ignored

In [13]:
for fold_nr, test_idx in enumerate(validation_process_ids):
  train_idx = list(set(process_ids) - set(test_idx))
  val_idx = np.random.choice(train_idx, replace=False, size=int(0.1*len(train_idx)))
  train_idx = list(set(train_idx) - set(val_idx))
  
  X_train = one_phase_train_df.loc[train_idx]#.drop('process_comb', axis=1)
  X_test = one_phase_train_df.loc[test_idx]#.drop('process_comb', axis=1)
  X_val = one_phase_train_df.loc[val_idx]#.drop('process_comb', axis=1)
  
  y_train = np.log(label_df.loc[X_train.index])
  y_test = np.log(label_df.loc[X_test.index])
  y_val = np.log(label_df.loc[X_val.index])
  
  print(X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape)
  #train = Pool(X_train, y_train, weight=process_combinations.loc[X_train.index]['weight'].map({0.0016852039096730705: 1, 0.22615436467812605: 1, 0.39838220424671383: 10}))
  #val = Pool(X_val, y_val)
  #test = Pool(X_test)
  
  cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, learning_rate=0.33,
                          loss_function='MAPE', eval_metric=MAPEMetric())
  cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)
  
  preds_df = pd.DataFrame(index=X_test.index)
  preds_df['prediction'] = np.exp(cat.predict(X_test))
  
  agg_preds_df = pd.DataFrame(preds_df.groupby('process_id').mean()['prediction'])
  agg_preds_df['target'] = label_df.loc[test_idx]['final_rinse_total_turbidity_liter']
  agg_preds_df['mape'] = agg_preds_df[['prediction', 'target']].apply(mape_1_row, axis=1)
  agg_preds_df['weight'] = process_combinations.loc[agg_preds_df.index]['weight']
  print(agg_preds_df['mape'].mean())
  print(agg_preds_df.groupby('weight').mean())
  
  #mape_per_weight = agg_preds_df.groupby('weight')['mape'].mean()
  #mapes = list(mape_per_weight.values)
  #weights = list(mape_per_weight.index)
  #weights = [x/sum(weights) for x in weights]
  #print(sum([i * j for i, j in zip(mapes, weights)]))

(3416, 178) (948, 178) (379, 178) (3416, 1) (948, 1) (379, 1)
0:	learn: 0.8739074	test: 0.9104816	best: 0.9104816 (0)	total: 303ms	remaining: 8h 25m 15s
50:	learn: 0.8739033	test: 0.9104778	best: 0.9104778 (50)	total: 6.55s	remaining: 3h 33m 50s
100:	learn: 0.8738878	test: 0.9104641	best: 0.9104641 (100)	total: 9.01s	remaining: 2h 28m 30s
150:	learn: 0.8738308	test: 0.9104136	best: 0.9104136 (150)	total: 11.1s	remaining: 2h 2m 26s
200:	learn: 0.8736162	test: 0.9102255	best: 0.9102255 (200)	total: 13.6s	remaining: 1h 52m 48s
250:	learn: 0.8727988	test: 0.9095148	best: 0.9095148 (250)	total: 15.9s	remaining: 1h 45m 37s
300:	learn: 0.8696557	test: 0.9068012	best: 0.9068012 (300)	total: 18.1s	remaining: 1h 39m 58s
350:	learn: 0.8587863	test: 0.8974829	best: 0.8974829 (350)	total: 22s	remaining: 1h 43m 52s
400:	learn: 0.8316586	test: 0.8750484	best: 0.8750484 (400)	total: 29.5s	remaining: 2h 2m 3s
450:	learn: 0.7781979	test: 0.8276337	best: 0.8276337 (450)	total: 37.7s	remaining: 2h 18m 38s

In [31]:
agg_preds_df.groupby('weight').mean()

Unnamed: 0_level_0,prediction,target,mape
weight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.001685,1587419.0,1676502.0,0.305105
0.226154,1076383.0,1882513.0,0.255183
0.398382,1624592.0,2342949.0,0.329873


In [26]:
set(agg_preds_df['weight'])

{0.0016852039096730705, 0.22615436467812605, 0.39838220424671383}

In [0]:
X_train.describe()