In [0]:
# The essentials
import pandas as pd
import numpy as np

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

# Gradient Boosting
from catboost import CatBoostRegressor

from collections import defaultdict

In [2]:
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
dtw_distances_3.p
extended_phase_predictors.csv
last_cleaned_test.csv
last_cleaned_train.csv
mds_embeddings_2d_3.csv
mds_embeddings_2d_3.p
more_features_with_preds_per_phase.csv
pca_features_with_preds_per_phase.csv
predictions_machine_405.csv
processes_all_phases.p
test_features_14.csv
test_features_15.csv
test_features_1.csv
test_features_2.csv
test_features_3.csv
test_features_6.csv
test_features_7.csv
test_features_8.csv
test_features_per_phase_14.csv
test_features_per_phase_15.csv
test_features_per_phase_1.csv
test_features_per_phase_2.csv
test_features_per_phase_3.csv
test_features_per_phase_6.csv
test_features_

In [3]:
train_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_values.csv', index_col=0, parse_dates=['timestamp'])
test_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_values.csv', index_col=0, parse_dates=['timestamp'])
label_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col='process_id')

  mask |= (ar1 == a)


In [0]:
# variables we'll use to create our time series features
ts_cols = [
    'process_id',
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value'
]

# variables for binary time series features
bin_cols = [
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain',
    'object_low_level'
]

# phases, ordered from earliest to latest
phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

In [0]:
def encode_categorical(df):
    # Currently just copy-pasted from http://drivendata.co/blog/rinse-over-run-benchmark/
    
    # select process_id and pipeline
    meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id') 
    
    # convert categorical pipeline data to dummy variables
    meta = pd.get_dummies(meta)
    
    # pipeline L12 not in test data (so useless feature)
    if 'pipeline_L12' in meta:
        meta = meta.drop('pipeline_L12', axis=1)
    
    # calculate number of phases for each process_object
    meta['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())
    
    return meta

def percentile_25(x):
  return np.percentile(x, 0.25)

def percentile_75(x):
  return np.percentile(x, 0.75)

def fft_coeffs_real(x, coef):
  return np.fft.rfft(x)[coef].real

def fft_coeffs_imag(x, coef):
  return np.fft.rfft(x)[coef].imag

def fft_coeffs_abs(x, coef):
  return np.fft.rfft(x)[coef].abs

def fft_coeffs_angle(x, coef):
  return np.fft.rfft(x)[coef].angle

def fft_mean(x):
  return np.mean(np.abs(np.fft.rfft(x)[:250]))

def fft_std(x):
  return np.std(np.abs(np.fft.rfft(x)[:250]))

def cwt_peaks(x):
  return number_cwt_peaks(x, 5)
  
def encode_real_timeseries(df):   
    ts_df = df[ts_cols].set_index('process_id')
    
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', 
                                                   'count', 'median', 'sum', 
                                                   'mad', percentile_25,
                                                   percentile_75])#, fft_mean, fft_std])
    
    # Now we will get the mean + variance value of the last K measurements for each phase
    all_vals_per_phase = []
    K = 5
    col_names = ['process_id'] 
    for phase in phases:
        for col in ts_cols:
            col_names.extend(['mean_{}_{}_{}'.format(col, K, phase), 
                              'std_{}_{}_{}'.format(col, K, phase)])
    for process in tqdm(ts_features.index, total=len(ts_features)):
        vals_per_phase = [process]
        process_filtered_df = df[df['process_id'] == process]
        for phase in phases:
            filtered_df = process_filtered_df[process_filtered_df['phase'] == phase].tail(K)
            for col in ts_cols:
                vals_per_phase.extend([filtered_df[col].mean(), filtered_df[col].std()])
                
        all_vals_per_phase.append(vals_per_phase)
    values_df = pd.DataFrame(all_vals_per_phase, columns=col_names)
    values_df = values_df.set_index('process_id')
    
    ts_features = ts_features.merge(values_df, left_index=True, right_index=True)
    
    return ts_features

def encode_binary_timeseries(df):
    ts_df = df[ts_cols].set_index('process_id')
            
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['mean', 'std', 'count', 'sum', 'mad'])
    
    # TODO: Count fraction of True in each phase
    feature_vectors = []
    col_names = ['process_id'] 
    for phase in phases:
        for col in bin_cols:
            col_names.append('fraction_{}_{}'.format(col, phase))
            
    # Get fraction of True values for each binary timeseries
    for process in tqdm(set(df['process_id']), total=len(set(df['process_id']))):
        vector = [process]
        process_filtered_df = df[df['process_id'] == process]
        for phase in phases:
            filtered_df = process_filtered_df[process_filtered_df['phase'] == phase]
            for col in bin_cols:
                if len(filtered_df):
                    vector.append(sum(filtered_df[col]) / len(filtered_df))
                else:
                    vector.append(np.NaN)
                
        feature_vectors.append(vector)
                
    feature_df = pd.DataFrame(feature_vectors, columns=col_names)
    feature_df = feature_df.set_index('process_id')
    
    feature_df = feature_df.merge(ts_features, left_index=True, right_index=True)
    
    return feature_df

def create_feature_matrix(df):
    df['return_flow_relu'] = df['return_flow'].apply(lambda x: max(0, x))
    df['target_value'] = df['return_flow_relu'] * df['return_turbidity']
    
    metadata = encode_categorical(df)
    time_series = encode_real_timeseries(df)
    binary_features = encode_binary_timeseries(df)
    
    # join metadata and time series features into a single dataframe
    feature_matrix = metadata
    feature_matrix = feature_matrix.merge(time_series, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(binary_features, left_index=True, right_index=True)
    
    return feature_matrix

In [0]:
process_comb = 15
test_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_features_{}.csv'.format(process_comb), index_col=['process_id']).index
train_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_features_adv_{}.csv'.format(process_comb), index_col=['process_id']).index
val_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/val_features_adv_{}.csv'.format(process_comb), index_col=['process_id']).index

In [13]:
train_df_no_final = train_df[train_df['phase'] != 'final_rinse']

X_train = create_feature_matrix(train_df_no_final[train_df_no_final['process_id'].isin(train_features_index)])
X_val = create_feature_matrix(train_df_no_final[train_df_no_final['process_id'].isin(val_features_index)])
X_test = create_feature_matrix(test_df[test_df['process_id'].isin(test_features_index)])

y_train = np.log(label_df.loc[X_train.index])
y_val = np.log(label_df.loc[X_val.index])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 3354/3354 [01:12<00:00, 46.51it/s]
100%|██████████| 3354/3354 [00:47<00:00, 70.19it/s]
100%|██████████| 372/372 [00:05<00:00, 62.10it/s]
100%|██████████| 372/372 [00:03<00:00, 121.03it/s]
100%|██████████| 671/671 [00:11<00:00, 60.39it/s]
100%|██████████| 671/671 [00:05<00:00, 113.78it/s]


In [0]:
from sklearn.decomposition import PCA

null_columns = X_test.columns[X_test.isnull().sum() > 0]
for col in set(X_train.columns) - set(X_test.columns):
    X_train = X_train.drop(col, axis=1)
    X_val = X_val.drop(col, axis=1)
X_train_no_null = X_train.drop(null_columns, axis=1)
X_val_no_null = X_val.drop(null_columns, axis=1)
X_test_no_null = X_test.drop(null_columns, axis=1)

embeddings = PCA(n_components=3).fit_transform(pd.concat([X_train_no_null, X_val_no_null, X_test_no_null]))

In [0]:
X_tsne_train = pd.DataFrame(embeddings[:len(X_train), :], columns=['pca_0', 'pca_1', 'pca_2'], index=X_train.index)
X_tsne_val = pd.DataFrame(embeddings[len(X_train):(len(X_train)+len(X_val)), :], columns=['pca_0', 'pca_1', 'pca_2'], index=X_val.index)
X_tsne_test = pd.DataFrame(embeddings[(len(X_train)+len(X_val)):, :], columns=['pca_0', 'pca_1', 'pca_2'], index=X_test.index)

X_train_w_tsne = X_train.merge(X_tsne_train, left_index=True, right_index=True)
X_val_w_tsne = X_val.merge(X_tsne_val, left_index=True, right_index=True)
X_test_w_tsne = X_test.merge(X_tsne_test, left_index=True, right_index=True)

In [0]:
mds_embeddings = pd.read_csv('/content/drive/My Drive/Rinse Over Run/mds_embeddings_2d_3.csv', index_col='process_id')

X_train_w_mds = X_train.merge(mds_embeddings, how='left', left_index=True, right_index=True)
X_val_w_mds = X_val.merge(mds_embeddings, how='left', left_index=True, right_index=True)
X_test_w_mds = X_test.merge(mds_embeddings, how='left', left_index=True, right_index=True)

print(X_train.shape, X_train_w_mds.shape)

(4269, 427) (4269, 429)


In [0]:
def get_descript(data, functions, cols):
    ts_df = data.set_index('process_id').sort_values(by='timestamp')
    return ts_df.groupby('process_id')[cols].agg(functions)
  
def get_descript_prev_process(data, functions, cols):
    print(len(cols), len(functions))
    machines = set(data['object_id'])
    all_features = []
    for machine in tqdm(machines):
        machine_data = data[data['object_id'] == machine]
        machine_processes = machine_data['process_id'].unique()
        for process_ix, process in enumerate(machine_processes):
            if process_ix > 0:
                prev_process = machine_data[machine_data['process_id'] == machine_processes[process_ix - 1]]
                this_process = machine_data[machine_data['process_id'] == machine_processes[process_ix]]
                features = get_descript(prev_process, functions, cols)
                _columns = list(features.columns)
                assert len(features) == 1
                features = features.iloc[0, :].values
                time_delta = (this_process['timestamp'].values[0] - prev_process['timestamp'].values[-1]) / np.timedelta64(1, 'h')
                all_features.append([machine, process, time_delta] + list(features))
            else:
                all_features.append([machine, process, np.NaN] + ([np.NaN] * (len(cols) * len(functions))))
    print(_columns)
    all_features = pd.DataFrame(all_features, columns=['object_id', 'process_id', 'time_delta'] + _columns)
    all_features = all_features.set_index('process_id', drop=True)
    return all_features

In [21]:
ts_real = [
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid'
]

train_prev_proc_features = get_descript_prev_process(train_df_no_final[train_df_no_final['process_id'].isin(train_features_index)], ['mean', 'std', 'sum'], ts_real)
val_prev_proc_features = get_descript_prev_process(train_df_no_final[train_df_no_final['process_id'].isin(val_features_index)], ['mean', 'std', 'sum'], ts_real)

15 3


100%|██████████| 80/80 [01:21<00:00,  1.32it/s]


[('supply_flow', 'mean'), ('supply_flow', 'std'), ('supply_flow', 'sum'), ('supply_pressure', 'mean'), ('supply_pressure', 'std'), ('supply_pressure', 'sum'), ('return_temperature', 'mean'), ('return_temperature', 'std'), ('return_temperature', 'sum'), ('return_conductivity', 'mean'), ('return_conductivity', 'std'), ('return_conductivity', 'sum'), ('return_turbidity', 'mean'), ('return_turbidity', 'std'), ('return_turbidity', 'sum'), ('return_flow', 'mean'), ('return_flow', 'std'), ('return_flow', 'sum'), ('tank_level_pre_rinse', 'mean'), ('tank_level_pre_rinse', 'std'), ('tank_level_pre_rinse', 'sum'), ('tank_level_caustic', 'mean'), ('tank_level_caustic', 'std'), ('tank_level_caustic', 'sum'), ('tank_level_acid', 'mean'), ('tank_level_acid', 'std'), ('tank_level_acid', 'sum'), ('tank_level_clean_water', 'mean'), ('tank_level_clean_water', 'std'), ('tank_level_clean_water', 'sum'), ('tank_temperature_pre_rinse', 'mean'), ('tank_temperature_pre_rinse', 'std'), ('tank_temperature_pre_ri

  0%|          | 0/62 [00:00<?, ?it/s]

15 3


100%|██████████| 62/62 [00:07<00:00,  7.78it/s]

[('supply_flow', 'mean'), ('supply_flow', 'std'), ('supply_flow', 'sum'), ('supply_pressure', 'mean'), ('supply_pressure', 'std'), ('supply_pressure', 'sum'), ('return_temperature', 'mean'), ('return_temperature', 'std'), ('return_temperature', 'sum'), ('return_conductivity', 'mean'), ('return_conductivity', 'std'), ('return_conductivity', 'sum'), ('return_turbidity', 'mean'), ('return_turbidity', 'std'), ('return_turbidity', 'sum'), ('return_flow', 'mean'), ('return_flow', 'std'), ('return_flow', 'sum'), ('tank_level_pre_rinse', 'mean'), ('tank_level_pre_rinse', 'std'), ('tank_level_pre_rinse', 'sum'), ('tank_level_caustic', 'mean'), ('tank_level_caustic', 'std'), ('tank_level_caustic', 'sum'), ('tank_level_acid', 'mean'), ('tank_level_acid', 'std'), ('tank_level_acid', 'sum'), ('tank_level_clean_water', 'mean'), ('tank_level_clean_water', 'std'), ('tank_level_clean_water', 'sum'), ('tank_temperature_pre_rinse', 'mean'), ('tank_temperature_pre_rinse', 'std'), ('tank_temperature_pre_ri




In [22]:
from catboost import CatBoostRegressor
def custom_mape(approxes, targets):
    return np.mean(np.abs(np.subtract(approxes, targets)) / np.maximum(np.abs(targets), 290000))

class MAPEMetric(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, targets, weight):
        return custom_mape(np.exp(approxes), np.exp(targets)), len(targets)
      
#for col in set(X_train.columns) - set(X_val.columns):
#  X_train = X_train.drop(col, axis=1)
  
#for col in set(X_val.columns) - set(X_train.columns):
#  X_val = X_val.drop(col, axis=1)

print(X_train.shape, X_val.shape)


last_cleaned_train = pd.read_csv('/content/drive/My Drive/Rinse Over Run/last_cleaned_train.csv', index_col='Unnamed: 0')

X_train_prev_proc = X_train.merge(train_prev_proc_features, left_index=True, right_index=True)
X_val_prev_proc = X_val.merge(val_prev_proc_features, left_index=True, right_index=True)
    
#print(X_train.shape, X_train_last_cleaned.shape)

cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=250, learning_rate=0.33,
                        loss_function='MAPE', eval_metric=MAPEMetric())
cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=50)

"""
Combination 15, w/o last_cleaned:
----------------------------
bestTest = 0.2735594349
bestIteration = 1109

"""

(3354, 427) (372, 427)
0:	learn: 0.8490310	test: 0.8194781	best: 0.8194781 (0)	total: 480ms	remaining: 13h 20m 37s
50:	learn: 0.8490263	test: 0.8194732	best: 0.8194732 (50)	total: 20.3s	remaining: 11h 3m
100:	learn: 0.8490085	test: 0.8194539	best: 0.8194539 (100)	total: 27.2s	remaining: 7h 27m 42s
150:	learn: 0.8489416	test: 0.8193798	best: 0.8193798 (150)	total: 33.9s	remaining: 6h 13m 23s
200:	learn: 0.8486871	test: 0.8190915	best: 0.8190915 (200)	total: 40s	remaining: 5h 31m 25s
250:	learn: 0.8477075	test: 0.8179574	best: 0.8179574 (250)	total: 47.1s	remaining: 5h 12m 3s
300:	learn: 0.8438991	test: 0.8134602	best: 0.8134602 (300)	total: 53.1s	remaining: 4h 53m 13s
350:	learn: 0.8298248	test: 0.7972877	best: 0.7972877 (350)	total: 1m 2s	remaining: 4h 56m 32s
400:	learn: 0.7945754	test: 0.7596150	best: 0.7596150 (400)	total: 1m 25s	remaining: 5h 55m 50s
450:	learn: 0.7378302	test: 0.7049724	best: 0.7049724 (450)	total: 1m 49s	remaining: 6h 42m 41s
500:	learn: 0.5974069	test: 0.5932655

'\nCombination 15, w/o last_cleaned:\n----------------------------\nbestTest = 0.2735594349\nbestIteration = 1109\n\n'

In [0]:
for col in X_train.columns:
  if 'target' in col:
    print(col)

In [0]:
import os
from sklearn.manifold import TSNE

test_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_features_{}.csv'.format(process_comb), index_col=['process_id'])
val_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/val_features_adv_{}.csv'.format(process_comb), index_col=['process_id'])
train_features = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_features_adv_{}.csv'.format(process_comb), index_col=['process_id'])

all_labels = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col=['process_id'])
labels = pd.concat([all_labels.loc[train_features.index], all_labels.loc[val_features.index]])

embeddings = TSNE().fit_transform(pd.concat([train_features, val_features]))