In [1]:
!pip install catboost



In [0]:
# The essentials
import pandas as pd
import numpy as np

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

# Gradient Boosting
from catboost import CatBoostRegressor

from collections import defaultdict

In [3]:
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
dtw_distances_3.p
more_features_with_preds_per_phase.csv
predictions_machine_405.csv
test_features_14.csv
test_features_15.csv
test_features_1.csv
test_features_2.csv
test_features_3.csv
test_features_6.csv
test_features_7.csv
test_features_8.csv
test_features_per_phase_14.csv
test_features_per_phase_15.csv
test_features_per_phase_1.csv
test_features_per_phase_2.csv
test_features_per_phase_3.csv
test_features_per_phase_6.csv
test_features_per_phase_7.csv
test_features_per_phase_8.csv
test_preds_per_phase.p
test_values.csv
train_features_14.csv
train_features_15.csv
train_features_1.csv
train_features_2.csv
train_featu

In [0]:
# variables we'll use to create our time series features
ts_cols = [
    'process_id',
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'target_value'
]

# variables for binary time series features
bin_cols = [
    'supply_pump',
    'supply_pre_rinse',
    'supply_caustic',
    'return_caustic',
    'supply_acid',
    'return_acid',
    'supply_clean_water',
    'return_recovery_water',
    'return_drain',
    'object_low_level'
]

# phases, ordered from earliest to latest
phases = ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid']

def encode_categorical(df):
    # Currently just copy-pasted from http://drivendata.co/blog/rinse-over-run-benchmark/
    
    # select process_id and pipeline
    meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id') 
    
    # convert categorical pipeline data to dummy variables
    meta = pd.get_dummies(meta)
    
    # pipeline L12 not in test data (so useless feature)
    if 'pipeline_L12' in meta:
        meta = meta.drop('pipeline_L12', axis=1)
    
    # calculate number of phases for each process_object
    meta['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())
    
    return meta

def percentile_25(x):
  return np.percentile(x, 0.25)

def percentile_75(x):
  return np.percentile(x, 0.75)
  
def encode_real_timeseries(df):   
    ts_df = df[ts_cols].set_index('process_id')
    
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', 
                                                   'count', 'median', 'sum', 
                                                   'mad', percentile_25,
                                                   percentile_75])
    
    # Now we will get the mean + variance value of the last K measurements for each phase
    all_vals_per_phase = []
    K = 5
    col_names = ['process_id'] 
    for phase in phases:
        for col in ts_cols:
            col_names.extend(['mean_{}_{}_{}'.format(col, K, phase), 
                              'std_{}_{}_{}'.format(col, K, phase)])
    for process in tqdm(ts_features.index, total=len(ts_features)):
        vals_per_phase = [process]
        process_filtered_df = df[df['process_id'] == process]
        for phase in phases:
            filtered_df = process_filtered_df[process_filtered_df['phase'] == phase].tail(K)
            for col in ts_cols:
                vals_per_phase.extend([filtered_df[col].mean(), filtered_df[col].std()])
                
        all_vals_per_phase.append(vals_per_phase)
    values_df = pd.DataFrame(all_vals_per_phase, columns=col_names)
    values_df = values_df.set_index('process_id')
    
    ts_features = ts_features.merge(values_df, left_index=True, right_index=True)
    
    return ts_features

def encode_binary_timeseries(df):
    ts_df = df[ts_cols].set_index('process_id')
            
    # create features: count, min, max, mean, standard deviation
    ts_features = ts_df.groupby('process_id').agg(['mean', 'std', 'count', 'sum', 'mad'])
    
    # TODO: Count fraction of True in each phase
    feature_vectors = []
    col_names = ['process_id'] 
    for phase in phases:
        for col in bin_cols:
            col_names.append('fraction_{}_{}'.format(col, phase))
            
    # Get fraction of True values for each binary timeseries
    for process in tqdm(set(df['process_id']), total=len(set(df['process_id']))):
        vector = [process]
        process_filtered_df = df[df['process_id'] == process]
        for phase in phases:
            filtered_df = process_filtered_df[process_filtered_df['phase'] == phase]
            for col in bin_cols:
                if len(filtered_df):
                    vector.append(sum(filtered_df[col]) / len(filtered_df))
                else:
                    vector.append(np.NaN)
                
        feature_vectors.append(vector)
                
    feature_df = pd.DataFrame(feature_vectors, columns=col_names)
    feature_df = feature_df.set_index('process_id')
    
    feature_df = feature_df.merge(ts_features, left_index=True, right_index=True)
    
    return feature_df

def create_feature_matrix(df):
    df['return_flow_relu'] = df['return_flow'].apply(lambda x: max(0, x))
    df['target_value'] = df['return_flow_relu'] * df['return_turbidity']
    
    metadata = encode_categorical(df)
    time_series = encode_real_timeseries(df)
    binary_features = encode_binary_timeseries(df)
    
    # join metadata and time series features into a single dataframe
    feature_matrix = metadata
    feature_matrix = feature_matrix.merge(time_series, left_index=True, right_index=True)
    feature_matrix = feature_matrix.merge(binary_features, left_index=True, right_index=True)
    
    return feature_matrix

In [5]:
train_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_values.csv', index_col=0, parse_dates=['timestamp'])
test_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_values.csv', index_col=0, parse_dates=['timestamp'])
label_df = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_labels.csv', index_col='process_id')

  mask |= (ar1 == a)


In [7]:
process_comb = 15

test_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/test_features_{}.csv'.format(process_comb), index_col=['process_id']).index
train_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/train_features_adv_{}.csv'.format(process_comb), index_col=['process_id']).index
val_features_index = pd.read_csv('/content/drive/My Drive/Rinse Over Run/val_features_adv_{}.csv'.format(process_comb), index_col=['process_id']).index

X_train = create_feature_matrix(train_df[train_df['process_id'].isin(train_features_index)])
X_val = create_feature_matrix(train_df[train_df['process_id'].isin(val_features_index)])

y_train = np.log(label_df.loc[train_features_index])
y_val = np.log(label_df.loc[val_features_index])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 3354/3354 [01:35<00:00, 35.13it/s]
100%|██████████| 3354/3354 [01:25<00:00, 39.21it/s]
100%|██████████| 372/372 [00:10<00:00, 35.18it/s]
100%|██████████| 372/372 [00:05<00:00, 69.23it/s]


In [0]:
def custom_mape(approxes, targets):
    return np.mean(np.abs(np.subtract(approxes, targets)) / np.maximum(np.abs(targets), 290000))

class MAPEMetric(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, targets, weight):
        return custom_mape(np.exp(approxes), np.exp(targets)), len(targets)

In [20]:
import scipy.stats as stats

for _ in range(10):
  subsample = np.random.choice(list(X_train.index), replace=False, size=250)
  X_train_sub = X_train.loc[subsample, :]
  y_train_sub = y_train.loc[subsample]
  
  # Sample random tree depth
  tree_depth = stats.randint(4, 10).rvs(1)[0]
  
  # Sample random num_count
  num_count = stats.randint(1, 255).rvs(1)[0]
  
  # Sample random regularizer strength
  reg_strength = stats.randint(1, 10).rvs(1)[0]
  
  cat = CatBoostRegressor(iterations=100000, od_type='Iter', od_wait=100, learning_rate=0.33,
                          loss_function='MAPE', eval_metric=MAPEMetric(), depth=8,
                          border_count=254, l2_leaf_reg=1)
  cat.fit(X_train_sub, y_train_sub, eval_set=(X_val, y_val), verbose=50)#, logging_level='Silent')
  
  print('tree depth = {}, border_count = {}, l2_leaf_reg = {}'.format(tree_depth, num_count, reg_strength))
  print(custom_mape(np.exp(cat.predict(X_val)), np.exp(y_val.values.flatten())))

0:	learn: 0.8463738	test: 0.8194781	best: 0.8194781 (0)	total: 1.92s	remaining: 2d 5h 14m 32s
50:	learn: 0.8463693	test: 0.8194741	best: 0.8194741 (50)	total: 50.4s	remaining: 1d 3h 25m 15s
100:	learn: 0.8463531	test: 0.8194591	best: 0.8194591 (100)	total: 52.5s	remaining: 14h 26m 11s
150:	learn: 0.8462951	test: 0.8194064	best: 0.8194064 (150)	total: 54.9s	remaining: 10h 5m 9s
200:	learn: 0.8460883	test: 0.8192213	best: 0.8192213 (200)	total: 57.2s	remaining: 7h 53m 19s
250:	learn: 0.8453500	test: 0.8185708	best: 0.8185708 (250)	total: 59.5s	remaining: 6h 34m 25s
300:	learn: 0.8426899	test: 0.8162738	best: 0.8162738 (300)	total: 1m 2s	remaining: 5h 44m 3s
350:	learn: 0.8331765	test: 0.8081914	best: 0.8081914 (350)	total: 1m 4s	remaining: 5h 5m 17s
400:	learn: 0.8016973	test: 0.7825931	best: 0.7825931 (400)	total: 1m 8s	remaining: 4h 41m 57s
450:	learn: 0.7275551	test: 0.7293945	best: 0.7293945 (450)	total: 1m 47s	remaining: 6h 34m 32s
500:	learn: 0.5780433	test: 0.6636836	best: 0.66368

KeyboardInterrupt: ignored

In [16]:
print(custom_mape(np.exp(cat.predict(X_val)), np.exp(y_val.values.flatten())))

0.6494277841736327
