In [1]:
!pip install catboost
!pip install tsfresh
!pip install xgboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/98/03/777a0e1c12571a7f3320a4fa6d5f123dba2dd7c0bca34f4f698a6396eb48/catboost-0.12.2-cp36-none-manylinux1_x86_64.whl (55.5MB)
[K    100% |████████████████████████████████| 55.5MB 682kB/s 
[?25hCollecting enum34 (from catboost)
  Downloading https://files.pythonhosted.org/packages/af/42/cb9355df32c69b553e72a2e28daee25d1611d2c0d9c272aa1d34204205b2/enum34-1.1.6-py3-none-any.whl
Installing collected packages: enum34, catboost
Successfully installed catboost-0.12.2 enum34-1.1.6


Collecting tsfresh
[?25l  Downloading https://files.pythonhosted.org/packages/2f/32/265c651f4fd70751f5ada348af0f9e322b058eddcda6a6f9bb305c8d270a/tsfresh-0.11.1-py2.py3-none-any.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 17.0MB/s 
Installing collected packages: tsfresh
Successfully installed tsfresh-0.11.1


In [2]:
# The essentials
import pandas as pd
import numpy as np

from collections import defaultdict

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Progress bars
from tqdm import tqdm

# Access our Google Drive
from google.colab import drive

# Gradient Boosting
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor

# TSFRESH Feature Extraction
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table

from sklearn.model_selection import KFold

from collections import defaultdict, Counter
from scipy.stats import norm

  from pandas.core import datetools


In [3]:
# Mount Google Drive inside the Colab VM (forcing a fresh mount), then list the
# competition folder to check that the data files are reachable.
drive.mount('/content/drive', force_remount=True)
!ls "/content/drive/My Drive/Rinse Over Run"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
20178.png
20451.png
20899.png
22112.png
22369.png
22414.png
22487.png
23011.png
23142.png
23599.png
23872.png
24804.png
24845.png
24872.png
25129.png
25908.png
25983.png
26270.png
27115.png
27243.png
27346.png
27366.png
27418.png
27508.png
all_train_preds_per_phase.p
baseline_features_with_preds_per_phase.csv
baseline_model_per_nunique_phases.csv
better_prev_object_id_per_10.csv
dtw_distances_3.p
extended_phase_predictors.csv
last_cleaned_test.csv
last_cleaned

In [4]:
# Train / test measurement tables: first CSV column becomes the index and the
# `timestamp` column is parsed into datetimes.
train_df = pd.read_csv(
    '/content/drive/My Drive/Rinse Over Run/train_values.csv',
    index_col=0,
    parse_dates=['timestamp'],
)
test_df = pd.read_csv(
    '/content/drive/My Drive/Rinse Over Run/test_values.csv',
    index_col=0,
    parse_dates=['timestamp'],
)

# Training labels, indexed by process_id.
label_df = pd.read_csv(
    '/content/drive/My Drive/Rinse Over Run/train_labels.csv',
    index_col='process_id',
)

# Train rows stacked on top of test rows.
all_data = pd.concat([train_df, test_df], axis=0)

  mask |= (ar1 == a)


In [0]:
# Rows of the final_rinse phase are removed from the training data.
train_df = train_df[train_df['phase'] != 'final_rinse']

# One bit per phase: the sum over the distinct phases of a process therefore
# identifies exactly which phases that process contains.
phase_bits = {'pre_rinse': 1, 'caustic': 2, 'intermediate_rinse': 4, 'acid': 8}
train_df['phase_int'] = train_df['phase'].map(phase_bits)
test_df['phase_int'] = test_df['phase'].map(phase_bits)

# Per-process code of the phases actually observed in the data.
train_process_combinations = pd.DataFrame(
    train_df.groupby('process_id')['phase_int'].unique().apply(lambda bits: sum(bits))
)
test_process_combinations = pd.DataFrame(
    test_df.groupby('process_id')['phase_int'].unique().apply(lambda bits: sum(bits))
)
process_combinations = pd.concat(
    [train_process_combinations, test_process_combinations],
    axis=0,
)

# Recipe metadata gets the same bit encoding, built from its per-phase columns.
recipe_df = pd.read_csv(
    '/content/drive/My Drive/Rinse Over Run/recipe_metadata.csv',
    index_col='process_id',
).drop('final_rinse', axis=1)
for phase_name, bit in phase_bits.items():
    recipe_df[phase_name + '_num'] = recipe_df[phase_name] * bit
recipe_df['recipe'] = (
    recipe_df['pre_rinse_num']
    + recipe_df['caustic_num']
    + recipe_df['intermediate_rinse_num']
    + recipe_df['acid_num']
)

In [0]:
# Real-valued time series columns (same contents as `ts_cols` below).
# Not included: 'target_value', 'flow_diff'.
ts_real = [
    # supply / return line
    'supply_flow', 'supply_pressure',
    'return_temperature', 'return_conductivity', 'return_turbidity', 'return_flow',
    # tank levels
    'tank_level_pre_rinse', 'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water',
    # tank temperatures
    'tank_temperature_pre_rinse', 'tank_temperature_caustic', 'tank_temperature_acid',
    # tank concentrations
    'tank_concentration_caustic', 'tank_concentration_acid',
]

# variables we'll use to create our time series features
# (an independent list holding the same names, in the same order, as `ts_real`)
ts_cols = list(ts_real)

# variables for binary time series features
bin_cols = [
    'supply_pump',
    'supply_pre_rinse', 'supply_caustic', 'return_caustic',
    'supply_acid', 'return_acid',
    'supply_clean_water', 'return_recovery_water', 'return_drain',
    'object_low_level',
    'tank_lsh_caustic', 'tank_lsh_acid', 'tank_lsh_clean_water', 'tank_lsh_pre_rinse',
]

# Summed phase code (pre_rinse=1, caustic=2, intermediate_rinse=4, acid=8)
# -> the phases that make up that code.
process_comb_to_phases = {
    15: ['pre_rinse', 'caustic', 'intermediate_rinse', 'acid'],
    3: ['pre_rinse', 'caustic'],
    7: ['pre_rinse', 'caustic', 'intermediate_rinse'],
    1: ['pre_rinse'],
    8: ['acid'],
    2: ['caustic'],
    6: ['caustic', 'intermediate_rinse'],
    14: ['caustic', 'intermediate_rinse', 'acid'],
}

# phases, ordered from earliest to latest
phases = [
    'pre_rinse',
    'caustic',
    'intermediate_rinse',
    'acid',
]

def get_descript(data, functions, cols):
    """Aggregate the selected columns per process.

    The frame is indexed by `process_id`, ordered by `timestamp`, grouped by
    `process_id`, and `functions` is applied to `cols` within each group.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `process_id`, `timestamp` and every column in `cols`.
    functions : aggregation spec accepted by `GroupBy.agg`
        When a list of function names is given, the result has one
        (column, function) pair per output column.
    cols : list
        Columns to aggregate.

    Returns
    -------
    pandas.DataFrame
        One row per `process_id`.
    """
    indexed = data.set_index('process_id')
    ordered = indexed.sort_values(by='timestamp')
    per_process = ordered.groupby('process_id')
    return per_process[cols].agg(functions)

  
def get_descript_prev_process(data, data_procs):
    """Describe, for each requested process, the process that ran before it on the same machine.

    Per machine (`object_id`) the rows are sorted by `timestamp`, which orders
    the machine's processes by first appearance. For every process listed in
    `data_procs`, the immediately preceding process on that machine is
    summarised using only the rows of its latest phase, where "latest" follows
    the module-level `phases` ordering.

    Relies on module-level names: `phases`, `ts_cols`, `recipe_df`,
    `process_combinations`, `get_descript` and `tqdm`.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows with at least `object_id`, `process_id`, `timestamp`, `phase`
        and every column named in `ts_cols`.
    data_procs : iterable
        Process ids for which a feature row should be produced.

    Returns
    -------
    pandas.DataFrame
        Indexed by `process_id`, one row per requested process. Every column
        name is prefixed with `prev_`, giving e.g. `prev_time_delta` (hours
        between the last row of the previous process's latest phase and the
        first row of this process), `prev_prev_recipe`, `prev_prev_phase_int`
        and `prev_('supply_flow', 'mean')`. The first process of a machine has
        no predecessor and gets a row of NaNs.

    Raises
    ------
    AssertionError
        If the previous process has no rows in any phase listed in `phases`,
        or if the computed time gap is not strictly positive.
    """
    agg_functions = ['mean', 'std', 'min', 'max', 'count']
    meta_columns = ['object_id', 'process_id', 'time_delta', 'prev_recipe', 'prev_phase_int']
    # Same labels, in the same order, as `get_descript` produces for a list of
    # functions: (column, function) tuples, column-major. Building them up
    # front keeps the frame well-defined even if no process has a predecessor.
    feature_columns = [(col, func) for col in ts_cols for func in agg_functions]
    # Placeholder width is derived from the column lists so that it always
    # matches the frame (3 missing meta values + one NaN per feature column).
    n_missing = (len(meta_columns) - 2) + len(feature_columns)
    # Set membership instead of scanning a list once per process.
    requested = set(data_procs)

    all_features = []
    machines = set(data['object_id'])
    for machine in tqdm(machines):
        machine_data = data[data['object_id'] == machine]
        machine_data = machine_data.sort_values(by='timestamp')
        machine_processes = machine_data['process_id'].unique()
        for process_ix, process in enumerate(machine_processes):
            if process not in requested:
                continue

            if process_ix == 0:
                # First process seen on this machine: nothing to look back at.
                all_features.append([machine, process] + [np.nan] * n_missing)
                continue

            prev_process_id = machine_processes[process_ix - 1]
            prev_process = machine_data[machine_data['process_id'] == prev_process_id]

            # Keep only the latest phase (per `phases` order) of the previous process.
            prev_phases = set(prev_process['phase'])
            last_phase = None
            for phase in phases:
                if phase in prev_phases:
                    last_phase = phase
            prev_process = prev_process[prev_process['phase'] == last_phase]

            features = get_descript(prev_process, agg_functions, ts_cols)
            assert len(features) == 1, (
                'expected exactly one aggregated row for previous process {}'.format(prev_process_id)
            )
            features = features.iloc[0, :].values

            this_process = machine_data[machine_data['process_id'] == process]
            prev_recipe = recipe_df.loc[prev_process_id]['recipe']
            prev_phase_int = process_combinations.loc[prev_process_id]['phase_int']
            # machine_data is time-sorted, so [0] / [-1] are the first / last rows.
            time_delta = (
                this_process['timestamp'].values[0] - prev_process['timestamp'].values[-1]
            ) / np.timedelta64(1, 'h')
            assert time_delta > 0, (
                'process {} does not start after previous process {}'.format(process, prev_process_id)
            )
            all_features.append(
                [machine, process, time_delta, prev_recipe, prev_phase_int] + list(features)
            )

    all_features = pd.DataFrame(all_features, columns=meta_columns + feature_columns)
    all_features = all_features.set_index('process_id', drop=True)
    # Prefix every remaining column with 'prev_' (tuple labels are formatted as strings).
    col_map = {col: 'prev_{}'.format(col) for col in all_features.columns}
    all_features = all_features.rename(columns=col_map)
    all_features = all_features.drop('prev_object_id', axis=1)
    return all_features
  
def get_processes(data, phases, train=True):
    """Select the process ids whose observed phases match `phases`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `process_id` and `phase` columns.
    phases : iterable
        Phases a process is required to have.
    train : bool, default True
        If True, a process qualifies when it contains *at least* the given
        phases. If False, it qualifies only when its set of phases is
        *exactly* the given set.

    Returns
    -------
    list
        Qualifying process ids.
    """
    wanted = set(phases)

    def _qualifies(observed):
        # train: superset is enough; otherwise the two sets must coincide.
        return wanted <= observed if train else wanted == observed

    return [
        process
        for process in set(data['process_id'])
        if _qualifies(set(data.loc[data['process_id'] == process, 'phase']))
    ]

In [60]:
# Restrict the combined data to a single machine (object_id 945) and list its
# processes ordered by the timestamp of their last row.
machine_mask = all_data['object_id'] == 945
sub_df = all_data[machine_mask]
last_seen = sub_df.groupby('process_id')['timestamp'].max()
last_seen.sort_values()

process_id
21409   2018-02-22 11:39:23
24815   2018-02-23 12:17:13
23719   2018-02-24 12:13:11
23878   2018-02-25 14:52:25
20136   2018-03-03 10:46:54
25852   2018-03-06 10:22:25
21166   2018-03-07 11:18:24
22491   2018-03-08 05:05:37
24293   2018-03-08 17:17:59
23325   2018-03-12 11:48:22
25408   2018-03-13 11:47:42
20301   2018-03-14 11:55:12
20390   2018-03-15 10:37:23
22410   2018-03-18 12:27:58
27104   2018-03-21 11:19:02
23437   2018-03-23 11:19:46
21831   2018-03-24 11:24:21
22683   2018-03-25 12:34:54
25144   2018-03-26 06:33:37
27849   2018-03-27 14:40:12
21215   2018-03-28 10:33:40
22528   2018-03-29 10:49:34
24469   2018-03-30 10:43:59
26520   2018-03-31 10:51:45
24675   2018-04-01 10:49:24
20998   2018-04-01 12:18:29
21763   2018-04-02 05:45:29
24674   2018-04-03 11:30:21
24369   2018-04-04 08:23:13
21926   2018-04-05 10:55:43
                ...        
25085   2018-04-21 11:53:36
24075   2018-04-22 14:52:29
27957   2018-04-24 16:34:55
23647   2018-04-25 11:12:56
22733   2

In [70]:
# Build previous-process features for every process of the selected machine.
features = get_descript_prev_process(sub_df, list(set(sub_df['process_id'])))

100%|██████████| 1/1 [00:02<00:00,  2.43s/it]


In [71]:

# Display the resulting feature table (the first process has no predecessor, hence NaNs).
features

Unnamed: 0_level_0,prev_time_delta,prev_prev_recipe,prev_prev_phase_int,"prev_('supply_flow', 'mean')","prev_('supply_flow', 'std')","prev_('supply_flow', 'min')","prev_('supply_flow', 'max')","prev_('supply_flow', 'count')","prev_('supply_pressure', 'mean')","prev_('supply_pressure', 'std')",...,"prev_('tank_concentration_caustic', 'mean')","prev_('tank_concentration_caustic', 'std')","prev_('tank_concentration_caustic', 'min')","prev_('tank_concentration_caustic', 'max')","prev_('tank_concentration_caustic', 'count')","prev_('tank_concentration_acid', 'mean')","prev_('tank_concentration_acid', 'std')","prev_('tank_concentration_acid', 'min')","prev_('tank_concentration_acid', 'max')","prev_('tank_concentration_acid', 'count')"
process_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21409,,,,,,,,,,,...,,,,,,,,,,
24815,23.603889,15.0,15.0,31661.805798,4879.402006,-14.467592,45312.500,1140.0,0.362112,0.103495,...,45.281091,0.213716,44.671623,46.302723,1140.0,44.587229,0.541692,43.587605,46.856990,1140.0
23719,22.826667,15.0,15.0,31639.386421,4536.689780,-14.467592,45572.914,1133.0,0.358917,0.100633,...,45.078843,0.167920,44.736460,46.032440,1133.0,44.447638,0.684209,38.486984,53.482544,1133.0
23878,25.500556,15.0,15.0,31593.993094,4445.702953,0.000000,45630.785,1138.0,0.361726,0.098839,...,45.247055,0.842668,41.271225,58.484760,1138.0,44.406275,0.383791,43.575665,46.108400,1138.0
20136,138.724167,15.0,15.0,31636.316749,4507.197825,-14.467592,45601.850,1182.0,0.365398,0.100285,...,44.959330,0.342919,44.661636,46.011920,1182.0,44.651873,0.231866,43.923570,45.632748,1182.0
25852,70.411944,15.0,15.0,31578.537496,4666.493284,-43.402780,45442.710,1139.0,0.360593,0.102069,...,45.300134,0.392748,44.324820,47.518192,1139.0,44.611400,0.426470,42.417030,46.057670,1139.0
21166,24.516389,15.0,15.0,31852.357200,4348.287640,14.467592,45413.773,1134.0,0.364127,0.098877,...,45.197970,0.399090,43.717133,47.528614,1134.0,44.753736,0.474308,43.232155,46.461678,1134.0
22491,16.612500,15.0,8.0,32320.543712,2735.625943,25737.846000,45529.516,744.0,0.372435,0.087010,...,45.250688,0.371016,44.030193,46.807580,744.0,44.587334,0.230807,44.243970,45.880337,744.0
24293,11.032222,15.0,15.0,31674.128178,4591.441345,-14.467592,45717.594,1137.0,0.363030,0.100871,...,45.031200,0.341312,43.802418,46.694770,1137.0,44.394832,0.470791,43.164734,46.667557,1137.0
23325,89.311944,15.0,15.0,31622.188608,4380.839729,0.000000,44415.508,1141.0,0.359098,0.099231,...,45.396234,0.255259,44.790283,47.367012,1141.0,44.582352,0.311532,43.833620,46.397110,1141.0
