In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [4]:
# Data directories used throughout the notebook, rooted in the user's home directory.
DATA = Path.home().joinpath('work', 'data')
DATA_RAW = DATA.joinpath('raw')              # raw inputs are read from here
DATA_PROCESSED = DATA.joinpath('processed')  # split / truncated outputs are written here

# Load raw data

In [3]:
%%time
# Training time series: first CSV column becomes the index, 'timestamp' is parsed as datetime.
train_values = pd.read_csv(
    DATA_RAW.joinpath('train_values.zip'),
    index_col=0,
    parse_dates=['timestamp'],
)

  mask |= (ar1 == a)


CPU times: user 24.9 s, sys: 1.13 s, total: 26 s
Wall time: 26 s


In [4]:
%%time
# Provided test time series, loaded with the same options as the training values.
test_values = pd.read_csv(
    DATA_RAW.joinpath('test_values.zip'),
    index_col=0,
    parse_dates=['timestamp'],
)

CPU times: user 7.67 s, sys: 75.8 ms, total: 7.74 s
Wall time: 7.74 s


In [5]:
# Target values; the first CSV column becomes the index (used below as process ids).
train_labels = pd.read_csv(
    DATA_RAW.joinpath('train_labels.csv'),
    index_col=0,
)

# Drop final rinse

In [6]:
# Keep every row whose phase is anything other than 'final_rinse'.
train_view = train_values.loc[train_values['phase'].ne('final_rinse')]

This drops processes that only had a final rinse. We drop them from target values for consistency:

In [7]:
# Processes present in the raw values but absent from the filtered view consisted
# solely of a final rinse; their labels are removed as well.
# errors='ignore' keeps this cell re-runnable: `train_labels` is reassigned here, so
# on a second execution the ids are already gone and a plain drop() would raise KeyError.
final_rinse_only = set(train_values['process_id'].unique()) - set(train_view['process_id'].unique())
train_labels = train_labels.drop(list(final_rinse_only), errors='ignore')

# Split train data

In [8]:
# Seed NumPy's global RNG. The `.sample()` calls below pass no `random_state`, so they
# draw from this global state: the split and the truncation are only reproducible when
# the cells are executed once, in order, after this one.
np.random.seed(123)

In [9]:
# 70% of the labelled processes are sampled for training; the hold-out set is the remainder.
train_target = train_labels.loc[:, ['final_rinse_total_turbidity_liter']].sample(frac=0.7)
test_target = train_labels.loc[:, ['final_rinse_total_turbidity_liter']].drop(index=train_target.index)

In [10]:
# Route each time-series row to the side of the split its process_id was assigned to.
train_timeseries = train_view.loc[train_view['process_id'].isin(train_target.index)]
test_timeseries = train_view.loc[train_view['process_id'].isin(test_target.index)]

# Truncate

In [11]:
def count_phases(data):
    """Return the share of processes (in %) for each number of distinct phases.

    The result is a Series named ``pct``, indexed in ascending order by the number
    of distinct ``phase`` values per ``process_id`` and rounded to one decimal.
    """
    phases_per_process = data.groupby('process_id')['phase'].nunique()
    counts = phases_per_process.value_counts().sort_index()
    share = counts / counts.sum()
    return share.mul(100).round(1).rename('pct')

In [12]:
# Phase-count distribution (in %) of our data once the final rinse rows are removed.
count_phases(train_view)

1     4.8
2    20.3
3     0.4
4    74.4
Name: pct, dtype: float64

In [13]:
# Same distribution for `test_values` (labelled 'hidden_test' in the visual check further
# down); this is the reference the truncation below tries to reproduce.
count_phases(test_values)

1    14.7
2    39.9
3    22.8
4    22.6
Name: pct, dtype: float64

## Truncation method

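From the 74.4% of processes that have all 4 phases, we randomly pick a subset amounting to roughly **51.8%** of *all* processes (every percentage below is relative to the total number of processes, not to the 4-phase pool), and split it randomly:
* **9.9%** are truncated to 1 phase;
* **19.6%** are truncated to 2 phases;
* **22.4%** are truncated to 3 phases.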


In [14]:
# {number of phases to keep: fraction of ALL processes to truncate to that length}.
# Each value matches the gap between the two phase-count tables displayed above
# (test_values minus train_view) for that number of phases.
trunc_params = {
    1: 0.099,  # 14.7% - 4.8%
    2: 0.196,  # 39.9% - 20.3%
    3: 0.224,  # 22.8% - 0.4%
}

# Phase order used when truncating: keeping k phases means keeping phases[:k].
phases = [
    'pre_rinse',
    'caustic',
    'intermediate_rinse',
    'acid',
]

In [15]:
def truncate(data, params=None, phase_order=None):
    """Cut a random subset of complete processes down to their first phases.

    Processes that contain every phase of ``phase_order`` form the sampling pool.
    For each ``{phases_to_keep: fraction}`` entry of ``params``, a number of
    processes equal to ``fraction`` of ALL processes in ``data`` is drawn from the
    pool without replacement, and every row of those processes whose phase comes
    after the first ``phases_to_keep`` entries of ``phase_order`` is removed.
    A process is truncated at most once.

    Parameters
    ----------
    data : DataFrame with at least ``process_id`` and ``phase`` columns.
    params : dict, optional
        Mapping ``phases_to_keep -> fraction of the total population``.
        Defaults to the module-level ``trunc_params``.
    phase_order : list of str, optional
        Phases in truncation order. Defaults to the module-level ``phases``.

    Returns
    -------
    DataFrame
        A filtered copy/view of ``data``; the input frame is not modified.

    Notes
    -----
    Sampling draws from NumPy's global RNG (no ``random_state`` is passed), so
    results depend on the seed set earlier and on the order of calls.
    If a requested count exceeds what is left in the pool, the whole remaining
    pool is used instead of raising.
    """
    if params is None:
        params = trunc_params
    if phase_order is None:
        phase_order = phases

    phase_count = data.groupby('process_id')['phase'].nunique()
    nb_processes = len(phase_count)  # percentages are expressed in terms of the total population
    # only processes containing every phase can be truncated to any requested length
    sample_pool = phase_count[phase_count == len(phase_order)]
    for keep, pct in params.items():
        # never ask for more processes than the pool still holds
        count = min(int(round(pct * nb_processes)), len(sample_pool))
        processes = sample_pool.sample(count).index.tolist()
        rows_to_remove = data['process_id'].isin(processes) & data['phase'].isin(phase_order[keep:])
        data = data[~rows_to_remove]
        # drop selected processes so they are not selected again
        sample_pool = sample_pool[~sample_pool.index.isin(processes)]
    return data

In [16]:
# Train is truncated before test; both calls run one after the other, in this order.
train_ts_truncated, test_ts_truncated = truncate(train_timeseries), truncate(test_timeseries)

## Visual check against hidden test set

In [17]:
def drop_consecutive_duplicates(s):
    """Join the values of ``s`` with ' + ', collapsing runs of equal consecutive values."""
    starts_new_run = s.ne(s.shift())
    return ' + '.join(s.loc[starts_new_run])

def recipe_count(data):
    """Count processes per recipe, i.e. per ' + '-joined sequence of phases.

    Rows are taken in their existing order within each ``process_id``.
    Returns a Series named ``count`` as produced by ``value_counts``.
    """
    recipes = data.groupby('process_id')['phase'].apply(drop_consecutive_duplicates)
    return recipes.value_counts().rename('count')

In [18]:
# Side-by-side recipe counts: provided test set vs. our truncated hold-out set.
pd.concat(
    [
        recipe_count(test_values).rename('hidden_test'),
        recipe_count(test_ts_truncated).rename('our_test'),
    ],
    axis=1,
    sort=False,
).fillna(0).astype(int)

Unnamed: 0,hidden_test,our_test
pre_rinse + caustic,1182,589
pre_rinse + caustic + intermediate_rinse + acid,671,351
pre_rinse + caustic + intermediate_rinse,670,336
pre_rinse,292,149
acid,122,58
caustic,23,11
caustic + intermediate_rinse + acid,5,7
caustic + intermediate_rinse,2,0


In [19]:
# Side-by-side recipe counts: provided test set vs. our truncated training set.
pd.concat(
    [
        recipe_count(test_values).rename('hidden_test'),
        recipe_count(train_ts_truncated).rename('our_train'),
    ],
    axis=1,
    sort=False,
).fillna(0).astype(int)

Unnamed: 0,hidden_test,our_train
pre_rinse + caustic,1182,1409
pre_rinse + caustic + intermediate_rinse + acid,671,777
pre_rinse + caustic + intermediate_rinse,670,785
pre_rinse,292,350
acid,122,141
caustic,23,27
caustic + intermediate_rinse + acid,5,15
caustic + intermediate_rinse,2,0


Looks good - we can ignore low-count recipes.

# Sanity checks

Did we drop data points or columns?

In [20]:
# The two splits together must account for every row of the filtered view.
assert len(train_timeseries) + len(test_timeseries) == len(train_view)

In [21]:
# Both splits must have the same number of columns as the filtered view.
assert train_timeseries.shape[1] == train_view.shape[1] and test_timeseries.shape[1] == train_view.shape[1]

Have we lost any processes?

In [22]:
# Truncation removes rows, never whole processes: process counts must still add up.
assert (
    train_ts_truncated['process_id'].nunique() + test_ts_truncated['process_id'].nunique()
    == train_view['process_id'].nunique()
)

Did we bleed data?

In [23]:
# No process id may appear on both sides of the split.
assert set(train_ts_truncated['process_id'].unique()).isdisjoint(test_ts_truncated['process_id'].unique())

Do we have target values for all processes?

In [24]:
# Every process in the training time series must have a row in train_target.
assert set(train_ts_truncated['process_id'].unique()) <= set(train_target.index)

In [25]:
# Every process in the hold-out time series must have a row in test_target.
assert set(test_ts_truncated['process_id'].unique()) <= set(test_target.index)

# Save data

In [26]:
# Targets are ordered by index so they are easier to line up with features later on.
train_target, test_target = train_target.sort_index(), test_target.sort_index()

In [27]:
# to_parquet() does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Targets
train_target.to_parquet(DATA_PROCESSED / 'train_target.parquet')
test_target.to_parquet(DATA_PROCESSED / 'test_target.parquet')

# Truncated time series
train_ts_truncated.to_parquet(DATA_PROCESSED / 'train_ts_truncated.parquet')
test_ts_truncated.to_parquet(DATA_PROCESSED / 'test_ts_truncated.parquet')

In [28]:
# The provided test set is written back unchanged, only converted to parquet,
# next to the raw files.
test_values.to_parquet(DATA_RAW.joinpath('test_values.parquet'))