In [21]:
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt

import pandas_profiling
from scipy.stats import gmean

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global RNG. preprocess_val_test draws its validation engine ids
# with np.random.choice, so the validation/test split depends on this seed and
# on how many draws have happened since it was set.
np.random.seed(42)

In [3]:
def import_data(filepath):
    """Read a space-delimited, header-less engine log into a DataFrame.

    Columns are named ``engine_id``, ``cycle``, ``setting1``..``setting3``
    and ``sensor1``..``sensor21``. A 27th placeholder column called
    ``dummy`` is parsed and then discarded.

    Parameters
    ----------
    filepath : str or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        One row per input line with the 26 named columns.
    """
    id_and_settings = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3']
    sensors = ['sensor%d' % n for n in range(1, 22)]
    # presumably 'dummy' absorbs the empty field left by a trailing delimiter
    # on each line -- confirm against the raw files
    header = id_and_settings + sensors + ['dummy']

    raw = pd.read_csv(filepath, sep=' ', index_col=False, names=header, header=None)
    return raw.drop(columns='dummy')

In [4]:
def adjust_RUL(df):
    """Turn a per-engine RUL offset into a per-row remaining-useful-life value.

    Each row's ``RUL`` becomes
    ``RUL + (largest cycle recorded for that engine_id) - cycle``,
    truncated to an integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``engine_id``, ``cycle`` and ``RUL`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``RUL`` column is
        overwritten in place.

    Notes
    -----
    Not idempotent: calling it twice on the same frame adds the remaining
    cycles a second time.
    """
    # One grouped pass gives every row its engine's last cycle. The previous
    # row-wise apply re-filtered the whole frame for each row (quadratic).
    last_cycle = df.groupby('engine_id')['cycle'].transform('max')
    df['RUL'] = (df['RUL'] + last_cycle - df['cycle']).astype('int64')
    return df


def add_warnings(df, w0, w1):
    """Add the integer warning columns ``w1`` and ``w0`` derived from ``RUL``.

    * ``w1`` is 1 where ``RUL <= w1`` and 0 otherwise.
    * ``w0`` equals ``w1`` plus 1 where ``RUL <= w0`` (so with ``w0 < w1``
      it takes the values 0, 1 or 2).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``RUL`` column.
    w0, w1 : number
        Thresholds compared against ``RUL``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``w1`` and ``w0`` added
        (or overwritten) in place, in that order.
    """
    # Vectorized comparisons replace the row-wise apply: they are faster and
    # also behave on an empty frame, where apply(axis=1) hands back a
    # DataFrame instead of a Series.
    within_w1 = (df['RUL'] <= w1).astype('int64')
    within_w0 = (df['RUL'] <= w0).astype('int64')
    df['w1'] = within_w1
    df['w0'] = within_w1 + within_w0
    return df


def label_df(df, w0=15, w1=30):
    """Label a frame: adjust ``RUL`` per row, then add the warning columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed first to ``adjust_RUL`` and then to ``add_warnings``.
    w0, w1 : number, optional
        Thresholds forwarded to ``add_warnings`` (defaults 15 and 30).

    Returns
    -------
    pandas.DataFrame
        Whatever ``add_warnings`` returns for the adjusted frame.
    """
    with_rul = adjust_RUL(df)
    return add_warnings(with_rul, w0, w1)

In [33]:
def preprocess_train(filepath='./data/PM_train.txt'):
    """Load the training log and label it.

    ``RUL`` is initialised to 0 for every row before labelling, so after
    ``label_df`` it holds the cycles left until each engine's last recorded
    cycle.

    Parameters
    ----------
    filepath : str, optional
        Location of the training file handed to ``import_data``.

    Returns
    -------
    pandas.DataFrame
        The labelled training frame. Its shape is printed as a side effect.
    """
    labelled = label_df(import_data(filepath).assign(RUL=0))
    print('Train DF shape:', labelled.shape)
    return labelled


def preprocess_val_test(filepath='./data/PM_test.txt',
                        truth_filepath='./data/PM_truth.txt',
                        val_fraction=0.5):
    """Load the test log, attach ground-truth RUL, label it and split by engine.

    Parameters
    ----------
    filepath : str, optional
        Location of the test log handed to ``import_data``.
    truth_filepath : str, optional
        Single-column file of ground-truth RUL values. Engine ids are
        assigned by row position (row k -> engine k), so the file is
        assumed to list engines in id order starting at 1.
    val_fraction : float, optional
        Share of engines (not rows) placed in the validation frame; the
        remainder form the test frame. Must lie in [0, 1].

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(val, test)``, each with a fresh 0..n-1 index. Their shapes are
        printed as a side effect.

    Raises
    ------
    ValueError
        If ``val_fraction`` is outside [0, 1].

    Notes
    -----
    The split is drawn from NumPy's global RNG, so it is only reproducible
    if ``np.random.seed`` was called beforehand and no other draws happened
    in between. With a 100-engine truth file and the defaults this selects
    the same 50 validation engines as the earlier hard-coded version.
    """
    if not 0 <= val_fraction <= 1:
        raise ValueError('val_fraction must be between 0 and 1')

    val_test = import_data(filepath)

    RULs = pd.read_csv(truth_filepath, names=['RUL'])
    # Derive the engine ids from the truth file instead of assuming 100 engines.
    engine_ids = np.arange(1, len(RULs) + 1)
    RULs['engine_id'] = engine_ids

    val_test = val_test.merge(RULs, on='engine_id')
    val_test = label_df(val_test)

    n_val = int(round(len(engine_ids) * val_fraction))
    val_ids = np.random.choice(engine_ids, size=n_val, replace=False)

    in_val = val_test['engine_id'].isin(val_ids)
    val = val_test[in_val].reset_index(drop=True)
    test = val_test[~in_val].reset_index(drop=True)

    print('Validation DF shape:', val.shape)
    print('Test DF shape:', test.shape)

    return val, test

In [34]:
# Build the labelled training frame and the validation/test frames from the
# default ./data files.
# NOTE(review): the validation/test split draws from NumPy's global RNG, so
# re-running this cell without re-seeding first yields a different split.
train = preprocess_train()
val, test = preprocess_val_test()

Train DF shape: (20631, 29)
Validation DF shape: (6485, 29)
Test DF shape: (6611, 29)


In [60]:
def check_overwrite(filepath):
    """Decide whether ``filepath`` may be written.

    Returns True straight away when nothing exists at ``filepath``.
    Otherwise the user is prompted on stdin and True is returned only when
    the reply, lower-cased, is exactly ``'y'``.

    Parameters
    ----------
    filepath : str
        Path that is about to be written.

    Returns
    -------
    bool
        True if writing should proceed, False otherwise.
    """
    if not os.path.exists(filepath):
        return True

    answer = input('Pickle exists, overwrite? (y/n) ')
    return answer.lower() == 'y'


def checkpoint_items(checkpoint_name, items):
    """Pickle ``items`` to ``pickle/<checkpoint_name>.pkl``.

    If the target file already exists, ``check_overwrite`` asks the user
    before replacing it; when the user declines nothing is written.

    Parameters
    ----------
    checkpoint_name : object
        Converted with ``str()`` and used as the file stem.
    items : object
        Any picklable object.

    Returns
    -------
    None
        Progress messages are printed as a side effect.
    """
    filepath = os.path.join('pickle', str(checkpoint_name) + '.pkl')

    if not check_overwrite(filepath):
        print('Did not save', checkpoint_name)
        return

    # Create the target directory on first use; without this, open() raises
    # FileNotFoundError when 'pickle/' does not exist yet.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    print('Saving items to', filepath)
    # The with-block closes the file; no explicit close() is needed.
    with open(filepath, 'wb') as f:
        pickle.dump(items, f)
    print('Items saved!')
        

def load_checkpoint(checkpoint_name):
    """Unpickle and return the contents of ``pickle/<checkpoint_name>.pkl``.

    Parameters
    ----------
    checkpoint_name : object
        Converted with ``str()`` and used as the file stem.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Notes
    -----
    Prints a message before and after loading. Unpickling can execute
    arbitrary code, so only load checkpoint files you created yourself.
    """
    pickle_name = str(checkpoint_name) + '.pkl'
    filepath = os.path.join('pickle', pickle_name)

    print('Loading', filepath)
    with open(filepath, 'rb') as handle:
        items = pickle.load(handle)
    print('Items loaded!')

    return items

In [22]:
# Exploratory profiling report for the training frame (rendered as the cell output).
pandas_profiling.ProfileReport(train)



In [38]:
# Exploratory profiling report for the validation frame (rendered as the cell output).
pandas_profiling.ProfileReport(val)



In [50]:
# Columns earmarked for removal; nothing in this section applies the list yet.
# NOTE(review): presumably the columns the profiling reports flagged as
# uninformative -- confirm the selection criterion.
drop_cols = [
    'sensor1', 'sensor10', 'sensor16', 'sensor18',
    'sensor19', 'sensor5', 'setting3', 'sensor6',
]

In [59]:
# Persist the three labelled frames to pickle/initial_dfs.pkl; prompts on stdin
# before replacing an existing file.
checkpoint_items('initial_dfs', (train, val, test))

Pickle exists, overwrite? (y/n) n
Did not save initial_dfs


In [61]:
# Rebind train/val/test to the frames stored in pickle/initial_dfs.pkl.
# NOTE(review): in the recorded run the overwrite prompt was answered 'n', so
# these frames come from an earlier pickle rather than from the preprocessing
# cell above -- confirm they still match the current preprocessing code.
train, val, test = load_checkpoint('initial_dfs')

Loading pickle/initial_dfs.pkl
Items loaded!
