In [1]:
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt

import pandas_profiling
from scipy.stats import gmean

In [2]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [117]:
%matplotlib inline
random_seed = 42

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [22]:
def import_data(filepath):
    '''Converts space-separated values file into DataFrame.
    
    Arguments:
    filepath -- str, local file location
    
    Returns:
    df -- pd.DataFrame, labelled DataFrame
    '''
    
    col_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3']
    col_names.extend(['sensor%d' % i for i in range(1, 22)])
    col_names.append('dummy')  # Accounts for the trailing separator in the source data
    
    df = pd.read_csv(filepath, sep=' ', index_col=False, names=col_names)
    df.drop(labels='dummy', axis=1, inplace=True)  # Now drop that extra column

    return df

In [162]:
def add_target_cols(df, w0=30, w1=15):
    '''Adds regression and classification target columns to df.
    RUL -- Remaining Useful Life, the number of cycles until engine failure
    RUL_frac -- RUL fraction, the proportion of the engine's total lifetime remaining
    w0 -- cycles of RUL remaining at first warning
    w1 -- cycles of RUL remaining at last warning
    
    Arguments:
    df -- pd.DataFrame, fresh from import_data
    w0 -- int, (default 30)
    w1 -- int, (default 15)
    
    Returns:
    df -- pd.DataFrame, original df plus target columns
    '''
    
    df['max_cycles'] = df.groupby('engine_id')['cycle'].transform(max)  # Temporary column for memoization
    
    df['RUL'] = pd.Series(df['RUL'] + df['max_cycles'] - df['cycle'], dtype=int)
    df['max_RUL'] = df.groupby('engine_id')['RUL'].transform(max)  # Temporary column for memoization
    
    df['RUL_frac'] = df['RUL'] / df['max_RUL']
    
    df['w0'] = pd.Series(df['RUL'] <= w0, dtype=int)
    df['w1'] = df['w0'] + pd.Series(df['RUL'] <= w1, dtype=int)
    
    df.drop(labels=['max_cycles', 'max_RUL'], axis=1, inplace=True)  # Now drop those temp columns
    
    return df

In [163]:
def preprocess_train(filepath='./data/PM_train.txt'):
    train = import_data(filepath)
    train['RUL'] = 0
    train = add_target_cols(train)
    
    return train


def preprocess_val_test(filepath='./data/PM_test.txt'):
    np.random.seed(random_seed)
    val_test = import_data(filepath)

    RULs = pd.read_csv('./data/PM_truth.txt', names=['RUL'])
    RULs['engine_id'] = pd.Series([i for i in range(1, 101)])

    val_test = val_test.merge(RULs, on='engine_id')
    val_test = add_target_cols(val_test)
    
    val_ids = np.random.choice([i for i in range(1, 101)], size=50, replace=False, )

    val = val_test[val_test['engine_id'].isin(val_ids)].reset_index(drop=True)
    test = val_test[~val_test['engine_id'].isin(val_ids)].reset_index(drop=True)
    
    return val, test

In [164]:
def check_df_shapes(dfs):
    assert len(set([df.shape[1] for df in dfs])) == 1
    print('All dfs have the same number of columns')

In [165]:
train = preprocess_train()
val, test = preprocess_val_test()

check_df_shapes([train, val, test])

All dfs have the same number of columns


In [166]:
train.iloc[185:195]

Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6,sensor7,sensor8,sensor9,sensor10,sensor11,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,RUL,RUL_frac,w0,w1
185,1,186,0.0027,-0.0003,100.0,518.67,643.51,1595.16,1426.3,14.62,21.61,552.57,2388.21,9051.37,1.3,48.12,520.08,2388.25,8123.45,8.5227,0.03,397,2388,100.0,38.47,23.0564,6,0.031414,1,2
186,1,187,-0.0047,-0.0,100.0,518.67,643.32,1592.1,1427.27,14.62,21.61,551.08,2388.29,9037.71,1.3,48.23,519.53,2388.28,8115.67,8.5218,0.03,396,2388,100.0,38.42,23.0822,5,0.026178,1,2
187,1,188,-0.0067,0.0003,100.0,518.67,643.75,1602.38,1422.78,14.62,21.61,551.94,2388.31,9037.91,1.3,48.0,519.79,2388.23,8117.69,8.5207,0.03,396,2388,100.0,38.51,22.9588,4,0.020942,1,2
188,1,189,-0.0006,0.0002,100.0,518.67,644.18,1596.17,1428.01,14.62,21.61,550.7,2388.27,9044.55,1.3,48.08,519.58,2388.33,8117.51,8.5183,0.03,395,2388,100.0,38.48,23.1127,3,0.015707,1,2
189,1,190,-0.0027,0.0001,100.0,518.67,643.64,1599.22,1425.95,14.62,21.61,551.29,2388.29,9040.58,1.3,48.33,520.04,2388.35,8112.58,8.5223,0.03,398,2388,100.0,38.49,23.0675,2,0.010471,1,2
190,1,191,-0.0,-0.0004,100.0,518.67,643.34,1602.36,1425.77,14.62,21.61,550.92,2388.28,9042.76,1.3,48.15,519.57,2388.3,8114.61,8.5174,0.03,394,2388,100.0,38.45,23.1295,1,0.005236,1,2
191,1,192,0.0009,-0.0,100.0,518.67,643.54,1601.41,1427.2,14.62,21.61,551.25,2388.32,9033.22,1.3,48.25,520.08,2388.32,8110.93,8.5113,0.03,396,2388,100.0,38.48,22.9649,0,0.0,1,2
192,2,1,-0.0018,0.0006,100.0,518.67,641.89,1583.84,1391.28,14.62,21.6,554.53,2388.01,9054.72,1.3,46.93,522.33,2388.06,8137.72,8.3905,0.03,391,2388,100.0,38.94,23.4585,286,1.0,0,0
193,2,2,0.0043,-0.0003,100.0,518.67,641.82,1587.05,1393.13,14.62,21.61,554.77,2387.98,9051.31,1.3,47.24,522.7,2387.98,8131.09,8.4167,0.03,392,2388,100.0,39.06,23.4085,285,0.996503,0,0
194,2,3,0.0018,0.0003,100.0,518.67,641.55,1588.32,1398.96,14.62,21.6,555.14,2388.04,9054.24,1.3,47.22,522.58,2387.99,8140.58,8.3802,0.03,391,2388,100.0,39.11,23.425,284,0.993007,0,0


In [167]:
val.iloc[25:35]

Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6,sensor7,sensor8,sensor9,sensor10,sensor11,sensor12,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,RUL,RUL_frac,w0,w1
25,1,26,0.0047,-0.0005,100.0,518.67,642.48,1583.28,1408.07,14.62,21.61,554.59,2388.08,9053.43,1.3,47.33,521.95,2388.07,8129.12,8.3949,0.03,391,2388,100.0,38.77,23.3557,117,0.823944,0,0
26,1,27,-0.0007,0.0001,100.0,518.67,642.08,1586.65,1400.31,14.62,21.61,554.35,2388.09,9046.1,1.3,47.34,521.82,2388.02,8127.24,8.4494,0.03,392,2388,100.0,38.87,23.3931,116,0.816901,0,0
27,1,28,0.0022,0.0005,100.0,518.67,641.93,1594.25,1401.29,14.62,21.61,553.56,2388.07,9055.56,1.3,47.05,521.84,2388.07,8134.89,8.447,0.03,392,2388,100.0,38.83,23.3502,115,0.809859,0,0
28,1,29,0.0014,0.0001,100.0,518.67,641.95,1587.15,1398.11,14.62,21.61,554.15,2388.08,9046.11,1.3,47.42,522.39,2388.07,8133.13,8.4212,0.03,392,2388,100.0,39.02,23.3621,114,0.802817,0,0
29,1,30,-0.0025,0.0004,100.0,518.67,642.79,1585.72,1400.97,14.62,21.61,554.1,2388.09,9047.45,1.3,47.4,521.78,2388.1,8134.79,8.411,0.03,391,2388,100.0,39.09,23.4069,113,0.795775,0,0
30,1,31,-0.0006,0.0004,100.0,518.67,642.58,1581.22,1398.91,14.62,21.61,554.42,2388.08,9056.4,1.3,47.23,521.79,2388.06,8130.11,8.4024,0.03,393,2388,100.0,38.81,23.3552,112,0.788732,0,0
31,5,1,0.0024,-0.0004,100.0,518.67,642.43,1590.79,1402.02,14.62,21.61,553.5,2388.07,9051.54,1.3,47.37,522.2,2388.07,8130.9,8.4426,0.03,394,2388,100.0,39.04,23.3916,188,1.0,0,0
32,5,2,0.0011,0.0002,100.0,518.67,642.85,1581.46,1409.7,14.62,21.61,553.44,2388.06,9051.9,1.3,47.54,521.63,2388.11,8128.71,8.4429,0.03,393,2388,100.0,38.87,23.3181,187,0.994681,0,0
33,5,3,0.0056,-0.0,100.0,518.67,642.88,1586.05,1407.64,14.62,21.61,553.51,2388.06,9057.29,1.3,47.27,521.8,2388.11,8132.02,8.4657,0.03,393,2388,100.0,39.02,23.3529,186,0.989362,0,0
34,5,4,-0.001,0.0003,100.0,518.67,642.79,1585.93,1403.47,14.62,21.61,553.93,2388.15,9048.64,1.3,47.43,521.8,2388.09,8130.1,8.4147,0.03,392,2388,100.0,38.98,23.3574,185,0.984043,0,0


In [143]:
RUL_cols = ['engine_id', 'cycle', 'RUL', 'RUL_frac', 'w0', 'w1']
train.iloc[172:193][RUL_cols]

Unnamed: 0,engine_id,cycle,RUL,RUL_frac,w0,w1
172,1,173,20,0.104167,1,1
173,1,174,19,0.098958,1,1
174,1,175,18,0.09375,1,1
175,1,176,17,0.088542,1,1
176,1,177,16,0.083333,1,1
177,1,178,15,0.078125,1,2
178,1,179,14,0.072917,1,2
179,1,180,13,0.067708,1,2
180,1,181,12,0.0625,1,2
181,1,182,11,0.057292,1,2


In [31]:
def check_overwrite(filepath):
    if os.path.exists(filepath):
        overwrite = input('Pickle exists, overwrite? (y/n) ')
        overwrite = (overwrite.lower() == 'y')
    else:
        overwrite = True
    
    return overwrite


def checkpoint_items(checkpoint_name, items):
    filepath = os.path.join('pickle', str(checkpoint_name) + '.pkl')
    
    if check_overwrite(filepath):
        print('Saving items to', filepath)
        with open(filepath, 'wb') as f:
            pickle.dump(items, f)
            f.close()
        print('Items saved!')
    
    else:
        print('Did not save', checkpoint_name)
        

def load_checkpoint(checkpoint_name):
    filepath = os.path.join('pickle', str(checkpoint_name) + '.pkl')
    print('Loading', filepath)
    with open(filepath, 'rb') as f:
        items = pickle.load(f)
        f.close()
    print('Items loaded!')
    return items

In [16]:
pandas_profiling.ProfileReport(train)



In [17]:
pandas_profiling.ProfileReport(val)



In [10]:
checkpoint_items('initial_dfs', (train, val, test))

Pickle exists, overwrite? (y/n) y
Saving items to pickle/initial_dfs.pkl
Items saved!


In [32]:
train, val, test = load_checkpoint('initial_dfs')

Loading pickle/initial_dfs.pkl
Items loaded!


In [33]:
dfs = (train, val, test)

In [34]:
cols_to_drop = ['sensor1', 'sensor10', 'sensor16', 'sensor18', 'sensor19', 'sensor5', 'setting3', 'sensor6']
target_cols = ['RUL', 'RUL_frac', 'w0', 'w1']

In [35]:
def drop_col_list(dfs, cols_to_drop):
    for df in dfs:
        df.drop(cols_to_drop, inplace=True, axis=1)

In [36]:
drop_col_list(dfs, cols_to_drop)
check_df_shapes([train, val, test])

In [37]:
def geo_std(sequence):
    if 0 in sequence.unique():
        return 1
    else:
        geo_mean = gmean(sequence)
        summand = sum([np.log(value / geo_mean) ** 2 for value in sequence])
        return np.exp(np.sqrt(summand / len(sequence)))

In [38]:
stat_map = {'amean': np.mean,
            'gmean': gmean,
            'median': np.median,
            'std': np.std,
            'geo_std': geo_std}

In [41]:
def add_windowed_aggs(dfs, window_size, stat_map, verbose=True):
    for df in dfs:
        for col in df.columns:
            if col.startswith('sensor'):
                if verbose:
                    print(col)
                add_stats(df, col, window_size, stat_map)
    

def add_stats(df, col, window_size, stat_map):
    for stat in stat_map.keys():
        new_name = col + f'_{stat}'
        df[new_name] = (df.groupby('engine_id')[col]
                        .rolling(window_size)
                        .apply(lambda x: stat_map[stat](pd.Series(x)))
                        .reset_index()
                        [col]
                        .fillna(method='bfill'))

In [42]:
add_windowed_aggs(dfs, 5, stat_map)
check_df_shapes(dfs)

sensor2


  from ipykernel import kernelapp as app


sensor3
sensor4
sensor7
sensor8
sensor9
sensor11
sensor12
sensor13
sensor14
sensor15
sensor17
sensor20
sensor21
sensor2
sensor3
sensor4
sensor7
sensor8
sensor9
sensor11
sensor12
sensor13
sensor14
sensor15
sensor17
sensor20
sensor21
sensor2
sensor3
sensor4
sensor7
sensor8
sensor9
sensor11
sensor12
sensor13
sensor14
sensor15
sensor17
sensor20
sensor21


In [45]:
checkpoint_items('aggregate_dfs', dfs)

Pickle exists, overwrite? (y/n) y
Saving items to pickle/aggregate_dfs.pkl
Items saved!


In [15]:
train, val, test = load_checkpoint('aggregate_dfs')

Loading pickle/aggregate_dfs.pkl
Items loaded!


In [43]:
train.head(10)

Unnamed: 0,engine_id,cycle,setting1,setting2,sensor2,sensor3,sensor4,sensor7,sensor8,sensor9,sensor11,sensor12,sensor13,sensor14,sensor15,sensor17,sensor20,sensor21,RUL,RUL_frac,w1,w0,sensor2_amean,sensor2_gmean,sensor2_median,sensor2_std,sensor2_geo_std,sensor3_amean,sensor3_gmean,sensor3_median,sensor3_std,sensor3_geo_std,sensor4_amean,sensor4_gmean,sensor4_median,sensor4_std,sensor4_geo_std,sensor7_amean,sensor7_gmean,sensor7_median,sensor7_std,sensor7_geo_std,sensor8_amean,sensor8_gmean,sensor8_median,sensor8_std,sensor8_geo_std,sensor9_amean,sensor9_gmean,sensor9_median,sensor9_std,sensor9_geo_std,sensor11_amean,sensor11_gmean,sensor11_median,sensor11_std,sensor11_geo_std,sensor12_amean,sensor12_gmean,sensor12_median,sensor12_std,sensor12_geo_std,sensor13_amean,sensor13_gmean,sensor13_median,sensor13_std,sensor13_geo_std,sensor14_amean,sensor14_gmean,sensor14_median,sensor14_std,sensor14_geo_std,sensor15_amean,sensor15_gmean,sensor15_median,sensor15_std,sensor15_geo_std,sensor17_amean,sensor17_gmean,sensor17_median,sensor17_std,sensor17_geo_std,sensor20_amean,sensor20_gmean,sensor20_median,sensor20_std,sensor20_geo_std,sensor21_amean,sensor21_gmean,sensor21_median,sensor21_std,sensor21_geo_std
0,1,1,-0.0007,-0.0004,641.82,1589.7,1400.6,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419,191,1.0,0,0,642.208,642.207966,642.35,0.20999,1.000327,1587.03,1587.025813,1587.99,3.645397,1.0023,1403.206,1403.204671,1403.14,1.931462,1.001377,554.164,554.163941,554.26,0.256016,1.000462,2388.07,2388.07,2388.06,0.023664,1.00001,9049.566,9049.56507,9049.48,4.103065,1.000453,47.328,47.327807,47.28,0.135115,1.002859,522.282,522.281857,522.28,0.386906,1.000741,2388.048,2388.048,2388.04,0.023152,1.00001,8134.194,8134.193654,8133.8,2.371418,1.000292,8.41334,8.413308,8.4195,0.023213,1.002768,391.8,391.798773,392.0,0.979796,1.002507,38.958,38.957944,38.95,0.065848,1.001691,23.39302,23.393001,23.4044,0.029962,1.001282
1,1,2,0.0019,-0.0003,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236,190,0.994764,0,0,642.208,642.207966,642.35,0.20999,1.000327,1587.03,1587.025813,1587.99,3.645397,1.0023,1403.206,1403.204671,1403.14,1.931462,1.001377,554.164,554.163941,554.26,0.256016,1.000462,2388.07,2388.07,2388.06,0.023664,1.00001,9049.566,9049.56507,9049.48,4.103065,1.000453,47.328,47.327807,47.28,0.135115,1.002859,522.282,522.281857,522.28,0.386906,1.000741,2388.048,2388.048,2388.04,0.023152,1.00001,8134.194,8134.193654,8133.8,2.371418,1.000292,8.41334,8.413308,8.4195,0.023213,1.002768,391.8,391.798773,392.0,0.979796,1.002507,38.958,38.957944,38.95,0.065848,1.001691,23.39302,23.393001,23.4044,0.029962,1.001282
2,1,3,-0.0043,0.0003,642.35,1587.99,1404.2,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,189,0.989529,0,0,642.208,642.207966,642.35,0.20999,1.000327,1587.03,1587.025813,1587.99,3.645397,1.0023,1403.206,1403.204671,1403.14,1.931462,1.001377,554.164,554.163941,554.26,0.256016,1.000462,2388.07,2388.07,2388.06,0.023664,1.00001,9049.566,9049.56507,9049.48,4.103065,1.000453,47.328,47.327807,47.28,0.135115,1.002859,522.282,522.281857,522.28,0.386906,1.000741,2388.048,2388.048,2388.04,0.023152,1.00001,8134.194,8134.193654,8133.8,2.371418,1.000292,8.41334,8.413308,8.4195,0.023213,1.002768,391.8,391.798773,392.0,0.979796,1.002507,38.958,38.957944,38.95,0.065848,1.001691,23.39302,23.393001,23.4044,0.029962,1.001282
3,1,4,0.0007,0.0,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,188,0.984293,0,0,642.208,642.207966,642.35,0.20999,1.000327,1587.03,1587.025813,1587.99,3.645397,1.0023,1403.206,1403.204671,1403.14,1.931462,1.001377,554.164,554.163941,554.26,0.256016,1.000462,2388.07,2388.07,2388.06,0.023664,1.00001,9049.566,9049.56507,9049.48,4.103065,1.000453,47.328,47.327807,47.28,0.135115,1.002859,522.282,522.281857,522.28,0.386906,1.000741,2388.048,2388.048,2388.04,0.023152,1.00001,8134.194,8134.193654,8133.8,2.371418,1.000292,8.41334,8.413308,8.4195,0.023213,1.002768,391.8,391.798773,392.0,0.979796,1.002507,38.958,38.957944,38.95,0.065848,1.001691,23.39302,23.393001,23.4044,0.029962,1.001282
4,1,5,-0.0019,-0.0002,642.37,1582.85,1406.22,554.0,2388.06,9055.15,47.28,522.19,2388.04,8133.8,8.4294,393,38.9,23.4044,187,0.979058,0,0,642.208,642.207966,642.35,0.20999,1.000327,1587.03,1587.025813,1587.99,3.645397,1.0023,1403.206,1403.204671,1403.14,1.931462,1.001377,554.164,554.163941,554.26,0.256016,1.000462,2388.07,2388.07,2388.06,0.023664,1.00001,9049.566,9049.56507,9049.48,4.103065,1.000453,47.328,47.327807,47.28,0.135115,1.002859,522.282,522.281857,522.28,0.386906,1.000741,2388.048,2388.048,2388.04,0.023152,1.00001,8134.194,8134.193654,8133.8,2.371418,1.000292,8.41334,8.413308,8.4195,0.023213,1.002768,391.8,391.798773,392.0,0.979796,1.002507,38.958,38.957944,38.95,0.065848,1.001691,23.39302,23.393001,23.4044,0.029962,1.001282
5,1,6,-0.0043,-0.0001,642.1,1584.47,1398.37,554.67,2388.02,9049.68,47.16,521.68,2388.03,8132.85,8.4108,391,38.98,23.3669,186,0.973822,0,0,642.264,642.26399,642.35,0.114822,1.000179,1585.984,1585.980195,1584.47,3.475593,1.002192,1402.76,1402.757557,1403.14,2.617396,1.001868,554.226,554.225905,554.26,0.324382,1.000585,2388.062,2388.062,2388.06,0.031241,1.000013,9050.264,9050.263223,9049.68,3.751142,1.000415,47.266,47.265831,47.27,0.126586,1.002679,522.286,522.285861,522.28,0.380505,1.000729,2388.05,2388.05,2388.04,0.020976,1.000009,8133.04,8133.039955,8133.23,0.857485,1.000105,8.4116,8.411568,8.4178,0.023011,1.002744,391.6,391.598671,392.0,1.019804,1.002609,38.942,38.941973,38.95,0.045782,1.001176,23.3826,23.382583,23.3739,0.028118,1.001203
6,1,7,0.001,0.0001,642.48,1592.32,1397.77,554.34,2388.02,9059.13,47.36,522.32,2388.03,8132.32,8.3974,392,39.1,23.3774,185,0.968586,0,0,642.33,642.329988,642.35,0.12474,1.000194,1586.084,1586.079816,1584.47,3.645126,1.002299,1401.686,1401.682202,1401.87,3.263192,1.002331,554.344,554.343956,554.34,0.220418,1.000398,2388.058,2388.058,2388.06,0.034871,1.000015,9053.276,9053.27528,9052.94,3.612061,1.000399,47.24,47.239925,47.27,0.084143,1.001783,522.294,522.293861,522.32,0.380715,1.000729,2388.042,2388.042,2388.03,0.019391,1.000008,8133.206,8133.20598,8133.23,0.575243,1.000071,8.40472,8.404694,8.4108,0.020997,1.002504,391.6,391.598671,392.0,1.019804,1.002609,38.962,38.961923,38.95,0.077563,1.001991,23.37336,23.373352,23.3739,0.01935,1.000828
7,1,8,-0.0034,0.0003,642.56,1582.96,1400.97,553.85,2388.0,9040.8,47.24,522.47,2388.03,8131.07,8.4076,391,38.97,23.3106,184,0.963351,0,0,642.372,642.371981,642.37,0.155872,1.000243,1585.078,1585.073751,1582.96,3.674264,1.002317,1401.04,1401.036766,1400.97,3.011578,1.00215,554.262,554.261919,554.34,0.298757,1.000539,2388.042,2388.042,2388.02,0.039192,1.000016,9050.848,9050.845886,9049.68,6.185413,1.000684,47.234,47.233927,47.24,0.082849,1.001755,522.304,522.303858,522.32,0.384531,1.000737,2388.042,2388.042,2388.03,0.019391,1.000008,8132.774,8132.773935,8132.85,1.027942,1.000126,8.40268,8.402656,8.4076,0.020104,1.002397,391.8,391.799286,392.0,0.748331,1.001911,38.966,38.965923,38.97,0.077356,1.001986,23.36664,23.36662,23.3739,0.030773,1.001319
8,1,9,0.0008,0.0001,642.12,1590.98,1394.8,553.69,2388.05,9046.46,47.29,521.79,2388.05,8125.69,8.3728,392,39.05,23.4066,183,0.958115,0,0,642.326,642.325973,642.37,0.186505,1.00029,1586.716,1586.710729,1584.47,4.091076,1.00258,1399.626,1399.620747,1398.37,3.836658,1.002743,554.11,554.109888,554.0,0.352874,1.000637,2388.03,2388.03,2388.02,0.021909,1.000009,9050.244,9050.241714,9049.68,6.43204,1.000711,47.266,47.265954,47.28,0.065605,1.001389,522.09,522.089911,522.19,0.30509,1.000585,2388.036,2388.036,2388.03,0.008,1.000003,8131.146,8131.145494,8132.32,2.867031,1.000353,8.4036,8.40358,8.4076,0.01855,1.002211,391.8,391.799286,392.0,0.748331,1.001911,39.0,38.999939,38.98,0.068993,1.001771,23.37318,23.373154,23.3774,0.034828,1.001492
9,1,10,-0.0033,0.0001,641.71,1591.24,1400.46,553.59,2388.05,9051.7,47.03,521.79,2388.06,8129.38,8.4286,393,38.95,23.4694,182,0.95288,0,0,642.194,642.193928,642.12,0.304736,1.000475,1588.394,1588.389267,1590.98,3.876254,1.002445,1398.474,1398.47227,1398.37,2.199042,1.001574,554.028,554.027847,553.85,0.41165,1.000743,2388.028,2388.028,2388.02,0.019391,1.000008,9049.554,9049.551983,9049.68,6.041959,1.000668,47.216,47.215863,47.24,0.113596,1.00241,522.01,522.009902,521.79,0.320437,1.000614,2388.04,2388.04,2388.03,0.012649,1.000005,8130.262,8130.261591,8131.07,2.579422,1.000317,8.40344,8.40342,8.4076,0.018329,1.002185,391.8,391.799286,392.0,0.748331,1.001911,39.01,39.00996,38.98,0.056214,1.001441,23.38618,23.386122,23.3774,0.051969,1.002224


In [44]:
train.iloc[20]

engine_id              1.000000
cycle                 21.000000
setting1              -0.001200
setting2               0.000100
sensor2              642.370000
sensor3             1586.070000
sensor4             1398.130000
sensor7              554.080000
sensor8             2388.110000
sensor9             9048.150000
sensor11              47.150000
sensor12             522.420000
sensor13            2388.080000
sensor14            8134.020000
sensor15               8.404900
sensor17             392.000000
sensor20              39.090000
sensor21              23.310100
RUL                  171.000000
RUL_frac               0.895288
w1                     0.000000
w0                     0.000000
sensor2_amean        642.480000
sensor2_gmean        642.479871
sensor2_median       642.580000
sensor2_std            0.407774
sensor2_geo_std        1.000635
sensor3_amean       1586.148000
sensor3_gmean       1586.144676
sensor3_median      1586.070000
sensor3_std            3.247284
sensor3_

In [18]:
input_cols = ['engine', 'cycle']
linreg_tng_cols = list(set(train.columns).difference(set(target_cols)).difference(set(input_cols)))

In [19]:
linreg_noregular = LinearRegression()
linreg_noregular_X = train[linreg_tng_cols].copy()
linreg_noregular_y = train['RUL_frac'].copy()

linreg_noregular.fit(linreg_noregular_X, linreg_noregular_y)
linreg_noregular.score(linreg_noregular_X, linreg_noregular_y)

0.6781285259519296

In [20]:
linreg_noregular_val_X = val[linreg_tng_cols].copy()
linreg_noregular_val_y = val['RUL_frac'].copy()
linreg_noregular.score(linreg_noregular_val_X, linreg_noregular_val_y)

0.447466288759126

In [21]:
train.groupby('engine_id')['cycle'].max().reset_index()['cycle'].unique() > 125

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [22]:
val.groupby('engine_id')['cycle'].max().reset_index()['cycle'].unique() > 125

array([False, False, False,  True,  True, False,  True,  True, False,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False, False, False,  True,  True,  True, False, False,
       False,  True,  True, False,  True, False,  True,  True, False,
        True])

In [23]:
lr_noreg = LinearRegression()

cutoff_train_125_cycles = train['cycle'] > 125

lr_noreg_X = train[cutoff_train_125_cycles][linreg_tng_cols].copy()
lr_noreg_y = train[cutoff_train_125_cycles]['RUL_frac'].copy()

lr_noreg.fit(lr_noreg_X, lr_noreg_y)
lr_noreg.score(lr_noreg_X, lr_noreg_y)

0.8139820027846307

In [24]:
cutoff_val_125_cycles = val['cycle'] > 125

lr_noreg_val_X = val[cutoff_val_125_cycles][linreg_tng_cols].copy()
lr_noreg_val_y = val[cutoff_val_125_cycles]['RUL_frac'].copy()
lr_noreg.score(lr_noreg_val_X, lr_noreg_val_y)

0.6340534166392304