In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('processminer-rare-event-mts - data.csv')
df.head()

Unnamed: 0,time,y,x1,x2,x3,x4,x5,x6,x7,x8,...,x52,x53,x54,x55,x56,x57,x58,x59,x60,x61
0,5/1/99 0:00,0,0.376665,-4.596435,-4.095756,13.497687,-0.11883,-20.669883,0.000732,-0.061114,...,10.091721,0.053279,-4.936434,-24.590146,18.515436,3.4734,0.033444,0.953219,0.006076,0
1,5/1/99 0:02,0,0.47572,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,-0.061114,...,10.095871,0.062801,-4.937179,-32.413266,22.760065,2.682933,0.033536,1.090502,0.006083,0
2,5/1/99 0:04,0,0.363848,-4.681394,-4.353147,14.127998,-0.138636,-17.836632,0.010803,-0.061114,...,10.100265,0.072322,-4.937924,-34.183774,27.004663,3.537487,0.033629,1.84054,0.00609,0
3,5/1/99 0:06,0,0.30159,-4.758934,-4.023612,13.161567,-0.148142,-18.517601,0.002075,-0.061114,...,10.10466,0.0816,-4.938669,-35.954281,21.672449,3.986095,0.033721,2.55488,0.006097,0
4,5/1/99 0:08,0,0.265578,-4.749928,-4.33315,15.26734,-0.155314,-17.505913,0.000732,-0.061114,...,10.109054,0.091121,-4.939414,-37.724789,21.907251,3.601573,0.033777,1.410494,0.006105,0


In [3]:
display(df.shape)
one_indexes = df.index[df['y'] == 1]
display(one_indexes.shape)

(18398, 63)

(124,)

In [4]:
sign = lambda x: (1, -1)[x < 0]
def curve_shift(df, shift_by):
    vector = df['y'].copy()
    for s in range(abs(shift_by)):
        tmp = vector.shift(sign(shift_by))
        tmp = tmp.fillna(0)
        vector += tmp
    labelcol = 'y'
    # Add vector to the df
    df.insert(loc=0, column=labelcol+'tmp', value=vector)
    # Remove the rows with labelcol == 1.
    df = df.drop(df[df[labelcol] == 1].index)
    # Drop labelcol and rename the tmp col as labelcol
    df = df.drop(labelcol, axis=1)
    df = df.rename(columns={labelcol+'tmp': labelcol})
    # Make the labelcol binary
    df.loc[df[labelcol] > 0, labelcol] = 1
    return df

In [5]:
one_indexes = df.index[df['y'] == 1]
one_indexes.shape

(124,)

In [6]:
print('Before shifting')
df.iloc[(one_indexes[0]-3):(one_indexes[0]+4), 0:5].head()

Before shifting


Unnamed: 0,time,y,x1,x2,x3
256,5/1/99 8:32,0,1.016235,-4.058394,-1.097158
257,5/1/99 8:34,0,1.005602,-3.876199,-1.074373
258,5/1/99 8:36,0,0.933933,-3.868467,-1.249954
259,5/1/99 8:38,1,0.892311,-13.332664,-10.006578
260,5/1/99 10:50,0,0.020062,-3.987897,-1.248529


In [8]:
# Shift the response column y by 2 rows to do a 4-min ahead prediction.
df = curve_shift(df, shift_by = -2)

In [9]:
print('After shifting')  # Validating if the shift happened correctly.
df.iloc[(one_indexes[0]-5):(one_indexes[0]+5), 0:5].head()

After shifting


Unnamed: 0,y,time,x1,x2,x3
254,0.0,5/1/99 8:28,0.975947,-3.913736,-1.304682
255,1.0,5/1/99 8:30,0.997107,-3.86572,-1.133779
256,1.0,5/1/99 8:32,1.016235,-4.058394,-1.097158
260,0.0,5/1/99 10:50,0.020062,-3.987897,-1.248529
261,0.0,5/1/99 10:52,-0.109346,-5.0711,-2.409911


In [10]:
# Remove time column, and the categorical columns
df = df.drop(['time', 'x28', 'x61'], axis=1)

In [11]:
df.shape

(18027, 60)

In [13]:
input_X = df.loc[:, df.columns != 'y'].values  # converts the df to a numpy array
input_y = df['y'].values
n_features = input_X.shape[1]  # number of features

In [14]:
def temporalize(X, y, lookback):
    X = []
    y = []
    for i in range(len(input_X)-lookback-1):
        t = []
        for j in range(1,lookback+1):
            # Gather past records upto the lookback period
            t.append(input_X[[(i+j+1)], :])
        X.append(t)
        y.append(input_y[i+lookback+1])
    return X, y

In [15]:
'''
Test: The 3D tensors (arrays) for LSTM are forming correctly.
'''
print('First instance of y = 1 in the original data')
display(df.iloc[(np.where(np.array(input_y) == 1)[0][0]-5):(np.where(np.array(input_y) == 1)[0][0]+1), ])

lookback = 5  # Equivalent to 10 min of past data.
# Temporalize the data
X, y = temporalize(X = input_X, y = input_y, lookback = lookback)

print('For the same instance of y = 1, we are keeping past 5 samples in the 3D predictor array, X.')
display(pd.DataFrame(np.concatenate(X[np.where(np.array(y) == 1)[0][0]], axis=0 )))

First instance of y = 1 in the original data


Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x51,x52,x53,x54,x55,x56,x57,x58,x59,x60
250,0.0,0.97908,-3.979111,-1.137012,-2.321619,0.023183,11.405922,-0.011353,-0.049696,-0.040129,...,29.984624,11.239426,-0.752385,-5.014704,-67.454037,66.650995,4.114269,0.034271,2.984553,0.007808
251,0.0,0.95035,-4.217456,-1.159475,-4.261438,0.618902,13.127663,-0.009339,-0.051043,-0.059966,...,29.984624,11.244064,-0.752385,-5.014798,-67.454037,70.301904,4.474437,0.034003,3.929323,0.007792
252,0.0,0.987078,-4.025989,-1.210205,0.899603,0.450338,14.098854,0.000732,-0.051043,-0.059966,...,29.984624,11.248703,-0.752385,-5.014893,-67.454037,66.232568,4.114269,0.033726,4.845087,0.007776
253,0.0,0.921726,-3.728572,-1.230373,-1.598718,0.227178,14.594612,6.1e-05,-0.051043,-0.040129,...,29.984624,11.253342,-0.752385,-5.014987,-58.029477,66.310022,3.537487,0.032518,4.9695,0.00776
254,0.0,0.975947,-3.913736,-1.304682,0.561987,0.004034,14.630532,0.000732,-0.051043,-0.040129,...,29.984624,11.257736,-0.752385,-5.015081,-61.783749,71.917352,3.4734,0.03131,2.981432,0.007743
255,1.0,0.997107,-3.86572,-1.133779,0.377295,-0.219126,14.66642,0.000732,-0.061114,-0.040129,...,29.984624,11.262375,-0.752385,-5.015176,-70.151791,73.876977,3.4734,0.030776,2.563593,0.007727


For the same instance of y = 1, we are keeping past 5 samples in the 3D predictor array, X.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,0.95035,-4.217456,-1.159475,-4.261438,0.618902,13.127663,-0.009339,-0.051043,-0.059966,0.001791,...,29.984624,11.244064,-0.752385,-5.014798,-67.454037,70.301904,4.474437,0.034003,3.929323,0.007792
1,0.987078,-4.025989,-1.210205,0.899603,0.450338,14.098854,0.000732,-0.051043,-0.059966,0.001791,...,29.984624,11.248703,-0.752385,-5.014893,-67.454037,66.232568,4.114269,0.033726,4.845087,0.007776
2,0.921726,-3.728572,-1.230373,-1.598718,0.227178,14.594612,6.1e-05,-0.051043,-0.040129,0.001791,...,29.984624,11.253342,-0.752385,-5.014987,-58.029477,66.310022,3.537487,0.032518,4.9695,0.00776
3,0.975947,-3.913736,-1.304682,0.561987,0.004034,14.630532,0.000732,-0.051043,-0.040129,0.001791,...,29.984624,11.257736,-0.752385,-5.015081,-61.783749,71.917352,3.4734,0.03131,2.981432,0.007743
4,0.997107,-3.86572,-1.133779,0.377295,-0.219126,14.66642,0.000732,-0.061114,-0.040129,0.001791,...,29.984624,11.262375,-0.752385,-5.015176,-70.151791,73.876977,3.4734,0.030776,2.563593,0.007727
