In [1]:
import pandas as pd
import numpy as np
import featuretools as ft
import composeml as cp
from tqdm.notebook import tqdm
import datetime
import os

# Set working dir and file location
# os.getcwd()
os.chdir('/Users/epreble/Documents/Data')
file_name = "example_data.xlsx"

# df_raw

In [2]:
column_names = [
        "Notebook_ID",
        "Stage_of_Operation",
        "Timestamp",
        "device_1",
        "device_2",
        "time_until_overpressure"]

df_raw = pd.read_excel(file_name, header=0)
df_raw.columns = column_names

# df_clean

In [3]:
df_clean = df_raw.copy()

### Create Elapsed Time for entire run by Experiment

In [4]:
df_clean["experiment_time"] = (
        df_clean.Timestamp.astype(int)
        .div(10 ** 9)
        .groupby(df_clean.Notebook_ID)
        .transform(lambda x: x.diff().fillna(0).cumsum())
    )

In [5]:
# Create index_column for FeatureTools to use
df_clean['index_column'] = df_clean.index

In [6]:
df_clean.head()

Unnamed: 0,Notebook_ID,Stage_of_Operation,Timestamp,device_1,device_2,time_until_overpressure,experiment_time,index_column
0,Experiment_1,1,2019-11-06 05:00:10,-1.3784,0.63279,6000,0.0,0
1,Experiment_1,1,2019-11-06 05:00:40,-1.3779,0.63258,6000,30.0,1
2,Experiment_1,1,2019-11-06 05:01:10,-1.3779,0.63306,6000,60.0,2
3,Experiment_1,1,2019-11-06 05:01:40,-1.3781,0.63305,6000,90.0,3
4,Experiment_1,1,2019-11-06 05:02:10,-1.3779,0.63252,6000,120.0,4


# Create FeatureTools Functions

In [7]:
def time_until_overpressure(df):
    return df['time_until_overpressure']

In [8]:
def make_entityset(data):
    es = ft.EntitySet('Dataset')

    es.entity_from_dataframe(
        dataframe=data,
        entity_id='recordings',
        index='index_column',
        time_index='Timestamp',
    )

    es.normalize_entity(
        base_entity_id='recordings',
        new_entity_id='experiment',
        index='Notebook_ID',
    )

    return es

In [9]:
# Featuretools Function
def run_ft(df, columns_to_engineer, primitives_to_engineer, primitives_depth):
    
    # Adapted from 
    # https://github.com/Featuretools/predict-remaining-useful-life/blob/master/Simple%20Featuretools%20RUL%20Demo.ipynb

    lm = cp.LabelMaker(
        target_entity='Notebook_ID',
        time_index='Timestamp',
        labeling_function=time_until_overpressure)
    
    label_times = pd.DataFrame(columns=['Notebook_ID','time','time_until_overpressure'])
    label_times.index.names = ['id']
    label_times[['Notebook_ID','time','time_until_overpressure']] = df[['Notebook_ID','Timestamp','time_until_overpressure']]
    # print(label_times.head())

    # create entityset
    es = make_entityset(df[columns_to_engineer])
    
    # Prevents a warning, not sure what this does
    es.add_last_time_indexes()
    
    # run feature engineering
    fm, features = ft.dfs(
        entityset=es,
        target_entity='experiment',
        agg_primitives=primitives_to_engineer,
        trans_primitives=[],
        cutoff_time=label_times,
        max_depth=primitives_depth,
        training_window="30 m",
        verbose=True,
        )
    
    return fm, features

In [10]:
def merge_features_back_onto_df(df, fm):
    # Combine the new features back onto the starting dataframe
    # set fm index to be identical with index of experiment in df_transform_ft
    fm = fm.set_index(df.index)

    # Confirm that two indices are equal
    if fm.index.equals(df.index):
        print('Indices line up, concatenate possible')
        # Determine which columns are new, which can be concatenated to the dataframe
        new_columns = []
        new_columns = list(set(fm.columns) - set(df.columns))

        if new_columns:
            new_columns.sort()
            df = pd.concat([df,pd.DataFrame(columns=new_columns)])

        # For columns that aren't new, the update method will update them (can't concatenate existing columns on an index)
        df.update(fm)
    else:       
        print('UNEQUAL INDEX ISSUE:')
        exit()
    return df

# Featuretools to generate 1st order vars (features on raw data)

In [11]:
df_transform_ft_1 = df_clean.copy()

In [12]:
df_transform_ft_1.head()

Unnamed: 0,Notebook_ID,Stage_of_Operation,Timestamp,device_1,device_2,time_until_overpressure,experiment_time,index_column
0,Experiment_1,1,2019-11-06 05:00:10,-1.3784,0.63279,6000,0.0,0
1,Experiment_1,1,2019-11-06 05:00:40,-1.3779,0.63258,6000,30.0,1
2,Experiment_1,1,2019-11-06 05:01:10,-1.3779,0.63306,6000,60.0,2
3,Experiment_1,1,2019-11-06 05:01:40,-1.3781,0.63305,6000,90.0,3
4,Experiment_1,1,2019-11-06 05:02:10,-1.3779,0.63252,6000,120.0,4


In [13]:
# list of all columns that we want to make features from, plus columns that featuretools needs
columns_to_engineer = ['Notebook_ID', 'Timestamp', 'device_1', 'device_2', 'index_column']

In [14]:
primitives_to_engineer = ['max', 'min', 'mean', 'std', 'trend']
primitives_depth = 2

In [15]:
# Create 5 features for each of two device columns
fm, features = run_ft(df_transform_ft_1, columns_to_engineer, primitives_to_engineer, primitives_depth)
df_transform_ft_1 = merge_features_back_onto_df(df_transform_ft_1, fm)
df_transform_ft_1.head()

Built 10 features
Elapsed: 01:28 | Progress: 100%|██████████
Indices line up, concatenate possible


Unnamed: 0,Notebook_ID,Stage_of_Operation,Timestamp,device_1,device_2,time_until_overpressure,experiment_time,index_column,MAX(recordings.device_1),MAX(recordings.device_2),MEAN(recordings.device_1),MEAN(recordings.device_2),MIN(recordings.device_1),MIN(recordings.device_2),STD(recordings.device_1),STD(recordings.device_2),"TREND(recordings.device_1, Timestamp)","TREND(recordings.device_2, Timestamp)"
0,Experiment_1,1.0,2019-11-06 05:00:10,-1.3784,0.63279,6000.0,0.0,0.0,-1.3784,0.63279,-1.3784,0.63279,-1.3784,0.63279,,,,
1,Experiment_1,1.0,2019-11-06 05:00:40,-1.3779,0.63258,6000.0,30.0,1.0,-1.3779,0.63279,-1.37815,0.632685,-1.3784,0.63258,0.000353553,0.000148492,,
2,Experiment_1,1.0,2019-11-06 05:01:10,-1.3779,0.63306,6000.0,60.0,2.0,-1.3779,0.63306,-1.37807,0.63281,-1.3784,0.63258,0.000288675,0.000240624,8.33333e-06,4.5e-06
3,Experiment_1,1.0,2019-11-06 05:01:40,-1.3781,0.63305,6000.0,90.0,3.0,-1.3779,0.63306,-1.37807,0.63287,-1.3784,0.63258,0.000236291,0.000230217,3e-06,4.2e-06
4,Experiment_1,1.0,2019-11-06 05:02:10,-1.3779,0.63252,6000.0,120.0,4.0,-1.3779,0.63306,-1.37804,0.6328,-1.3784,0.63252,0.000219089,0.000253476,2.66667e-06,-2.33333e-07


# Featuretools to generate 2nd order vars (features on 1st order vars)

Generate second order features based on the first order vars so far:
    e.g. The stdev of the max, or the trend of the stdev etc..

In [16]:
# Test Cases
primitives_to_engineer = ['trend']
primitives_depth = 2

# Generate trend for 'device_1' (original raw data column)
# works fine
columns_to_engineer_2a = ['Timestamp','Notebook_ID','index_column','device_1']

# Generate trend for 'STD(recordings.device_1)'
# Fails - assertion error
columns_to_engineer_2b = ['Timestamp','Notebook_ID','index_column','STD(recordings.device_1)']

# Generate trend for 'device_1' & 'STD(recordings.device_1)'
# Works for 'device_1' only, 'STD(recordings.device_1)' is ignored
columns_to_engineer_2c = ['Timestamp','Notebook_ID','index_column','device_1','STD(recordings.device_1)']

# Rename 'STD(recordings.device_1)' --> 'renamed_column'
# Also doesn't work
columns_to_engineer_2d = ['Timestamp','Notebook_ID','index_column','renamed_column']

In [17]:
# Case 2a
fm, features = run_ft(df_transform_ft_1, columns_to_engineer_2a, primitives_to_engineer, primitives_depth)
df_transform_ft_2a = merge_features_back_onto_df(df_transform_ft_1, fm)
df_transform_ft_2a.head()

Built 1 features
Elapsed: 00:55 | Progress: 100%|██████████
Indices line up, concatenate possible


Unnamed: 0,Notebook_ID,Stage_of_Operation,Timestamp,device_1,device_2,time_until_overpressure,experiment_time,index_column,MAX(recordings.device_1),MAX(recordings.device_2),MEAN(recordings.device_1),MEAN(recordings.device_2),MIN(recordings.device_1),MIN(recordings.device_2),STD(recordings.device_1),STD(recordings.device_2),"TREND(recordings.device_1, Timestamp)","TREND(recordings.device_2, Timestamp)"
0,Experiment_1,1.0,2019-11-06 05:00:10,-1.3784,0.63279,6000.0,0.0,0.0,-1.3784,0.63279,-1.3784,0.63279,-1.3784,0.63279,,,,
1,Experiment_1,1.0,2019-11-06 05:00:40,-1.3779,0.63258,6000.0,30.0,1.0,-1.3779,0.63279,-1.37815,0.632685,-1.3784,0.63258,0.000353553,0.000148492,,
2,Experiment_1,1.0,2019-11-06 05:01:10,-1.3779,0.63306,6000.0,60.0,2.0,-1.3779,0.63306,-1.37807,0.63281,-1.3784,0.63258,0.000288675,0.000240624,8.33333e-06,4.5e-06
3,Experiment_1,1.0,2019-11-06 05:01:40,-1.3781,0.63305,6000.0,90.0,3.0,-1.3779,0.63306,-1.37807,0.63287,-1.3784,0.63258,0.000236291,0.000230217,3e-06,4.2e-06
4,Experiment_1,1.0,2019-11-06 05:02:10,-1.3779,0.63252,6000.0,120.0,4.0,-1.3779,0.63306,-1.37804,0.6328,-1.3784,0.63252,0.000219089,0.000253476,2.66667e-06,-2.33333e-07


In [18]:
# Case 2b
fm, features = run_ft(df_transform_ft_1, columns_to_engineer_2b, primitives_to_engineer, primitives_depth)
df_transform_ft_2b = merge_features_back_onto_df(df_transform_ft_1, fm)
df_transform_ft_2b.head()

Built 0 features


  agg_primitives: ['trend']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


AssertionError: features must be a non-empty list of features

In [19]:
# Case 2c
fm, features = run_ft(df_transform_ft_1, columns_to_engineer_2c, primitives_to_engineer, primitives_depth)
df_transform_ft_2c = merge_features_back_onto_df(df_transform_ft_1, fm)
df_transform_ft_2c.head()

Built 1 features
Elapsed: 00:56 | Progress: 100%|██████████
Indices line up, concatenate possible


Unnamed: 0,Notebook_ID,Stage_of_Operation,Timestamp,device_1,device_2,time_until_overpressure,experiment_time,index_column,MAX(recordings.device_1),MAX(recordings.device_2),MEAN(recordings.device_1),MEAN(recordings.device_2),MIN(recordings.device_1),MIN(recordings.device_2),STD(recordings.device_1),STD(recordings.device_2),"TREND(recordings.device_1, Timestamp)","TREND(recordings.device_2, Timestamp)"
0,Experiment_1,1.0,2019-11-06 05:00:10,-1.3784,0.63279,6000.0,0.0,0.0,-1.3784,0.63279,-1.3784,0.63279,-1.3784,0.63279,,,,
1,Experiment_1,1.0,2019-11-06 05:00:40,-1.3779,0.63258,6000.0,30.0,1.0,-1.3779,0.63279,-1.37815,0.632685,-1.3784,0.63258,0.000353553,0.000148492,,
2,Experiment_1,1.0,2019-11-06 05:01:10,-1.3779,0.63306,6000.0,60.0,2.0,-1.3779,0.63306,-1.37807,0.63281,-1.3784,0.63258,0.000288675,0.000240624,8.33333e-06,4.5e-06
3,Experiment_1,1.0,2019-11-06 05:01:40,-1.3781,0.63305,6000.0,90.0,3.0,-1.3779,0.63306,-1.37807,0.63287,-1.3784,0.63258,0.000236291,0.000230217,3e-06,4.2e-06
4,Experiment_1,1.0,2019-11-06 05:02:10,-1.3779,0.63252,6000.0,120.0,4.0,-1.3779,0.63306,-1.37804,0.6328,-1.3784,0.63252,0.000219089,0.000253476,2.66667e-06,-2.33333e-07


In [20]:
# Case 2d
# Rename column 'STD(recordings.device_1)' --> 'device_1_std_renamed'
df_transform_ft_2d = df_transform_ft_1.copy()
df_transform_ft_2d = df_transform_ft_2d.rename(columns={"STD(recordings.device_1)": "renamed_column"})
# df_transform_ft_2d = df_transform_ft_2d.rename(columns={"device_1": "renamed_column"})
# df_transform_ft_2d = df_transform_ft_2d.rename(columns={"MAX(recordings.device_2)": "renamed_column"})
df_transform_ft_2d.head()

Unnamed: 0,Notebook_ID,Stage_of_Operation,Timestamp,device_1,device_2,time_until_overpressure,experiment_time,index_column,MAX(recordings.device_1),MAX(recordings.device_2),MEAN(recordings.device_1),MEAN(recordings.device_2),MIN(recordings.device_1),MIN(recordings.device_2),renamed_column,STD(recordings.device_2),"TREND(recordings.device_1, Timestamp)","TREND(recordings.device_2, Timestamp)"
0,Experiment_1,1.0,2019-11-06 05:00:10,-1.3784,0.63279,6000.0,0.0,0.0,-1.3784,0.63279,-1.3784,0.63279,-1.3784,0.63279,,,,
1,Experiment_1,1.0,2019-11-06 05:00:40,-1.3779,0.63258,6000.0,30.0,1.0,-1.3779,0.63279,-1.37815,0.632685,-1.3784,0.63258,0.000353553,0.000148492,,
2,Experiment_1,1.0,2019-11-06 05:01:10,-1.3779,0.63306,6000.0,60.0,2.0,-1.3779,0.63306,-1.37807,0.63281,-1.3784,0.63258,0.000288675,0.000240624,8.33333e-06,4.5e-06
3,Experiment_1,1.0,2019-11-06 05:01:40,-1.3781,0.63305,6000.0,90.0,3.0,-1.3779,0.63306,-1.37807,0.63287,-1.3784,0.63258,0.000236291,0.000230217,3e-06,4.2e-06
4,Experiment_1,1.0,2019-11-06 05:02:10,-1.3779,0.63252,6000.0,120.0,4.0,-1.3779,0.63306,-1.37804,0.6328,-1.3784,0.63252,0.000219089,0.000253476,2.66667e-06,-2.33333e-07


In [21]:
fm, features = run_ft(df_transform_ft_2d, columns_to_engineer_2d, primitives_to_engineer, primitives_depth)
df_transform_ft_2d = merge_features_back_onto_df(df_transform_ft_2d, fm)
df_transform_ft_2d.head()

Built 0 features


  agg_primitives: ['trend']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible variable types for the primitive were found in the data.


AssertionError: features must be a non-empty list of features