In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from tpot import TPOTClassifier

# Helper functions

In [2]:
def create_train_test(df, nsplits, subject_column, label_column):
    '''
    Creates a train, test split. Each subject is in one of the sets with all his/her data.
    Takes a df, number of splits, names of the subject and label column.
    Returns 2 dataframes with train and test set.
    '''
    X = df.drop(columns=[subject_column, label_column])
    y = df[[label_column]]
    groups = df[subject_column]
    
    gkf = GroupKFold(n_splits=nsplits)
    train_idx, test_idx = next(gkf.split(X, y, groups))

    X_train = X.iloc[train_idx.tolist(), :]
    y_train = y.iloc[train_idx.tolist(), :]
    groups_train = groups.iloc[train_idx.tolist()]
    
    X_test = X.iloc[test_idx.tolist(), :]
    y_test = y.iloc[test_idx.tolist(), :]
    groups_test = groups.iloc[test_idx.tolist()]
    
    res_train = pd.concat([X_train, groups_train, y_train], axis=1)
    res_test = pd.concat([X_test, groups_test, y_test], axis=1)
    
    return res_train, res_test

In [3]:
def create_train_val_test(df, nsplits_val, nplsits_test, subject_column, label_column):
    '''
    Creates a train, val, test split. Each subject is in one of the sets with all his/her data.
    Takes a df, number of splits for train/val split, number of splits for train+val/test split,
    names of the subject and label column.
    Returns 3 dataframes with train, val, and test set.
    '''
    # get train_temp and test
    res_train_temp, res_test = create_train_test(df, nplsits_test, subject_column, label_column)
    
    # get train and val from train_temp
    res_train, res_val = create_train_test(res_train_temp, nsplits_val, subject_column, label_column)
    
    return res_train, res_val, res_test

In [4]:
def split_df(df, subject_column, label_column):
    '''
    Takes a df and splits it into three dfs, containing the features, labels, and groups.
    '''
    X = df.drop(columns=['subject', 'label'])
    y = df[['label']]
    groups = df['subject']
    return X, y, groups

In [5]:
def remove_correlated_features(df, correlation):
    '''
    Takes a df and removed the correlated features with a correlation value equal or higher to the given value.
    Returns the resulting df and a list of the retained features.
    '''
    cor = df.corr(numeric_only = True)
    keep_columns = np.full(cor.shape[0], True)
    for i in range(cor.shape[0] - 1):
        for j in range(i + 1, cor.shape[0] - 1):
            if (np.abs(cor.iloc[i, j]) >= 0.8):
                keep_columns[j] = False
    selected_columns = df.columns[keep_columns]
    df_reduced = df[selected_columns]
    
    return df_reduced, selected_columns

# Read data

In [6]:
df = pd.read_parquet('data-input/flirt-60-1.parquet')

In [7]:
df

Unnamed: 0,x_mean,x_std,x_min,x_max,x_ptp,x_sum,x_energy,x_skewness,x_kurtosis,x_peaks,...,l2_n_sign_changes,l2_iqr,l2_iqr_5_95,l2_pct_5,l2_pct_95,l2_entropy,l2_perm_entropy,l2_svd_entropy,subject,label
0,-39.333854,21.911315,-119.0,41.0,160.0,-75521.0,3892335.0,0.667670,1.122428,480,...,0,6.716873,38.096548,49.909418,88.005966,7.546237,0.999670,0.456264,15,0
1,-39.522917,21.929207,-119.0,41.0,160.0,-75884.0,3922466.0,0.690015,1.133307,485,...,0,6.711957,38.096548,49.909418,88.005966,7.546232,0.999489,0.456358,15,0
2,-39.884375,21.849108,-119.0,41.0,160.0,-76578.0,3970842.0,0.734060,1.226828,490,...,0,6.757474,37.567903,50.438062,88.005966,7.546386,0.999269,0.455761,15,0
3,-40.117188,21.847705,-119.0,41.0,160.0,-77025.0,4006485.0,0.754623,1.275292,490,...,0,6.441981,36.429907,50.546513,86.976421,7.546820,0.999406,0.455364,15,0
4,-40.265625,21.854092,-119.0,41.0,160.0,-77310.0,4029930.0,0.772855,1.290254,494,...,0,6.300316,35.156783,50.943094,86.099877,7.547107,0.999406,0.452354,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640,59.100629,0.877353,56.0,64.0,8.0,9397.0,555491.0,1.088573,8.819387,17,...,0,0.261592,1.994493,61.926548,63.921042,5.068814,0.904938,0.083176,S9,1
641,59.125984,0.980054,56.0,64.0,8.0,7509.0,444099.0,0.900382,6.406320,17,...,0,0.535871,2.331194,61.762448,64.093642,4.844074,0.957356,0.091233,S9,1
642,59.200000,1.110713,56.0,64.0,8.0,5624.0,333058.0,0.660031,4.386405,15,...,0,0.946680,2.588040,61.552431,64.140471,4.553731,0.978071,0.101184,S9,1
643,59.444444,1.192274,56.0,64.0,8.0,3745.0,222709.0,0.496950,4.219449,10,...,0,0.956527,2.421021,61.943523,64.364543,4.142958,0.979869,0.109214,S9,1


# Split data

In [8]:
# remove fixed columns (see EDA)
df = df.drop(columns=['l2_n_sign_changes', 'x_entropy', 'y_entropy', 'z_entropy'])

In [9]:
# split into train and test
df_train, df_test = create_train_test(df, 5, 'subject', 'label')

X_train, y_train, groups_train = split_df(df_train, 'subject', 'label')
X_test, y_test, groups_test = split_df(df_test, 'subject', 'label')

# remove correlated features from train
X_train, selected_features = remove_correlated_features(X_train, 0.8)

# remove the same columns from test
X_test = X_test[selected_features]

In [10]:
# Check train and test set sizes
print('Percentage train set:', len(y_train)/(len(y_train)+len(y_test)))
print('Percentage test set:', len(y_test)/(len(y_train)+len(y_test)))

print('\nClass distribution in train set: \n', y_train['label'].value_counts(normalize=True), '\n')

print('Class distribution in test set: \n', y_test['label'].value_counts(normalize=True), '\n')

Percentage train set: 0.7986583333050424
Percentage test set: 0.20134166669495754

Class distribution in train set: 
 1    0.795827
0    0.204173
Name: label, dtype: float64 

Class distribution in test set: 
 1    0.789097
0    0.210903
Name: label, dtype: float64 



# TPOT

In [11]:
tpot = TPOTClassifier(generations=5,
                      population_size=5,
                      scoring='f1',
                      cv=5,
                      n_jobs=-1,
                      verbosity=3,
                      random_state=0)

In [12]:
%%time

tpot.fit(X_train, y_train.values.ravel(), groups=groups_train.values.ravel())
print(f"Tpop score on test data: {tpot.score(X_test, y_test.values.ravel()):.2f}")

32 operators have been imported by TPOT.


  from pandas import MultiIndex, Int64Index


Optimization Progress:   0%|          | 0/30 [00:00<?, ?pipeline/s]

Skipped pipeline #9 due to time out. Continuing to the next pipeline.
Skipped pipeline #11 due to time out. Continuing to the next pipeline.

Generation 1 - Current Pareto front scores:

-1	0.8863019858646887	LogisticRegression(input_matrix, LogisticRegression__C=0.0001, LogisticRegression__dual=False, LogisticRegression__penalty=l2)

-2	0.8863072582622928	LogisticRegression(Binarizer(input_matrix, Binarizer__threshold=0.30000000000000004), LogisticRegression__C=0.0001, LogisticRegression__dual=False, LogisticRegression__penalty=l2)
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.
Skipped pipeline #14 due to time out. Continuing to the next pipeline.

Generation 2 - Current Pareto front scores:

-1	0.8863019858646887	LogisticRegression(input_matrix, LogisticRegression__C=0.0001, LogisticRegression__dual=False, LogisticRegression__penalty=l2)

-2	0.8863072582622928	LogisticRegression(Binarizer(input_ma

In [13]:
print(f"TPOT score (F1) on test data: {tpot.score(X_test, y_test.values.ravel()):.2f}")

TPOT score (F1) on test data: 0.88


In [18]:
tpot.export('tpot_5_5_5_flirt_60_1.py')

# Interpretation

I created a TPOT baseline model in this notebook.

The data used was created with ```FLIRT``` with a ```window_size``` of ```60``` and a step size of ```1```.

The data of each user is either in the train or in the test set. Internally, TPOT does a cross-validation with the training data, and again only uses the data of each user in either training or validation set.

Hyperparameters, i.e., aspects that could be changed in future iterations:
* Calculating the features with another library instead of FLIRT
* Using different window_sizes and step sizes.

The results so far are very promising: TPOT returned a pipeline with a Logistic Regression and the performance on the test set is F1=0.88.