<h1> Manual Feature Extraction with Tsfresh </h1>

<h2> 1. Load data </h2>

In [None]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from scipy.signal import decimate
from tsfresh import extract_features
from tsfresh.feature_selection import select_features

In [1]:
class DataProcessor:
    def __init__(self, input_path, file_names):
        self.input_path = input_path
        self.file_names = file_names
        
    def read_files(self):
        self.data = {}
        print("Reading files...")
        for file in self.file_names:
            with open(self.input_path + file + '.txt', 'r') as f:
                self.data[file] = pd.read_csv(f, header=None, sep='\t')
        return self.data
    
    def print_shape(self):
        print("Files read:")
        for file in self.data:
            print(f"{file}: {self.data[file].shape}")
            
    def create_target_df(self):
        target_columns = ['Cooler_Condition', 'Valve_Condition', 
                        'Internal_Pump_Leakage', 'Hydraulic_Accumulator', 
                        'Stable_Flag']
        self.data['target'].columns = target_columns
        self.valve_condition = self.data['target']['Valve_Condition']
        #del self.data['target']
        return self.valve_condition

def process_data():
    input_path = "input_data/"
    file_names = [
        "ce", "cp", "eps1", "se", "vs1", 
        "fs1", "fs2", 
        "ps1", "ps2", "ps3", "ps4", "ps5", "ps6",
        "ts1", "ts2", "ts3", "ts4", "target"
    ]
    
    processor = DataProcessor(input_path, file_names)
    data = processor.read_files()
    processor.print_shape()
    df_target = processor.create_target_df()
    df_target = processor.valve_condition
    return data, df_target

data, df_target = process_data()

Reading files...
Files read:
ce: (2205, 60)
cp: (2205, 60)
eps1: (2205, 6000)
se: (2205, 60)
vs1: (2205, 60)
fs1: (2205, 600)
fs2: (2205, 600)
ps1: (2205, 6000)
ps2: (2205, 6000)
ps3: (2205, 6000)
ps4: (2205, 6000)
ps5: (2205, 6000)
ps6: (2205, 6000)
ts1: (2205, 60)
ts2: (2205, 60)
ts3: (2205, 60)
ts4: (2205, 60)
target: (2205, 5)


<h2> 2. Signal Preprocessing </h2>

<h3> Input data </h3>

Steps:

<ul>
    <li>If the signal frequency is > 1 Hz, the signal gets downsampled to 1 Hz </li>
    <li>Downsampled signals are stored in a new dictionary</li>
    <li>An ID column gets added to the downsampled signals</li>
    <li>The downsampled signals are concatenated in one dataframe</li>
</ul>

In [2]:
df_list = ['se', 'fs1', 'ps3']

df_downsampled = {}

for df in df_list:
    filtered_signals = []  # Reset for each DataFrame
    if data[df].shape[1] == 6000:
        downsample_factor = 100
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            filtered_signal = decimate(row, downsample_factor, ftype='fir')  # Downsample
            filtered_signals.append(filtered_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_downsampled[df] = pd.DataFrame(filtered_signals)
        df_downsampled[df]["id"] = df_downsampled[df].index

    elif data[df].shape[1] == 600:
        downsample_factor = 10
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            filtered_signal = decimate(row, downsample_factor, ftype='fir')  # Downsample
            filtered_signals.append(filtered_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_downsampled[df] = pd.DataFrame(filtered_signals)
        df_downsampled[df]["id"] = df_downsampled[df].index

    else:
        df_downsampled[df] = data[df]
        df_downsampled[df]["id"] = df_downsampled[df].index

# Combine all DataFrames
df_combined = pd.concat([df_downsampled[df] for df in df_list], ignore_index=True)

df_combined


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,id
0,68.039000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,69.141000,...,68.101000,68.101000,68.420000,68.420000,68.223000,68.223000,68.159000,68.159000,68.264000,0
1,68.264000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,65.715000,...,68.536000,68.536000,68.465000,68.465000,68.491000,68.491000,68.528000,68.528000,68.595000,1
2,68.595000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,67.320000,...,68.901000,68.901000,68.805000,68.805000,68.456000,68.456000,68.758000,68.758000,68.628000,2
3,68.628000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,68.991000,...,68.860000,68.860000,68.946000,68.946000,69.021000,69.021000,68.851000,68.851000,68.868000,3
4,68.868000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,59.523000,...,68.483000,68.483000,68.819000,68.819000,68.862000,68.862000,69.036000,69.036000,68.972000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6610,0.172926,0.011496,-0.018428,0.024396,-0.043644,0.063811,-0.104500,0.168088,-0.308903,0.980114,...,2.317221,2.344719,2.342248,2.311307,2.359219,2.300111,2.390136,2.225378,2.583649,2200
6611,0.174585,0.011612,-0.018216,0.023935,-0.042724,0.062371,-0.102110,0.162070,-0.301379,0.960469,...,2.351193,2.298340,2.330884,2.312942,2.351669,2.291068,2.402735,2.230432,2.545339,2201
6612,0.170944,0.011664,-0.018810,0.025126,-0.044919,0.065827,-0.107497,0.169427,-0.318937,1.016304,...,2.301769,2.340973,2.351554,2.324610,2.339859,2.286507,2.403170,2.205626,2.521674,2202
6613,0.173025,0.011461,-0.018365,0.024314,-0.043512,0.063590,-0.104051,0.163943,-0.308924,0.981163,...,2.350127,2.354437,2.351759,2.355719,2.375095,2.312541,2.435032,2.259279,2.562707,2203


<h3> Transform Input Data into long format </h3>

Since Tsfresh needs the input data in the long format, we transform our input DataFrame

In [3]:
df_combined_long = pd.melt(df_combined, id_vars=['id'], var_name='time', value_name='value')

<h3>Target Data</h3>

Steps:

<li>Encode Target labels</li>

In [4]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df_target)

<h2>3. Extract Features </h2>

In [5]:
features = extract_features(df_combined_long, column_id="id", column_sort="time")   

Feature Extraction: 100%|██████████| 30/30 [02:20<00:00,  4.70s/it]


In [7]:
# clean the features
features = features.dropna(axis=1)

features.replace([np.inf, -np.inf], np.nan, inplace=True)
features = features.dropna(how = "all", axis= "columns")

# Feature-Selektion basierend auf Zielwerten
features = select_features(features, y=y_encoded)

selector = VarianceThreshold()
features = selector.fit_transform(features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.replace([np.inf, -np.inf], np.nan, inplace=True)


In [9]:
# check shape of feature and target
if features.shape[0] == y_encoded.shape[0]:
    print("Data is ready for Modelling!")
    print(f"Shape features: {features.shape}")
    print(f"Shape target: {y_encoded.shape}")
else:
    print("Shape of the Inputs and target don't match. Please check preprocesing steps")

Data is ready for Modelling!
Shape features: (2205, 683)
Shape target: (2205,)


<h2>4. Making predictions </h2>

In [10]:
states = [27, 6728, 49122]
accs = []
features = features
target = y_encoded

<h3>4.1. AdaBoost Classifier </h3>

In [None]:
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
    )
    
    model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=2),
    n_estimators=30
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

<h3> 4.2. Support Vector Machines </h3>

In [None]:
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
    )
    
    clf = svm.SVC(kernel='linear')
    
    clf.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

<h3>4.3. K-Nearest Neighbours</h3>

In [None]:
for RANDOM_STATE in states:
 
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )
    
    # Standardise features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    
    preds = knn.predict(X_test)  # Korrigiert von `model.predict` zu `knn.predict`
    accs.append(accuracy_score(y_test, preds))
    
    # Ergebnisse ausgeben
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))


accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")