<h1> Manual Feature Extraction with Tsfresh </h1>

<h2> 1. Load data </h2>

In [31]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from scipy.signal import decimate
from tsfresh import extract_features
from tsfresh.feature_selection import select_features
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction import MinimalFCParameters
import xgboost
from xgboost import XGBClassifier

In [32]:
class DataProcessor:
    def __init__(self, input_path, file_names):
        self.input_path = input_path
        self.file_names = file_names
        
    def read_files(self):
        self.data = {}
        print("Reading files...")
        for file in self.file_names:
            with open(self.input_path + file + '.txt', 'r') as f:
                self.data[file] = pd.read_csv(f, header=None, sep='\t')
        return self.data
    
    def print_shape(self):
        print("Files read:")
        for file in self.data:
            print(f"{file}: {self.data[file].shape}")
            
    def create_target_df(self):
        target_columns = ['Cooler_Condition', 'Valve_Condition', 
                        'Internal_Pump_Leakage', 'Hydraulic_Accumulator', 
                        'Stable_Flag']
        self.data['target'].columns = target_columns
        self.valve_condition = self.data['target']['Valve_Condition']
        #del self.data['target']
        return self.valve_condition

def process_data():
    input_path = "input_data/"
    file_names = [
        "ce", "cp", "eps1", "se", "vs1", 
        "fs1", "fs2", 
        "ps1", "ps2", "ps3", "ps4", "ps5", "ps6",
        "ts1", "ts2", "ts3", "ts4", "target"
    ]
    
    processor = DataProcessor(input_path, file_names)
    data = processor.read_files()
    processor.print_shape()
    df_target = processor.create_target_df()
    df_target = processor.valve_condition
    return data, df_target

data, df_target = process_data()

Reading files...
Files read:
ce: (2205, 60)
cp: (2205, 60)
eps1: (2205, 6000)
se: (2205, 60)
vs1: (2205, 60)
fs1: (2205, 600)
fs2: (2205, 600)
ps1: (2205, 6000)
ps2: (2205, 6000)
ps3: (2205, 6000)
ps4: (2205, 6000)
ps5: (2205, 6000)
ps6: (2205, 6000)
ts1: (2205, 60)
ts2: (2205, 60)
ts3: (2205, 60)
ts4: (2205, 60)
target: (2205, 5)


<h2> 2. Signal Preprocessing </h2>

<h3> Input data </h3>

Steps:

<ul>
    <li>If the signal frequency is > 1 Hz, the signal gets downsampled to 1 Hz </li>
    <li>Downsampled signals are stored in a new dictionary</li>
    <li>An ID column gets added to the downsampled signals</li>
    <li>The downsampled signals are concatenated in one dataframe</li>
</ul>

In [33]:
df_list = ['ps3', 'eps1']

df_downsampled = {}

for df in df_list:
    filtered_signals = []  # Reset for each DataFrame
    if data[df].shape[1] == 6000:
        downsample_factor = 100
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            filtered_signal = decimate(row, downsample_factor, ftype='fir')  # Downsample
            filtered_signals.append(filtered_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_downsampled[df] = pd.DataFrame(filtered_signals)
        df_downsampled[df]["id"] = df_downsampled[df].index

    elif data[df].shape[1] == 600:
        downsample_factor = 10
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            filtered_signal = decimate(row, downsample_factor, ftype='fir')  # Downsample
            filtered_signals.append(filtered_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_downsampled[df] = pd.DataFrame(filtered_signals)
        df_downsampled[df]["id"] = df_downsampled[df].index

    else:
        df_downsampled[df] = data[df]
        df_downsampled[df]["id"] = df_downsampled[df].index
        
        
for i in df_downsampled.keys():
    print(f"shape of {i}: {df_downsampled[i].shape}")
    
# Combine all DataFrames
df_combined = pd.concat([df_downsampled[df] for df in df_list], ignore_index=True)

df_combined


shape of ps3: (2205, 61)
shape of eps1: (2205, 61)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,id
0,0.181910,0.013952,-0.022135,0.030688,-0.053816,0.079900,-0.129521,0.205582,-0.388058,1.287525,...,2.295209,2.331077,2.314609,2.331772,2.344258,2.274758,2.365340,2.164389,2.467429,0
1,0.179051,0.013746,-0.022035,0.030437,-0.053478,0.079484,-0.128867,0.204474,-0.385877,1.277397,...,2.265444,2.286975,2.338045,2.303720,2.327984,2.248921,2.348491,2.192770,2.486595,1
2,0.170626,0.013492,-0.021850,0.030302,-0.053269,0.078988,-0.127972,0.203145,-0.384168,1.282277,...,2.316311,2.311031,2.280636,2.286729,2.301869,2.259966,2.346658,2.208926,2.500617,2
3,0.180196,0.013913,-0.022369,0.031245,-0.054976,0.081985,-0.132920,0.211098,-0.396899,1.301068,...,2.240574,2.255356,2.253251,2.286907,2.258094,2.221323,2.304355,2.134268,2.424769,3
4,0.170937,0.013715,-0.022236,0.031246,-0.054744,0.081630,-0.132241,0.210173,-0.396245,1.313553,...,2.257005,2.252278,2.216802,2.261641,2.255471,2.224936,2.311261,2.131310,2.413929,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,1234.934906,3076.867147,2859.827368,3009.134788,2913.552540,2978.420808,2929.005675,2970.437712,2928.859439,2989.468366,...,2420.803885,2412.767902,2425.717742,2400.605049,2442.957722,2376.234975,2481.914084,2309.885416,2627.510636,2200
4406,1234.772948,3074.417377,2856.152473,3005.782318,2909.340225,2975.922563,2927.147952,2969.091887,2925.779865,2987.212389,...,2421.463321,2412.626815,2427.166514,2402.269938,2443.010993,2376.712279,2482.112030,2310.765586,2628.666642,2201
4407,1233.976886,3073.982803,2856.287302,3004.786072,2908.614940,2975.053319,2926.890941,2967.192549,2924.768865,2985.296063,...,2420.301023,2413.420808,2427.152135,2401.224877,2443.268488,2376.058788,2481.534859,2311.079525,2627.836426,2202
4408,1234.061432,3073.819344,2856.631926,3005.322384,2910.016808,2977.294824,2928.233888,2969.053481,2925.984835,2987.626099,...,2421.084618,2414.223594,2428.264113,2402.747634,2444.599235,2377.097999,2483.039476,2311.033054,2628.961053,2203


In [34]:
# from scipy.signal import resample_poly, decimate
# df_list = ['se', 'fs1', 'ps3']

# df_resampled = {}

# for df in df_list:
#     resampled_signals = [] 
#     if data[df].shape[1] == 6000:
#         downsample_factor = 10
#         for i in range(data[df].shape[0]):
#             row = data[df].iloc[i].values  # Extract row as a 1D array
#             resampled_signal= decimate(row, downsample_factor, ftype='fir')  # Downsample
#             resampled_signals.append(resampled_signal)  # Store the result
#         # Create a new DataFrame with the filtered signals and add the 'id' column
#         df_resampled[df] = pd.DataFrame(resampled_signals)
#         df_resampled[df]["id"] = df_resampled[df].index

#     elif data[df].shape[1] == 60:
#         target_rate = 10
#         up_factor =  10
#         for i in range(data[df].shape[0]):
#             row = data[df].iloc[i].values  # Extract row as a 1D array
#             resampled_signal = resample_poly(row, up=up_factor, down=1)
#             resampled_signals.append(resampled_signal)  # Store the result
#         # Create a new DataFrame with the filtered signals and add the 'id' column
#         df_resampled[df] = pd.DataFrame(resampled_signals)
#         df_resampled[df]["id"] = df_resampled[df].index

#     else:
#         df_resampled[df] = data[df]
#         df_resampled[df]["id"] = df_resampled[df].index

# # Combine all DataFrames

# for i in df_resampled.keys():
#     print(f"shape of {i}: {df_resampled[i].shape}")
    
# df_combined = pd.concat([df_resampled[df] for df in df_list], ignore_index=True)

# df_combined


<h3> Transform Input Data into long format </h3>

Since Tsfresh needs the input data in the long format, we transform our input DataFrame

In [35]:
df_combined_long = pd.melt(df_combined, id_vars=['id'], var_name='time', value_name='value')

<h3>Target Data</h3>

Steps:

<li>Encode Target labels</li>

In [36]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df_target)

<h2>3. Extract Features </h2>

In [None]:

# Verwende minimierte Einstellungen
settings = MinimalFCParameters()

# Feature-Extraktion
extracted_features = extract_features(df_combined_long, 
                                      column_id="id", 
                                      column_sort="time", 
                                      default_fc_parameters=settings)


Feature Extraction: 100%|██████████| 30/30 [00:04<00:00,  6.77it/s]


In [38]:
# clean the features
features = extracted_features.dropna(axis=1)

features.replace([np.inf, -np.inf], np.nan, inplace=True)
features = features.dropna(how = "all", axis= "columns")

# Feature-Selektion basierend auf Zielwerten
features = select_features(features, y=y_encoded)

selector = VarianceThreshold()
features = selector.fit_transform(features)

In [39]:
# check shape of feature and target
if features.shape[0] == y_encoded.shape[0]:
    print("Data is ready for Modelling!")
    print(f"Shape features: {features.shape}")
    print(f"Shape target: {y_encoded.shape}")
else:
    print("Shape of the Inputs and target don't match. Please check preprocesing steps")

Data is ready for Modelling!
Shape features: (2205, 6)
Shape target: (2205,)


<h2>4. Making predictions </h2>

In [40]:
states = [27, 6728, 49122]
features = features
target = y_encoded

<h3>4.1. AdaBoost Classifier </h3>

In [41]:
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
    )
    
    model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),
    n_estimators=50
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")



Random State: 27
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        72
           1       0.99      0.99      0.99        72
           2       0.99      1.00      0.99        72
           3       1.00      1.00      1.00       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441





Random State: 6728
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        72
           2       0.97      0.97      0.97        72
           3       0.99      0.99      0.99       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441





Random State: 49122
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.99      0.99        72
           2       0.96      1.00      0.98        72
           3       1.00      0.99      0.99       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Mean Accuracy: 0.9917
Std Accuracy: 0.0011


<h3> 4.2. Support Vector Machines </h3>

In [42]:
# accs = []
# for RANDOM_STATE in states:
#     X_train, X_test, y_train, y_test = train_test_split(
#         features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
#     )
    
#     clf = svm.SVC(kernel='linear')
    
#     clf.fit(X_train, y_train)
#     preds = model.predict(X_test)
#     accs.append(accuracy_score(y_test, preds))
#     print(f"Random State: {RANDOM_STATE}")
#     print(classification_report(y_test, preds, zero_division=0.0))

# accs_mean = round(np.mean(accs), 4)
# accs_std = round(np.std(accs), 4)

# print(f"Mean Accuracy: {accs_mean}")
# print(f"Std Accuracy: {accs_std}")

<h3>4.3. K-Nearest Neighbours</h3>

In [43]:
accs = []
for RANDOM_STATE in states:
 
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )
    
    # Standardise features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    
    preds = knn.predict(X_test)  # Korrigiert von `model.predict` zu `knn.predict`
    accs.append(accuracy_score(y_test, preds))
    
    # Ergebnisse ausgeben
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))


accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

Random State: 27
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.97      0.99        72
           2       0.96      1.00      0.98        72
           3       1.00      0.99      1.00       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Random State: 6728
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        72
           2       0.97      0.99      0.98        72
           3       1.00      0.99      0.99       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Random State: 49122
              precision    recall  f1-score   support

           0       

In [44]:
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )


    xgb_clf = XGBClassifier(n_estimators = 50,
                            learning_rate = 0.05,
                            use_label_encoder = False,
                            eval_metric = "logloss",
                            n_jobs = -1)
    xgb_clf.fit(X_train, y_train)

    preds = xgb_clf.predict(X_test)

    accs.append(accuracy_score(y_test, preds))

    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random State: 27
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.99      0.99        72
           2       0.99      1.00      0.99        72
           3       1.00      1.00      1.00       225

    accuracy                           1.00       441
   macro avg       0.99      1.00      0.99       441
weighted avg       1.00      1.00      1.00       441

Random State: 6728
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        72
           2       0.97      0.99      0.98        72
           3       1.00      0.99      0.99       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Random State: 49122
              precision    recall  f1-score   support

           0       

Parameters: { "use_label_encoder" } are not used.

