In [3]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from scipy.signal import decimate
from tsfresh import extract_features
from tsfresh.feature_selection import select_features

In [4]:
class DataProcessor:
    def __init__(self, input_path, file_names):
        self.input_path = input_path
        self.file_names = file_names
        
    def read_files(self):
        self.data = {}
        print("Reading files...")
        for file in self.file_names:
            with open(self.input_path + file + '.txt', 'r') as f:
                self.data[file] = pd.read_csv(f, header=None, sep='\t')
        return self.data
    
    def print_shape(self):
        print("Files read:")
        for file in self.data:
            print(f"{file}: {self.data[file].shape}")
            
    def create_target_df(self):
        target_columns = ['Cooler_Condition', 'Valve_Condition', 
                        'Internal_Pump_Leakage', 'Hydraulic_Accumulator', 
                        'Stable_Flag']
        self.data['target'].columns = target_columns
        self.valve_condition = self.data['target']['Valve_Condition']
        #del self.data['target']
        return self.valve_condition

def process_data():
    input_path = "input_data/"
    file_names = [
        "ce", "cp", "eps1", "se", "vs1", 
        "fs1", "fs2", 
        "ps1", "ps2", "ps3", "ps4", "ps5", "ps6",
        "ts1", "ts2", "ts3", "ts4", "target"
    ]
    
    processor = DataProcessor(input_path, file_names)
    data = processor.read_files()
    processor.print_shape()
    df_target = processor.create_target_df()
    df_target = processor.valve_condition
    return data, df_target

data, df_target = process_data()

Reading files...
Files read:
ce: (2205, 60)
cp: (2205, 60)
eps1: (2205, 6000)
se: (2205, 60)
vs1: (2205, 60)
fs1: (2205, 600)
fs2: (2205, 600)
ps1: (2205, 6000)
ps2: (2205, 6000)
ps3: (2205, 6000)
ps4: (2205, 6000)
ps5: (2205, 6000)
ps6: (2205, 6000)
ts1: (2205, 60)
ts2: (2205, 60)
ts3: (2205, 60)
ts4: (2205, 60)
target: (2205, 5)


In [40]:
df = data['ps3']
# df["id"] = df.index
# df

In [76]:

# Fenstergröße und Überlappung festlegen
window_size = 10  # alle 10 Spalten sind ein Fenster
overlap = 5  # 3 Spalten Überlappung

window_df = df

# Liste für die Fenster
windows = []

# Schleife zur Fenstererstellung mit Überlappung
for i, start_col in enumerate(range(0, window_df.shape[1] - window_size + 1, window_size - overlap)):
    # Bestimme das Ende des Fensters
    end_col = start_col + window_size
    
    # Wähle das Fenster aus den Daten
    window = window_df.iloc[:, start_col:end_col]
    
    # Füge die 'time'-Spalte hinzu
    # window["id"] = window.index
    # Berechne die Standardabweichung pro Zeile
    std_values = window.std(axis=1)
    
    # Erstelle ein DataFrame mit dem Fensterindex
    std_df = pd.DataFrame(std_values, columns=[f"{i}"])
    # Fenster zur Liste hinzufügen
    windows.append(std_df)

# Ergebnis
print(f"Anzahl der Fenster: {len(windows)}")


Anzahl der Fenster: 1199


In [77]:
features = pd.concat(windows, axis=1)

In [78]:
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1189,1190,1191,1192,1193,1194,1195,1196,1197,1198
0,1.183231,1.191174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028612,0.032029,0.045295,0.049176,0.043497,0.041939,0.042768,0.037792,0.052332,0.061796
1,1.172262,1.159251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028135,0.034616,0.104292,0.084125,0.064946,0.042282,0.058108,0.074052,0.077407,0.034804
2,1.134758,1.127452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.057436,0.038277,0.054146,0.041939,0.036679,0.055227,0.074474,0.119132,0.027138,0.043690
3,1.177085,1.175140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.059470,0.050582,0.027993,0.051727,0.056718,0.068255,0.037717,0.043574,0.057998,0.054928
4,1.115662,1.110105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.056010,0.060830,0.043298,0.042469,0.065685,0.040860,0.030219,0.037721,0.037142,0.042021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2200,1.068008,1.006954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.057866,0.070909,0.051021,0.036038,0.028719,0.058295,0.057426,0.054210,0.029938,0.045709
2201,1.068807,1.057290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.057277,0.037770,0.044524,0.069473,0.062676,0.068056,0.061458,0.057130,0.059296,0.067184
2202,1.074502,1.008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.036059,0.045891,0.029509,0.020592,0.047467,0.057650,0.060595,0.049261,0.039230,0.022818
2203,1.097106,1.009321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.065583,0.052559,0.046229,0.040399,0.047477,0.070323,0.038768,0.039497,0.029661,0.032094


In [73]:
# Spaltennamen des ersten Fensters extrahieren
reference_columns = windows[0].columns.tolist()

# Angleichung der Spaltennamen aller Fenster an die des ersten Fensters
for i in range(1, len(windows)):
    windows[i].columns = reference_columns

In [20]:
windows_long = []
for window in windows:
    window_long = pd.melt(window, id_vars=['id'], var_name='time', value_name='value')
    windows_long.append(window_long)

In [None]:
windows_long[0]

In [None]:
feature_list = []
for window in windows_long:
    features = extract_features(
        window,
        column_id="id",        # Zeitreihen-ID
        column_sort="time",    # Zeitstempel # Sensor-Typ
        column_value="value"   # Wert
    )
    feature_list.append(features)

In [None]:
features = feature_list[1]

In [4]:
row_std = df.std(axis=1)

In [None]:
df_list = ['se', 'fs1', 'ps3']

df_downsampled = {}

for df in df_list:
    filtered_signals = []  # Reset for each DataFrame
    if data[df].shape[1] == 6000:
        downsample_factor = 100
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            filtered_signal = decimate(row, downsample_factor, ftype='fir')  # Downsample
            filtered_signals.append(filtered_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_downsampled[df] = pd.DataFrame(filtered_signals)
        df_downsampled[df]["id"] = df_downsampled[df].index

    elif data[df].shape[1] == 600:
        downsample_factor = 10
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            filtered_signal = decimate(row, downsample_factor, ftype='fir')  # Downsample
            filtered_signals.append(filtered_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_downsampled[df] = pd.DataFrame(filtered_signals)
        df_downsampled[df]["id"] = df_downsampled[df].index

    else:
        df_downsampled[df] = data[df]
        df_downsampled[df]["id"] = df_downsampled[df].index

# Combine all DataFrames
df_combined = pd.concat([df_downsampled[df] for df in df_list], ignore_index=True)

df_combined

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 4), dpi=100)
axes = axes.flatten()

for idx, s in enumerate(df_list):
    ax = axes[idx]
    df = df_downsampled[s].drop(columns=['id'])  # Adjusted to use `s` and drop the column correctly
    
    for i in range(df.shape[0] - 1):
        ax.plot(df.iloc[i], color='blue', linewidth=0.5, label=f'Series {i}' if i == 0 else "")  # Add label only once
    
    ax.set_title(s)
    ax.set_xlabel("Time")
    ax.set_ylabel("Test")

    ax.legend(loc='best', fontsize='small')  # Legend specific to each subplot

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Fenstergröße und Überlappung festlegen
window_size = 10  # alle 10 Spalten sind ein Fenster
overlap = 5  # 3 Spalten Überlappung

# Entferne 'id' und setze die Zielspalte 'id'
window_df = df_combined.drop(columns=['id'])
id = df_combined["id"]

# Liste für die Fenster
windows = []

# Schleife zur Fenstererstellung mit Überlappung
for start_col in range(0, window_df.shape[1] - window_size + 1, window_size - overlap):
    # Bestimme das Ende des Fensters
    end_col = start_col + window_size
    
    # Wähle das Fenster aus den Daten
    window = window_df.iloc[:, start_col:end_col]
    
    # Füge die 'time'-Spalte hinzu
    window["id"] = id
    
    # Fenster zur Liste hinzufügen
    windows.append(window)

# Ergebnis
print(f"Anzahl der Fenster: {len(windows)}")

# # Optional: Ausgabe eines Fensters zur Kontrolle
# # print(final_df.head())


In [34]:
# Spaltennamen des ersten Fensters extrahieren
reference_columns = windows[0].columns.tolist()

# Angleichung der Spaltennamen aller Fenster an die des ersten Fensters
for i in range(1, len(windows)):
    windows[i].columns = reference_columns

In [35]:
windows_combined = pd.concat([i for i in windows], ignore_index=True)

In [36]:
window_long = pd.melt(windows_combined, id_vars=['id'], var_name='time', value_name='value')

In [None]:
# Sensor-Daten zusammenfügen
df_downsampled["se"]["sensor"] = "sensor_1"

df_downsampled["fs1"]["sensor"] = "sensor_2"

df_downsampled["ps3"]["sensor"] = "sensor_3"

# Daten ins lange Format bringen
sensor_1_long = df_downsampled["se"].melt(id_vars=["id", "sensor"], var_name="time", value_name="value")
sensor_2_long = df_downsampled["fs1"].melt(id_vars=["id", "sensor"], var_name="time", value_name="value")
sensor_3_long = df_downsampled["ps3"].melt(id_vars=["id", "sensor"], var_name="time", value_name="value")

# Alle Sensor-Daten kombinieren
all_sensors_long = pd.concat([sensor_1_long, sensor_2_long, sensor_3_long])

all_sensors_long



In [None]:
# Merkmalsextraktion mit TSFresh
features = extract_features(
    all_sensors_long,
    column_id="id",        # Zeitreihen-ID
    column_sort="time",    # Zeitstempel
    column_kind="sensor",  # Sensor-Typ
    column_value="value"   # Wert
)

print(features.head())

In [53]:
extracted_features = pd.DataFrame(features)

In [None]:
extracted_features

In [None]:
extracted_features = extract_features(window_long, 
                                        column_id="id", 
                                        column_sort="time")

In [62]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df_target)

In [None]:
# clean the features
extracted_features = extracted_features.dropna(axis=1)
extracted_features.replace([np.inf, -np.inf], np.nan, inplace=True)
extracted_features = extracted_features.dropna(how = "all", axis= "columns")
# Feature-Selektion basierend auf Zielwerten
extracted_features = select_features(extracted_features, y=y_encoded)
selector = VarianceThreshold()
extracted_features = selector.fit_transform(extracted_features)

In [None]:
# check shape of feature and target
if extracted_features.shape[0] == y_encoded.shape[0]:
    print("Data is ready for Modelling!")
    print(f"Shape features: {extracted_features.shape}")
    print(f"Shape target: {y_encoded.shape}")
else:
    print("Shape of the Inputs and target don't match. Please check preprocesing steps")

In [79]:
states = [27, 6728, 49122]
features = features
target = df_target

In [80]:
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
    )
    
    model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")



Random State: 27
              precision    recall  f1-score   support

          73       1.00      1.00      1.00        72
          80       0.50      1.00      0.67        72
          90       0.00      0.00      0.00        72
         100       1.00      1.00      1.00       225

    accuracy                           0.84       441
   macro avg       0.62      0.75      0.67       441
weighted avg       0.76      0.84      0.78       441





Random State: 6728
              precision    recall  f1-score   support

          73       1.00      1.00      1.00        72
          80       0.50      1.00      0.67        72
          90       0.00      0.00      0.00        72
         100       1.00      1.00      1.00       225

    accuracy                           0.84       441
   macro avg       0.62      0.75      0.67       441
weighted avg       0.76      0.84      0.78       441





Random State: 49122
              precision    recall  f1-score   support

          73       1.00      1.00      1.00        72
          80       0.50      1.00      0.67        72
          90       0.00      0.00      0.00        72
         100       1.00      1.00      1.00       225

    accuracy                           0.84       441
   macro avg       0.62      0.75      0.67       441
weighted avg       0.76      0.84      0.78       441

Mean Accuracy: 0.8367
Std Accuracy: 0.0


In [81]:
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
    )
    
    clf = svm.SVC(kernel='linear')
    
    clf.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

Random State: 27
              precision    recall  f1-score   support

          73       1.00      1.00      1.00        72
          80       0.50      1.00      0.67        72
          90       0.00      0.00      0.00        72
         100       1.00      1.00      1.00       225

    accuracy                           0.84       441
   macro avg       0.62      0.75      0.67       441
weighted avg       0.76      0.84      0.78       441

Random State: 6728
              precision    recall  f1-score   support

          73       1.00      1.00      1.00        72
          80       0.50      1.00      0.67        72
          90       0.00      0.00      0.00        72
         100       1.00      1.00      1.00       225

    accuracy                           0.84       441
   macro avg       0.62      0.75      0.67       441
weighted avg       0.76      0.84      0.78       441

Random State: 49122
              precision    recall  f1-score   support

          73       

In [82]:
accs = []
for RANDOM_STATE in states:
 
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )
    
    # Standardise features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    
    preds = knn.predict(X_test)  
    accs.append(accuracy_score(y_test, preds))
    
    # Ergebnisse ausgeben
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))


accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

Random State: 27
              precision    recall  f1-score   support

          73       0.84      0.88      0.86        72
          80       0.71      0.71      0.71        72
          90       0.66      0.61      0.63        72
         100       0.87      0.88      0.87       225

    accuracy                           0.80       441
   macro avg       0.77      0.77      0.77       441
weighted avg       0.80      0.80      0.80       441

Random State: 6728
              precision    recall  f1-score   support

          73       0.90      0.85      0.87        72
          80       0.73      0.74      0.73        72
          90       0.70      0.68      0.69        72
         100       0.90      0.92      0.91       225

    accuracy                           0.84       441
   macro avg       0.81      0.80      0.80       441
weighted avg       0.84      0.84      0.84       441

Random State: 49122
              precision    recall  f1-score   support

          73       