In [141]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from scipy.signal import decimate
from tsfresh import extract_features
from tsfresh.feature_selection import select_features

In [3]:
class DataProcessor:
    def __init__(self, input_path, file_names):
        self.input_path = input_path
        self.file_names = file_names
        
    def read_files(self):
        self.data = {}
        print("Reading files...")
        for file in self.file_names:
            with open(self.input_path + file + '.txt', 'r') as f:
                self.data[file] = pd.read_csv(f, header=None, sep='\t')
        return self.data
    
    def print_shape(self):
        print("Files read:")
        for file in self.data:
            print(f"{file}: {self.data[file].shape}")
            
    def create_target_df(self):
        target_columns = ['Cooler_Condition', 'Valve_Condition', 
                        'Internal_Pump_Leakage', 'Hydraulic_Accumulator', 
                        'Stable_Flag']
        self.data['target'].columns = target_columns
        self.valve_condition = self.data['target']['Valve_Condition']
        #del self.data['target']
        return self.valve_condition

def process_data():
    input_path = "input_data/"
    file_names = [
        "ce", "cp", "eps1", "se", "vs1", 
        "fs1", "fs2", 
        "ps1", "ps2", "ps3", "ps4", "ps5", "ps6",
        "ts1", "ts2", "ts3", "ts4", "target"
    ]
    
    processor = DataProcessor(input_path, file_names)
    data = processor.read_files()
    processor.print_shape()
    df_target = processor.create_target_df()
    df_target = processor.valve_condition
    return data, df_target

data, df_target = process_data()

Reading files...
Files read:
ce: (2205, 60)
cp: (2205, 60)
eps1: (2205, 6000)
se: (2205, 60)
vs1: (2205, 60)
fs1: (2205, 600)
fs2: (2205, 600)
ps1: (2205, 6000)
ps2: (2205, 6000)
ps3: (2205, 6000)
ps4: (2205, 6000)
ps5: (2205, 6000)
ps6: (2205, 6000)
ts1: (2205, 60)
ts2: (2205, 60)
ts3: (2205, 60)
ts4: (2205, 60)
target: (2205, 5)


In [4]:
from scipy.signal import resample_poly, decimate
df_list = ['se', 'fs1', 'ps3']

df_resampled = {}

for df in df_list:
    resampled_signals = [] 
    if data[df].shape[1] == 6000:
        downsample_factor = 10
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            resampled_signal= decimate(row, downsample_factor, ftype='fir')  # Downsample
            resampled_signals.append(resampled_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_resampled[df] = pd.DataFrame(resampled_signals)
        df_resampled[df]["id"] = df_resampled[df].index

    elif data[df].shape[1] == 60:
        target_rate = 10
        up_factor =  10
        for i in range(data[df].shape[0]):
            row = data[df].iloc[i].values  # Extract row as a 1D array
            resampled_signal = resample_poly(row, up=up_factor, down=1)
            resampled_signals.append(resampled_signal)  # Store the result
        # Create a new DataFrame with the filtered signals and add the 'id' column
        df_resampled[df] = pd.DataFrame(resampled_signals)
        df_resampled[df]["id"] = df_resampled[df].index

    else:
        df_resampled[df] = data[df]
        df_resampled[df]["id"] = df_resampled[df].index

# Combine all DataFrames

for i in df_resampled.keys():
    print(f"shape of {i}: {df_resampled[i].shape}")
    
df_combined = pd.concat([df_resampled[df] for df in df_list], ignore_index=True)

df_combined


shape of se: (2205, 601)
shape of fs1: (2205, 601)
shape of ps3: (2205, 601)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,591,592,593,594,595,596,597,598,599,id
0,68.084518,67.004909,63.735781,58.472067,51.525887,43.304054,34.279007,24.955454,15.835382,7.384111,...,63.042752,56.775728,49.699019,42.053569,34.116221,26.182378,18.547245,11.487055,5.241687,0
1,68.309669,67.223859,63.941248,58.657706,51.686681,43.436603,34.381677,25.028401,15.880423,7.404480,...,63.348518,57.051406,49.940751,42.258553,34.282915,26.310632,18.638335,11.543615,5.267561,1
2,68.640890,67.550741,64.253154,58.944846,51.940679,43.650970,34.552150,25.153131,15.960003,7.441808,...,63.368599,57.061229,49.942979,42.255692,34.277208,26.303968,18.632191,11.539037,5.265163,2
3,68.673912,67.584419,64.286443,58.976666,51.969970,43.676747,34.573564,25.169527,15.970966,7.447205,...,63.594756,57.268133,50.126315,42.412283,34.405146,26.402670,18.702376,11.582618,5.285082,3
4,68.914073,67.813779,64.497172,59.162394,52.126217,43.801182,34.666078,25.232098,16.007355,7.462487,...,63.691327,57.356101,50.204525,42.479682,34.460921,26.446366,18.733978,11.602593,5.294378,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6610,1.413650,0.711093,-0.232004,0.128203,-0.079498,0.050507,-0.031521,0.018792,-0.010514,0.005630,...,2.401040,2.360553,2.372151,2.352367,2.442485,2.296834,2.383129,2.321585,2.528414,2200
6611,1.413906,0.735279,-0.238565,0.131707,-0.081651,0.051875,-0.032380,0.019309,-0.010806,0.005786,...,2.327278,2.354860,2.300761,2.395477,2.384843,2.229442,2.440372,2.284959,2.570547,2201
6612,1.404983,0.695732,-0.227869,0.125979,-0.078129,0.049638,-0.030977,0.018465,-0.010329,0.005531,...,2.300923,2.403128,2.324703,2.263556,2.345514,2.259792,2.337613,2.154176,2.443541,2202
6613,1.422501,0.703707,-0.231483,0.128062,-0.079435,0.050468,-0.031491,0.018769,-0.010497,0.005620,...,2.341401,2.443357,2.450324,2.251303,2.389362,2.271598,2.352720,2.241676,2.495724,2203


In [5]:
import pandas as pd
from tsfresh import extract_features
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fenstergröße und Überlappung
window_size = 200  # Fenstergröße
overlap = 0  # Überlappung

# Entferne 'id' und setze die Zielspalte 'id'
window_df = df_combined.drop(columns=['id'])
id = df_combined["id"]

# Liste für die Fenster
windows = []

# Schleife zur Fenstererstellung
for start_col in range(0, window_df.shape[1] - window_size + 1, window_size - overlap):
    end_col = start_col + window_size
    window = window_df.iloc[:, start_col:end_col]
    window["id"] = id
    windows.append(window)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window["id"] = id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window["id"] = id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window["id"] = id


In [6]:
# Spaltennamen des ersten Fensters extrahieren
reference_columns = windows[0].columns.tolist()

# Angleichung der Spaltennamen aller Fenster an die des ersten Fensters
for i in range(1, len(windows)):
    windows[i].columns = reference_columns

In [7]:
window_data = pd.concat(windows, ignore_index=True)

In [8]:
window_long = pd.melt(window_data, id_vars=['id'], var_name='time', value_name='value')

In [9]:
window_long

Unnamed: 0,id,time,value
0,0,0,68.084518
1,1,0,68.309669
2,2,0,68.640890
3,3,0,68.673912
4,4,0,68.914073
...,...,...,...
3968995,2200,199,2.528414
3968996,2201,199,2.570547
3968997,2202,199,2.443541
3968998,2203,199,2.495724


In [None]:
extracted_features = extract_features(window_long, 
                                     column_id="id", 
                                     column_sort="time")

Feature Extraction:   0%|          | 0/30 [00:00<?, ?it/s]

In [155]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df_target)

In [None]:
# clean the features
extracted_features = extracted_features.dropna(axis=1)
extracted_features.replace([np.inf, -np.inf], np.nan, inplace=True)
extracted_features = extracted_features.dropna(how = "all", axis= "columns")
# Feature-Selektion basierend auf Zielwerten
extracted_features = select_features(extracted_features, y=y_encoded)
selector = VarianceThreshold()
extracted_features = selector.fit_transform(extracted_features)

In [None]:
# check shape of feature and target
if extracted_features.shape[0] == y_encoded.shape[0]:
    print("Data is ready for Modelling!")
    print(f"Shape features: {extracted_features.shape}")
    print(f"Shape target: {y_encoded.shape}")
else:
    print("Shape of the Inputs and target don't match. Please check preprocesing steps")

In [156]:
states = [27, 6728, 49122]
features = features_df
target = df_target

In [None]:
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
    )
    
    model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50
    )
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

In [None]:
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size = 0.2, random_state = RANDOM_STATE, stratify = target
    )
    
    clf = svm.SVC(kernel='linear')
    
    clf.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

In [None]:
accs = []
for RANDOM_STATE in states:
 
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )
    
    # Standardise features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    
    preds = knn.predict(X_test)  
    accs.append(accuracy_score(y_test, preds))
    
    # Ergebnisse ausgeben
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))


accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")