<h1> Manual Feature Extraction with Tsfresh </h1>

<h2> 1. Imports and load data </h2>

In [37]:
# imports
import os
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
from scipy.signal import decimate
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_selection import select_features
from xgboost import XGBClassifier

In [38]:
class DataProcessor:
    """Loads tab-separated '<name>.txt' files into DataFrames and exposes the valve-condition target."""

    def __init__(self, input_path: str, file_names: List[str]) -> None:
        """
        Initializes the DataProcessor with the given input path and file names.

        Args:
            input_path (str): The directory path where the files are stored
                (with or without a trailing path separator).
            file_names (List[str]): List of file names (without the '.txt' extension) to be read.

        Returns:
            None
        """
        self.input_path = input_path
        self.file_names = file_names
        # Defined up front so print_shape()/create_target_df() do not fail with an
        # AttributeError when they are called before read_files().
        self.data: Dict[str, pd.DataFrame] = {}

    def read_files(self) -> Dict[str, pd.DataFrame]:
        """
        Reads the files specified in the file names and stores them in a dictionary.

        Every file is parsed as header-less, tab-separated text. Any previously
        loaded data is discarded.

        Returns:
            Dict[str, pd.DataFrame]: A dictionary where keys are file names and values are DataFrames.
        """
        self.data = {}
        print("Reading files...")
        for file in self.file_names:
            # os.path.join works whether or not input_path ends with a separator
            file_path = os.path.join(self.input_path, file + '.txt')
            self.data[file] = pd.read_csv(file_path, header=None, sep='\t')
        return self.data

    def print_shape(self) -> None:
        """
        Prints the shape of each loaded DataFrame.

        Returns:
            None
        """
        print("Files read:")
        for file, frame in self.data.items():
            print(f"{file}: {frame.shape}")

    def create_target_df(self) -> pd.Series:
        """
        Renames the columns in data['target'] and extracts the column 'Valve_Condition'.

        The column names of the loaded 'target' DataFrame are overwritten in place.

        Returns:
            pd.Series: A pandas Series containing the 'Valve_Condition' column from the target DataFrame.

        Raises:
            KeyError: If 'target' has not been loaded yet.
        """
        if 'target' not in self.data:
            raise KeyError(
                "'target' has not been loaded; include it in file_names and call read_files() first"
            )
        target_columns = ['Cooler_Condition', 'Valve_Condition',
                          'Internal_Pump_Leakage', 'Hydraulic_Accumulator',
                          'Stable_Flag']
        self.data['target'].columns = target_columns
        self.valve_condition = self.data['target']['Valve_Condition']
        return self.valve_condition

def process_data(
    input_path: str = "input_data/",
    file_names: Optional[List[str]] = None,
) -> Tuple[Dict[str, pd.DataFrame], pd.Series]:
    """
    Processes the data by reading the files and extracting the target Series.

    Args:
        input_path (str): Directory containing the '<name>.txt' files.
            Defaults to "input_data/".
        file_names (Optional[List[str]]): Names of the files to load (without
            extension). Must contain "target". Defaults to the full list of
            sensor files plus "target".

    Returns:
        Tuple[Dict[str, pd.DataFrame], pd.Series]: A tuple containing the data dictionary and the valve condition Series.
    """
    if file_names is None:
        file_names = [
            "ce", "cp", "eps1", "se", "vs1",
            "fs1", "fs2",
            "ps1", "ps2", "ps3", "ps4", "ps5", "ps6",
            "ts1", "ts2", "ts3", "ts4", "target"
        ]

    processor = DataProcessor(input_path, file_names)
    data = processor.read_files()
    processor.print_shape()
    df_target = processor.create_target_df()
    return data, df_target

# Load every sensor file plus the target once. `data` stays at notebook level
# because downsample_data() reads it as a global.
data, df_target = process_data()

Reading files...
Files read:
ce: (2205, 60)
cp: (2205, 60)
eps1: (2205, 6000)
se: (2205, 60)
vs1: (2205, 60)
fs1: (2205, 600)
fs2: (2205, 600)
ps1: (2205, 6000)
ps2: (2205, 6000)
ps3: (2205, 6000)
ps4: (2205, 6000)
ps5: (2205, 6000)
ps6: (2205, 6000)
ts1: (2205, 60)
ts2: (2205, 60)
ts3: (2205, 60)
ts4: (2205, 60)
target: (2205, 5)


<h2> 2. Signal Preprocessing </h2>

<h3> Input data </h3>

Steps:

<ul>
    <li>If a signal's sampling frequency is above 1 Hz, it is downsampled to 1 Hz (60 samples per row)</li>
    <li>Downsampled signals are stored in a new dictionary</li>
    <li>An ID column gets added to the downsampled signals</li>
    <li>The downsampled signals are stacked row-wise into one DataFrame</li>
</ul>

In [39]:
def downsample_data(
    df_list: list,
    source: Optional[Dict[str, pd.DataFrame]] = None,
    target_length: int = 60,
    kind_column: Optional[str] = None,
) -> pd.DataFrame:
    """
    Downsamples each DataFrame named in 'df_list' to 'target_length' samples per row
    with scipy's decimate (FIR filter) and returns all signals stacked row-wise.

    Parameters:
    -----------
    df_list : list
        A list of keys (strings) that indicate which DataFrames should be processed.

    source : Dict[str, pd.DataFrame], optional
        Dictionary holding the raw DataFrames. When omitted, the notebook-level
        'data' dictionary is used.

    target_length : int, default 60
        Number of samples per row after downsampling. A DataFrame whose column
        count is a larger multiple of this value is decimated by that factor
        (6000 -> factor 100, 600 -> factor 10); any other DataFrame is passed
        through unchanged.

    kind_column : str, optional
        If given, a column of that name holding the key of the originating
        DataFrame is added to every row. Without it the result does not record
        which signal a row came from, although all signals share the same ids.

    Returns:
    --------
    pd.DataFrame
        The processed signals of all requested DataFrames stacked row-wise.
        Each row carries an 'id' column with its row position in the original
        DataFrame, so ids repeat once per entry of 'df_list'.

    Raises:
    -------
    ValueError
        If 'df_list' is empty.
    """
    if not df_list:
        raise ValueError("df_list must contain at least one DataFrame key")

    frames = data if source is None else source

    df_downsampled: Dict[str, pd.DataFrame] = {}

    for name in df_list:
        frame = frames[name]
        n_samples = frame.shape[1]

        if n_samples > target_length and n_samples % target_length == 0:
            downsample_factor = n_samples // target_length
            # decimate filters along the given axis, so all rows are processed in one call
            values = decimate(frame.to_numpy(), downsample_factor, ftype='fir', axis=1)
            reduced = pd.DataFrame(values)
        else:
            # copy so that adding 'id' below does not alter the caller's DataFrame
            reduced = frame.copy()

        reduced["id"] = reduced.index
        if kind_column is not None:
            reduced[kind_column] = name
        df_downsampled[name] = reduced

    # Debugging: print shape of each downsampled DataFrame
    for i in df_downsampled.keys():
        print(f"shape of {i}: {df_downsampled[i].shape}")

    # Combine the downsampled signals into one dataframe (rows stacked, index renumbered)
    df_combined = pd.concat([df_downsampled[name] for name in df_list], ignore_index=True)

    return df_combined

In [40]:
# Create the combined, downsampled dataframe.
# Only 'ps3' and 'eps1' are used here; both were loaded with 6000 samples per row
# (see the shapes printed after loading).
df_list = ['ps3', 'eps1']
df_combined = downsample_data(df_list)

shape of ps3: (2205, 61)
shape of eps1: (2205, 61)


<h3> Transform Input Data into long format </h3>

tsfresh expects its input in long format (one row per id, time step and value), so we melt the combined DataFrame into that shape.

In [41]:
# Reshape to long format: one row per (id, time, value), where `time` is the
# original column label of each sample.
# NOTE(review): df_combined stacks the selected sensors row-wise and every sensor
# reuses the same id range, so each (id, time) pair occurs once per sensor and the
# sensor of origin is not recorded here. Confirm that mixing them is intended.
df_combined_long = pd.melt(df_combined, id_vars=['id'], var_name='time', value_name='value')

<h3>Target Data</h3>

Encoding the target labels

In [42]:
# Map the distinct Valve_Condition values to consecutive integer class labels
# (they show up as classes 0-3 in the classification reports further down).
# `encoder` is kept so the labels can be mapped back with encoder.inverse_transform.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(df_target)

<h2>3. Extract Features </h2>

In [43]:

# Restrict the extraction to tsfresh's MinimalFCParameters preset of feature calculators
settings = MinimalFCParameters()

# Extract the features per `id`, with the samples ordered by `time`.
# NOTE(review): no column_kind is passed, so the values of all stacked sensors
# appear to be treated as one series per id. Confirm that this is intended.
extracted_features = extract_features(df_combined_long, 
                                      column_id="id", 
                                      column_sort="time", 
                                      default_fc_parameters=settings)


Feature Extraction: 100%|██████████| 30/30 [00:05<00:00,  5.95it/s]


In [44]:
def clean_features(
    feature_input: pd.DataFrame,
    target: np.ndarray | pd.Series | None = None,
) -> np.ndarray:
    """
    Cleans the input feature DataFrame by performing the following steps:
    1. Replaces infinite values (both positive and negative) with NaN.
    2. Drops every column that contains at least one NaN value.
    3. Uses the select_features function offered by tsfresh to keep only the
       features that are relevant for the target variable.
    4. Applies variance thresholding to remove features with zero variance.

    Parameters:
    -----------
    feature_input : pd.DataFrame
        A DataFrame containing the input feature data. It is not modified.

    target : np.ndarray | pd.Series, optional
        The target or label data used for feature selection, one entry per row
        of 'feature_input'. When omitted, the notebook-level 'y_encoded' is used.

    Returns:
    --------
    np.ndarray
        The cleaned and transformed feature data, after dropping NaN and infinite values,
        selecting relevant features, and applying variance thresholding.
    """
    y = y_encoded if target is None else target

    # Step 1: Turn infinities into NaN first, so the NaN filter below also removes
    # columns that contain infinite values
    features_cleaned = feature_input.replace([np.inf, -np.inf], np.nan)

    # Step 2: Drop every column with at least one NaN (tsfresh's select_features
    # does not accept NaN values)
    features_cleaned = features_cleaned.dropna(axis="columns", how="any")

    # Step 3: Use select_features() to identify relevant features based on the target
    features_cleaned = select_features(features_cleaned, y=y)

    # Step 4: Apply variance thresholding (default threshold: drop constant features)
    selector = VarianceThreshold()
    features_cleaned = selector.fit_transform(features_cleaned)

    return features_cleaned


In [45]:
# clean_features() takes the target from the notebook-level `y_encoded`, so the
# label-encoding cell must have been run before this one.
features = clean_features(extracted_features)

In [46]:
import numpy as np
import pandas as pd

def check_data_readiness(feature_input: np.ndarray | pd.DataFrame, target_input: np.ndarray | pd.Series) -> None:
    """
    Checks if the number of samples (rows) in the feature and target arrays/dataframes match.
    If they match, prints the shape of both feature and target. Otherwise, prints an error message.
    
    Parameters:
    -----------
    feature : np.ndarray | pd.DataFrame
        The feature data, either as a NumPy array or a Pandas DataFrame.
        
    target : np.ndarray | pd.Series
        The target data, either as a NumPy array or a Pandas Series.
    
    Returns:
    --------
    None
        This function does not return anything, it simply prints messages based on the shape check.
    """
    if feature_input.shape[0] == target_input.shape[0]:
        print("Data is ready for Modelling!")
        print(f"Shape features: {feature_input.shape}")
        print(f"Shape target: {target_input.shape}")
    else:
        print("Shape of the Inputs and target don't match. Please check pre-processing steps")


In [47]:
# Sanity check: features and encoded target must have the same number of rows.
check_data_readiness(features, y_encoded)

Data is ready for Modelling!
Shape features: (2205, 6)
Shape target: (2205,)


<h2>4. Making predictions </h2>

In [48]:
# Seeds for the repeated stratified train/test splits used by every model below
states = [27, 6728, 49122]
# `features` (output of clean_features) is used as-is; the former
# `features = features` self-assignment was a no-op and has been removed.
target = y_encoded

<h3>4.1. AdaBoost Classifier </h3>

In [49]:
# Evaluate AdaBoost on one stratified 80/20 split per seed and collect the accuracies
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )

    # Seed the booster too: without random_state the result of a given split
    # can differ between runs, because the base trees are not seeded.
    model = AdaBoostClassifier(
        algorithm='SAMME',
        estimator=DecisionTreeClassifier(max_depth=3),
        n_estimators=50,
        random_state=RANDOM_STATE,
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accs.append(accuracy_score(y_test, preds))

    # print classification report
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))

# Mean and (population) standard deviation of the accuracy across the seeds
accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

Random State: 27
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        72
           1       0.99      0.99      0.99        72
           2       0.99      1.00      0.99        72
           3       1.00      1.00      1.00       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Random State: 6728
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        72
           2       0.97      0.97      0.97        72
           3       0.99      0.99      0.99       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Random State: 49122
              precision    recall  f1-score   support

           0       

<h3>4.2. K-Nearest Neighbours</h3>

In [50]:
# Evaluate k-nearest neighbours on one stratified 80/20 split per seed
accs = []
for RANDOM_STATE in states:
 
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )
    
    # Standardise features: the scaler is fitted on the training split only and
    # then applied to the test split, so no test-set statistics leak into training
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    
    preds = knn.predict(X_test) 
    accs.append(accuracy_score(y_test, preds))
    
    # print classification report
    print(f"Random State: {RANDOM_STATE}")
    print(classification_report(y_test, preds, zero_division=0.0))


# Mean and (population) standard deviation of the accuracy across the seeds
accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

Random State: 27
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.97      0.99        72
           2       0.96      1.00      0.98        72
           3       1.00      0.99      1.00       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Random State: 6728
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        72
           2       0.97      0.99      0.98        72
           3       1.00      0.99      0.99       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Random State: 49122
              precision    recall  f1-score   support

           0       

<h3>4.3. XGBoost Classifier</h3>

In [51]:
# Evaluate XGBoost on one stratified 80/20 split per seed and collect the accuracies
accs = []
for RANDOM_STATE in states:
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE, stratify=target
    )

    # The target has more than two classes, so the multiclass log loss ("mlogloss")
    # is the matching metric; the model is seeded like the split for reproducibility.
    xgb_clf = XGBClassifier(n_estimators=50,
                            learning_rate=0.05,
                            eval_metric="mlogloss",
                            n_jobs=-1,
                            random_state=RANDOM_STATE)
    xgb_clf.fit(X_train, y_train)

    preds = xgb_clf.predict(X_test)

    accs.append(accuracy_score(y_test, preds))

    # print classification report
    print(f"Classification Report for random state {RANDOM_STATE}:")
    print(classification_report(y_test, preds, zero_division=0.0))

# Mean and (population) standard deviation of the accuracy across the seeds
accs_mean = round(np.mean(accs), 4)
accs_std = round(np.std(accs), 4)

print(f"Mean Accuracy: {accs_mean}")
print(f"Std Accuracy: {accs_std}")

Classification Report for random state 27:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        72
           1       1.00      0.99      0.99        72
           2       0.99      1.00      0.99        72
           3       1.00      1.00      1.00       225

    accuracy                           1.00       441
   macro avg       0.99      1.00      0.99       441
weighted avg       1.00      1.00      1.00       441

Classification Report for random state 6728:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        72
           2       0.97      0.99      0.98        72
           3       1.00      0.99      0.99       225

    accuracy                           0.99       441
   macro avg       0.99      0.99      0.99       441
weighted avg       0.99      0.99      0.99       441

Classification Report for random state 491