### Data preprocessing for the 1st proactive strategy [PMF]

In [1]:
# Importing necessary libraries for data preprocessing
import os
import gc
import utils
import numpy as np
import pandas as pd
from pathlib import Path
from itertools import groupby
from tabulate import tabulate
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#### Loading the raw dataset

In [2]:
# Loading the raw dataset
data = utils.load_data("data/raw/cobot_data.xlsx", "xlsx")
print(f'Dataset loaded. Shape: {data.shape}')

Dataset loaded. Shape: (7409, 24)


In [3]:
# Dropping the column "Num" and renaming a few columns for better readability
data = data.drop(columns="Num")
print(f'Dataset after dropping column "Num". Shape: {data.shape}')

data.rename(columns={
    'cycle': 'Cycle',
    'Robot_ProtectiveStop': 'Robot Protective Stop', 
    'grip_lost': 'Grip Lost', 
    'Tool_current': 'Tool Current'
}, inplace=True)

Dataset after dropping column "Num". Shape: (7409, 23)


In [4]:
# Checking the columns present in the dataset
print(data.columns.tolist())

['Timestamp', 'Current_J0', 'Temperature_T0', 'Current_J1', 'Temperature_J1', 'Current_J2', 'Temperature_J2', 'Current_J3', 'Temperature_J3', 'Current_J4', 'Temperature_J4', 'Current_J5', 'Temperature_J5', 'Speed_J0', 'Speed_J1', 'Speed_J2', 'Speed_J3', 'Speed_J4', 'Speed_J5', 'Tool Current', 'Cycle', 'Robot Protective Stop', 'Grip Lost']


#### Preprocessing the timestamp feature in the dataset

In [5]:
# Function to preprocess the timestamp feature and get new temporal feature for the dataset
def process_timestamp(df):
    """Turn the raw "Timestamp" strings into a single "Time of Day" feature.

    The "Timestamp" column of ``df`` is parsed in place: surrounding double
    quotes are stripped and values that do not match the expected format
    become NaT. Rows with an unparseable timestamp are dropped (a warning with
    the count is printed), the remaining rows are sorted chronologically and
    re-indexed from 0.

    Args:
        df: DataFrame holding a string column named "Timestamp".

    Returns:
        A new DataFrame without "Timestamp" (and without any "Hour", "Minute"
        or "Second" column) and with "Time of Day", the number of whole
        seconds elapsed since midnight.
    """
    # Parse into the caller's frame, exactly as the column assignment implies
    df["Timestamp"] = pd.to_datetime(
        df["Timestamp"].str.strip('"'),
        format="%Y-%m-%dT%H:%M:%S.%fZ",
        errors="coerce"
    )

    dropped_rows = df["Timestamp"].isnull().sum()
    if dropped_rows > 0:
        print(f"Warning: {dropped_rows} rows dropped due to invalid timestamps.")
        df = df.dropna(subset=["Timestamp"])

    ordered = df.sort_values(by="Timestamp").reset_index(drop=True)

    # Seconds since midnight; the fractional part of the second is discarded
    clock = ordered["Timestamp"].dt
    ordered["Time of Day"] = (clock.hour * 3600) + (clock.minute * 60) + clock.second

    # "Hour"/"Minute"/"Second" are never part of the result, even if the input had them
    return ordered.drop(columns=["Timestamp", "Hour", "Minute", "Second"], errors="ignore")

In [6]:
# Creating a new temporal feature using timestamp
data = process_timestamp(data)
print(f'Dataset after creating new temporal feature using column "Timestamp". Shape: {data.shape}')

Dataset after creating new temporal feature using column "Timestamp". Shape: (7409, 23)


In [7]:
# Selecting the features needed from the dataset
features = data.columns.difference(["Robot Protective Stop", "Grip Lost", "Tool Current"]).tolist()
print("Features identified:", features)
print("Features Count:", len(features))

Features identified: ['Current_J0', 'Current_J1', 'Current_J2', 'Current_J3', 'Current_J4', 'Current_J5', 'Cycle', 'Speed_J0', 'Speed_J1', 'Speed_J2', 'Speed_J3', 'Speed_J4', 'Speed_J5', 'Temperature_J1', 'Temperature_J2', 'Temperature_J3', 'Temperature_J4', 'Temperature_J5', 'Temperature_T0', 'Time of Day']
Features Count: 20


#### Handling missing values in the dataset

In [8]:
# Function to generate missing values report for a dataset
def generate_missing_values_table(dataset, threshold=0.5):
    """Build a text report of the missing values in ``dataset``.

    Args:
        dataset: DataFrame to inspect.
        threshold: Fraction of the column count; a row is counted as "mostly
            null" when its number of nulls exceeds
            ``int(number_of_columns * threshold)``.

    Returns:
        A string made of either a grid table (one line per column that has at
        least one missing value) or a "no missing values" sentence, followed by
        the count of mostly-null rows. Any exception raised while building the
        report is returned as an "An error occurred: ..." string instead of
        being propagated.
    """
    try:
        if not isinstance(dataset, pd.DataFrame):
            raise ValueError("Input dataset must be a pandas DataFrame.")

        null_mask = dataset.isnull()
        nulls_per_column = null_mask.sum()

        report = pd.DataFrame({
            "Column": dataset.columns,
            "Missing Values": nulls_per_column,
            "Missing Percentage (%)": (nulls_per_column / len(dataset)) * 100
        })
        # Keep only the columns that actually have gaps
        report = report.loc[report["Missing Values"] > 0].reset_index(drop=True)

        row_cutoff = int(len(dataset.columns) * threshold)
        sparse_row_count = (null_mask.sum(axis=1) > row_cutoff).sum()

        if report.empty:
            body = "No missing values found in the dataset."
        else:
            body = tabulate(report, headers="keys", tablefmt="grid", showindex=False)

        return f"{body}\n\nRows with >{int(threshold * 100)}% null values: {sparse_row_count}"

    except Exception as e:
        return f"An error occurred: {e}"

In [9]:
# Checking if the missing values are present in the dataset
print(generate_missing_values_table(data))

+-----------------------+------------------+--------------------------+
| Column                |   Missing Values |   Missing Percentage (%) |
| Current_J0            |               46 |                 0.620867 |
+-----------------------+------------------+--------------------------+
| Temperature_T0        |               54 |                 0.728843 |
+-----------------------+------------------+--------------------------+
| Current_J1            |               54 |                 0.728843 |
+-----------------------+------------------+--------------------------+
| Temperature_J1        |               54 |                 0.728843 |
+-----------------------+------------------+--------------------------+
| Current_J2            |               54 |                 0.728843 |
+-----------------------+------------------+--------------------------+
| Temperature_J2        |               54 |                 0.728843 |
+-----------------------+------------------+--------------------

In [10]:
# Handling the missing values in the target values
# NOTE(review): the condition inside each fillna() is evaluated once for the whole column, not per row.
# If any value of the other target equals the string "FALSE", every missing value is filled with "FALSE";
# otherwise the column is passed as its own fill value, which presumably leaves the gaps untouched.
# A row-wise rule may have been intended — TODO confirm against the raw data.
data["Robot Protective Stop"] = data["Robot Protective Stop"].fillna("FALSE" if (data["Grip Lost"] == "FALSE").any() else data["Robot Protective Stop"])
data["Grip Lost"] = data["Grip Lost"].fillna("FALSE" if (data["Robot Protective Stop"] == "FALSE").any() else data["Grip Lost"])

# Handling the missing values in the features
# The forward/backward fill runs over the whole frame, so target values that are still missing here are filled too
data = data.ffill().bfill()
data["Grip Lost"] = data["Grip Lost"].astype(int)

# Checking if the missing values are still present in the dataset
print(generate_missing_values_table(data))

No missing values found in the dataset.

Rows with >50% null values: 0


In [11]:
# Saving the dataset
utils.save_data_csv(data, "data/processed", "handling_missing.csv")

Dataset saved: data/processed\handling_missing.csv


In [12]:
# Freeing up memory
del data
del features
gc.collect()

622

In [13]:
# Loading the dataset
data = utils.load_data("data/processed/handling_missing.csv", "csv")

#### Handling outliers in the dataset

In [14]:
# Selecting the features from the dataset
features = data.columns.difference(["Robot Protective Stop", "Grip Lost", "Tool Current"]).tolist()
print("Features identified:", features)
print("Features Count:", len(features))

Features identified: ['Current_J0', 'Current_J1', 'Current_J2', 'Current_J3', 'Current_J4', 'Current_J5', 'Cycle', 'Speed_J0', 'Speed_J1', 'Speed_J2', 'Speed_J3', 'Speed_J4', 'Speed_J5', 'Temperature_J1', 'Temperature_J2', 'Temperature_J3', 'Temperature_J4', 'Temperature_J5', 'Temperature_T0', 'Time of Day']
Features Count: 20


In [15]:
# Function to handle outliers present in the dataset
def handle_outliers(df, features, threshold):
    """Blank out z-score outliers in the given feature columns.

    For every listed feature the z-score of each value is computed from the
    column mean and standard deviation; values whose absolute z-score is
    greater than ``threshold`` are replaced with NaN. The columns of ``df`` are
    overwritten in place.

    Args:
        df: DataFrame to clean (modified in place).
        features: List of column names to process.
        threshold: Absolute z-score above which a value is treated as an outlier.

    Returns:
        The same ``df`` object, or None when validation or processing fails
        (the error is printed rather than raised).
    """
    try:
        if not isinstance(df, pd.DataFrame):
            raise TypeError("The input dataset must be a pandas DataFrame.")
        if not isinstance(features, list):
            raise TypeError("The features parameter must be a list of feature names.")

        # Validate every name up front so nothing is modified when one is unknown
        not_found = object()
        absent = next((name for name in features if name not in df.columns), not_found)
        if absent is not not_found:
            raise ValueError(f"Feature '{absent}' is not present in the dataset.")

        for name in features:
            column = df[name]
            deviation = (column - column.mean()) / column.std()
            df[name] = np.where(deviation.abs() > threshold, np.nan, column)

        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [16]:
# Checking for outliers in the dataset
data = handle_outliers(data, features, threshold=3)
print(generate_missing_values_table(data, threshold=0.3))

+------------+------------------+--------------------------+
| Column     |   Missing Values |   Missing Percentage (%) |
| Current_J0 |              209 |                 2.82089  |
+------------+------------------+--------------------------+
| Current_J1 |              155 |                 2.09205  |
+------------+------------------+--------------------------+
| Current_J2 |              159 |                 2.14604  |
+------------+------------------+--------------------------+
| Current_J3 |              197 |                 2.65893  |
+------------+------------------+--------------------------+
| Current_J4 |              193 |                 2.60494  |
+------------+------------------+--------------------------+
| Current_J5 |               12 |                 0.161965 |
+------------+------------------+--------------------------+
| Speed_J0   |              242 |                 3.2663   |
+------------+------------------+--------------------------+
| Speed_J1   |          

In [17]:
# Handling the outliers present in the dataset
data = data.ffill().bfill()
print(generate_missing_values_table(data, threshold=0.3))

No missing values found in the dataset.

Rows with >30% null values: 0


In [18]:
# Saving the initial processed dataset
print(f'Dataset after removing outliers. Shape: {data.shape}')
utils.save_data_csv(data, "data/processed", "outliers_removed.csv")

Dataset after removing outliers. Shape: (7409, 23)
Dataset saved: data/processed\outliers_removed.csv


In [19]:
# Freeing up memory
del data
del features
gc.collect()

0

#### Adding interaction features to the dataset

In [20]:
# Loading the initial processed dataset
data = utils.load_data("data/processed/outliers_removed.csv", "csv")

In [21]:
# Getting the cycle time of each operational cycle and also the number of occurrences from the feature "Cycle" present in the dataset
cycle_summary = data[["Time of Day", "Cycle"]].groupby('Cycle').agg(
    cycle_time=('Time of Day', lambda x: x.max() - x.min()),
    occurrences=('Cycle', 'size')
).reset_index()

In [22]:
# Saving the results into a dataset for future usage
utils.save_data_csv(cycle_summary, "data/processed", "cycle_summary.csv")

Dataset saved: data/processed\cycle_summary.csv


In [23]:
# Creating new features called "Sin Time" and "Cos Time" using the created temporal feature "Time of Day"
# "Time of Day" is in seconds since midnight, so dividing by 86400 (seconds per day) maps one day onto one full turn
data['Sin Time'] = np.sin(2 * np.pi * data["Time of Day"] / 86400)
data['Cos Time'] = np.cos(2 * np.pi * data["Time of Day"] / 86400)

# Creating a new interaction feature called "Time Phase" in the range -π to π using the created features "Sin Time" and "Cos Time"
data['Time Phase'] = np.arctan2(data['Sin Time'], data['Cos Time'])

# Normalizing the column to the range 0 to 2π for model training
data['Time Phase'] = (data['Time Phase'] + 2 * np.pi) % (2 * np.pi)

# Creating a new interaction feature called "Cycle Time" using the operational cycle count feature present in the dataset
# NOTE: despite its name, "Cycle Time" holds the number of rows that share the row's "Cycle" value (group size), not a duration
data['Cycle Time'] = data.groupby('Cycle')['Cycle'].transform('size')
data = data.drop(columns=["Time of Day", "Sin Time", "Cos Time", "Cycle"])

In [24]:
# Creating new interaction features called "Average Temperature" and "Gradient Temperature" using the temperature joints features
# (the temperature columns are J1..J5 plus T0; the spread is the per-row max minus min)
temperature_readings = data[[f'Temperature_J{i}' for i in range(1, 6)] + ['Temperature_T0']]
data['Average Temperature'] = temperature_readings.mean(axis=1)
data['Gradient Temperature'] = temperature_readings.max(axis=1) - temperature_readings.min(axis=1)

# Creating new interaction features that define the direction of current and speeds from each joint in the dataset
# The sign is stored in a separate "<quantity>_Direction_J<n>" column and the original column keeps the magnitude only
for joint in range(6):
    for quantity in ('Speed', 'Current'):
        signed_values = data[f'{quantity}_J{joint}']
        data[f'{quantity}_Direction_J{joint}'] = np.sign(signed_values)
        data[f'{quantity}_J{joint}'] = np.abs(signed_values)

# Creating a new feature called "Load Imbalance" using the current joints values present in the dataset
# Computed on the magnitudes produced above: (max - min) / mean across the six joints of each row
joint_currents = data[[f'Current_J{i}' for i in range(0, 6)]]
data['Load Imbalance'] = (
    joint_currents.max(axis=1) - joint_currents.min(axis=1)) / joint_currents.mean(axis=1)

In [25]:
# Selecting the newly added interaction features from the dataset
print(f'Dataset after creating interaction features. Shape: {data.shape}')
features = data.columns.difference(["Robot Protective Stop", "Grip Lost", "Tool Current"]).tolist()
print("Features identified:", features)
print("Features Count:", len(features))

Dataset after creating interaction features. Shape: (7409, 38)
Features identified: ['Average Temperature', 'Current_Direction_J0', 'Current_Direction_J1', 'Current_Direction_J2', 'Current_Direction_J3', 'Current_Direction_J4', 'Current_Direction_J5', 'Current_J0', 'Current_J1', 'Current_J2', 'Current_J3', 'Current_J4', 'Current_J5', 'Cycle Time', 'Gradient Temperature', 'Load Imbalance', 'Speed_Direction_J0', 'Speed_Direction_J1', 'Speed_Direction_J2', 'Speed_Direction_J3', 'Speed_Direction_J4', 'Speed_Direction_J5', 'Speed_J0', 'Speed_J1', 'Speed_J2', 'Speed_J3', 'Speed_J4', 'Speed_J5', 'Temperature_J1', 'Temperature_J2', 'Temperature_J3', 'Temperature_J4', 'Temperature_J5', 'Temperature_T0', 'Time Phase']
Features Count: 35


In [26]:
# Saving the newly added interaction features dataset
utils.save_data_csv(data, "data/processed", "interaction_features.csv")

Dataset saved: data/processed\interaction_features.csv


In [27]:
# Freeing up memory
del data
del features
gc.collect()

0

#### Saving the processed dataset with ordered columns

In [28]:
# Loading the interaction features dataset
data = utils.load_data("data/processed/interaction_features.csv", "csv")

In [29]:
# Selecting the features from the loaded dataset
features = data.columns.difference(["Robot Protective Stop", "Grip Lost", "Tool Current"]).tolist()
print("Features identified:", features)
print("Features Count:", len(features))

Features identified: ['Average Temperature', 'Current_Direction_J0', 'Current_Direction_J1', 'Current_Direction_J2', 'Current_Direction_J3', 'Current_Direction_J4', 'Current_Direction_J5', 'Current_J0', 'Current_J1', 'Current_J2', 'Current_J3', 'Current_J4', 'Current_J5', 'Cycle Time', 'Gradient Temperature', 'Load Imbalance', 'Speed_Direction_J0', 'Speed_Direction_J1', 'Speed_Direction_J2', 'Speed_Direction_J3', 'Speed_Direction_J4', 'Speed_Direction_J5', 'Speed_J0', 'Speed_J1', 'Speed_J2', 'Speed_J3', 'Speed_J4', 'Speed_J5', 'Temperature_J1', 'Temperature_J2', 'Temperature_J3', 'Temperature_J4', 'Temperature_J5', 'Temperature_T0', 'Time Phase']
Features Count: 35


In [30]:
# Reordering the dataset features and target variables for readability
new_order = [
    'Time Phase', 'Cycle Time',
    *[f'Current_J{i}' for i in range(6)],
    *[f'Current_Direction_J{i}' for i in range(6)],
    *[f'Speed_J{i}' for i in range(6)],
    *[f'Speed_Direction_J{i}' for i in range(6)],
    'Temperature_T0', *[f'Temperature_J{i}' for i in range(1, 6)],
    'Average Temperature', 'Gradient Temperature', 'Load Imbalance',
    'Robot Protective Stop', 'Grip Lost', 'Tool Current'
]

# Arranging the data frame according to the newly defined order
data = data[new_order]
print(f'Dataset after reordering. Shape: {data.shape}')
print(data.columns.tolist())

Dataset after reordering. Shape: (7409, 38)
['Time Phase', 'Cycle Time', 'Current_J0', 'Current_J1', 'Current_J2', 'Current_J3', 'Current_J4', 'Current_J5', 'Current_Direction_J0', 'Current_Direction_J1', 'Current_Direction_J2', 'Current_Direction_J3', 'Current_Direction_J4', 'Current_Direction_J5', 'Speed_J0', 'Speed_J1', 'Speed_J2', 'Speed_J3', 'Speed_J4', 'Speed_J5', 'Speed_Direction_J0', 'Speed_Direction_J1', 'Speed_Direction_J2', 'Speed_Direction_J3', 'Speed_Direction_J4', 'Speed_Direction_J5', 'Temperature_T0', 'Temperature_J1', 'Temperature_J2', 'Temperature_J3', 'Temperature_J4', 'Temperature_J5', 'Average Temperature', 'Gradient Temperature', 'Load Imbalance', 'Robot Protective Stop', 'Grip Lost', 'Tool Current']


In [31]:
# Saving the ordered and processed dataset
utils.save_data_csv(data, "data/processed", "processed_data.csv")

Dataset saved: data/processed\processed_data.csv


In [32]:
# Freeing up memory
del data
del features
gc.collect()

0

#### Adding rolling features to the datasets

In [33]:
# Loading the ordered and processed dataset
data = utils.load_data("data/processed/processed_data.csv", "csv")

In [34]:
# Function to create and add rolling features to the processed dataset
def add_rolling_features(df, features, window_size):
    """Append rolling mean and rolling standard deviation columns.

    For every name in ``features`` two columns are added to a copy of ``df``:
    "<name>_rolling_mean" and "<name>_rolling_std", both computed over a window
    of ``window_size`` rows with ``min_periods=1``. The input frame itself is
    not modified.

    Args:
        df: Source DataFrame.
        features: List of column names (strings) to compute rolling statistics for.
        window_size: Positive integer number of rows in the rolling window.

    Returns:
        The augmented copy, or None when validation or computation fails (the
        error is printed rather than raised).
    """
    try:
        if not isinstance(df, pd.DataFrame):
            raise ValueError("The dataset must be a pandas DataFrame.")

        names_are_valid = isinstance(features, list) and all(isinstance(name, str) for name in features)
        if not names_are_valid:
            raise ValueError("Features must be a list of column names (strings).")

        if not (isinstance(window_size, int) and window_size > 0):
            raise ValueError("Window size must be a positive integer.")

        absent = [name for name in features if name not in df.columns]
        if absent:
            raise ValueError(f"The following features are missing from the dataset: {', '.join(absent)}")

        augmented = df.copy()
        for name in features:
            # Always read from the untouched input so earlier additions cannot leak in
            window = df[name].rolling(window=window_size, min_periods=1)
            augmented[f"{name}_rolling_mean"] = window.mean()
            augmented[f"{name}_rolling_std"] = window.std()

        return augmented
    except Exception as e:
        print(f"Error: {e}")
        return None

In [35]:
# Selecting features for adding rolling features
features_rll = [
    *[f'Current_J{i}' for i in range(6)],
    *[f'Speed_J{i}' for i in range(6)], 
    'Temperature_T0', *[f'Temperature_J{i}' for i in range(1, 6)]
]

In [36]:
# Loading the cycle summary dataset
cycle_summary = utils.load_data("data/processed/cycle_summary.csv", "csv")

# Calculating the inter mean (average cycle interval length)
sequence_length = int(np.mean(cycle_summary["occurrences"]))

# Saving the sequence length
with open("data/processed/sequence_length.txt", "w") as file:
    file.write(str(sequence_length))

print(f"Inter Mean (average cycle interval length) of operational cycle -> {sequence_length}")

Inter Mean (average cycle interval length) of operational cycle -> 30


In [37]:
# Adding rolling features to the dataset
data = add_rolling_features(data, features_rll, sequence_length).ffill().bfill()
print(f'Dataset - After adding rolling features. Shape: {data.shape}')

# Selecting the features from the newly added rolling features dataset
features = data.columns.difference(["Robot Protective Stop", "Grip Lost", "Tool Current"]).tolist()
print("Features identified:", features)
print("Features Count:", len(features))

Dataset - After adding rolling features. Shape: (7409, 74)
Features identified: ['Average Temperature', 'Current_Direction_J0', 'Current_Direction_J1', 'Current_Direction_J2', 'Current_Direction_J3', 'Current_Direction_J4', 'Current_Direction_J5', 'Current_J0', 'Current_J0_rolling_mean', 'Current_J0_rolling_std', 'Current_J1', 'Current_J1_rolling_mean', 'Current_J1_rolling_std', 'Current_J2', 'Current_J2_rolling_mean', 'Current_J2_rolling_std', 'Current_J3', 'Current_J3_rolling_mean', 'Current_J3_rolling_std', 'Current_J4', 'Current_J4_rolling_mean', 'Current_J4_rolling_std', 'Current_J5', 'Current_J5_rolling_mean', 'Current_J5_rolling_std', 'Cycle Time', 'Gradient Temperature', 'Load Imbalance', 'Speed_Direction_J0', 'Speed_Direction_J1', 'Speed_Direction_J2', 'Speed_Direction_J3', 'Speed_Direction_J4', 'Speed_Direction_J5', 'Speed_J0', 'Speed_J0_rolling_mean', 'Speed_J0_rolling_std', 'Speed_J1', 'Speed_J1_rolling_mean', 'Speed_J1_rolling_std', 'Speed_J2', 'Speed_J2_rolling_mean', 'Sp

In [38]:
# Saving the rolling features dataset
utils.save_data_csv(data, "data/processed", "rolling_features.csv")

Dataset saved: data/processed\rolling_features.csv


In [39]:
# Freeing up memory
del data
del features_rll
gc.collect()

0

#### Splitting the dataset into train, validation and test sets

In [40]:
# Loading the rolling features dataset
data = utils.load_data("data/processed/rolling_features.csv", "csv")

In [41]:
# Creating 3 separate datasets for each target variable
data_rb = data[data.columns.difference(["Grip Lost", "Tool Current"]).tolist()].copy()
features_rb = data_rb.columns.difference(['Robot Protective Stop']).tolist()
target_rb = "Robot Protective Stop"

data_gl = data[data.columns.difference(["Robot Protective Stop", "Tool Current"]).tolist()].copy()
features_gl = data_gl.columns.difference(['Grip Lost']).tolist()
target_gl = "Grip Lost"

data_tc = data[data.columns.difference(["Robot Protective Stop", "Grip Lost"]).tolist()].copy()
features_tc = data_tc.columns.difference(['Tool Current']).tolist()
target_tc = "Tool Current"

In [42]:
# Splitting the datasets into train, test and validation sets for model training and evaluation
trd_rb, trl_rb, vad_rb, val_rb, ted_rb, tel_rb = utils.split_data(data_rb, features_rb, target_rb)
print(f'Target variable "Robot Protective Stop" dataset - Train feature Shape: {trd_rb.shape}, Train target Shape: {trl_rb.shape}')
print(f'Target variable "Robot Protective Stop" dataset - Validation features Shape: {vad_rb.shape}, Validation target Shape: {val_rb.shape}')
print(f'Target variable "Robot Protective Stop" dataset - Test features Shape: {ted_rb.shape}, Test target Shape: {tel_rb.shape}')

trd_gl, trl_gl, vad_gl, val_gl, ted_gl, tel_gl = utils.split_data(data_gl, features_gl, target_gl)
print(f'\nTarget variable "Grip Lost" dataset - Train features Shape: {trd_gl.shape}, Train target Shape: {trl_gl.shape}')
print(f'Target variable "Grip Lost" dataset - Validation features Shape: {vad_gl.shape}, Validation target Shape: {val_gl.shape}')
print(f'Target variable "Grip Lost" dataset - Test features Shape: {ted_gl.shape}, Test target Shape: {tel_gl.shape}')

trd_tc, trl_tc, vad_tc, val_tc, ted_tc, tel_tc = utils.split_data(data_tc, features_tc, target_tc)
print(f'\nTarget variable "Tool Current" dataset - Train features Shape: {trd_tc.shape}, Train target Shape: {trl_tc.shape}')
print(f'Target variable "Tool Current" dataset - Validation features Shape: {vad_tc.shape}, Validation target Shape: {val_tc.shape}')
print(f'Target variable "Tool Current" dataset - Test features Shape: {ted_tc.shape}, Test target Shape: {tel_tc.shape}')

Target variable "Robot Protective Stop" dataset - Train feature Shape: (4772, 71), Train target Shape: (4772,)
Target variable "Robot Protective Stop" dataset - Validation features Shape: (1737, 71), Validation target Shape: (1737,)
Target variable "Robot Protective Stop" dataset - Test features Shape: (899, 71), Test target Shape: (899,)

Target variable "Grip Lost" dataset - Train features Shape: (4772, 71), Train target Shape: (4772,)
Target variable "Grip Lost" dataset - Validation features Shape: (1737, 71), Validation target Shape: (1737,)
Target variable "Grip Lost" dataset - Test features Shape: (899, 71), Test target Shape: (899,)

Target variable "Tool Current" dataset - Train features Shape: (4772, 71), Train target Shape: (4772,)
Target variable "Tool Current" dataset - Validation features Shape: (1737, 71), Validation target Shape: (1737,)
Target variable "Tool Current" dataset - Test features Shape: (899, 71), Test target Shape: (899,)


In [43]:
# Saving the 3 created datasets for later use
datasets = {
    "rb": {"train": (trd_rb, trl_rb), "valid": (vad_rb, val_rb), "test": (ted_rb, tel_rb)},
    "gl": {"train": (trd_gl, trl_gl), "valid": (vad_gl, val_gl), "test": (ted_gl, tel_gl)},
    "tc": {"train": (trd_tc, trl_tc), "valid": (vad_tc, val_tc), "test": (ted_tc, tel_tc)}
}

for key, splits in datasets.items():
    for split, (data, labels) in splits.items():
        utils.save_data_csv(data, f"data/processed/{key}/{split}", f"{split}_data_{key}.csv")
        utils.save_data_csv(labels, f"data/processed/{key}/{split}", f"{split}_labels_{key}.csv")

Dataset saved: data/processed/rb/train\train_data_rb.csv
Dataset saved: data/processed/rb/train\train_labels_rb.csv
Dataset saved: data/processed/rb/valid\valid_data_rb.csv
Dataset saved: data/processed/rb/valid\valid_labels_rb.csv
Dataset saved: data/processed/rb/test\test_data_rb.csv
Dataset saved: data/processed/rb/test\test_labels_rb.csv
Dataset saved: data/processed/gl/train\train_data_gl.csv
Dataset saved: data/processed/gl/train\train_labels_gl.csv
Dataset saved: data/processed/gl/valid\valid_data_gl.csv
Dataset saved: data/processed/gl/valid\valid_labels_gl.csv
Dataset saved: data/processed/gl/test\test_data_gl.csv
Dataset saved: data/processed/gl/test\test_labels_gl.csv
Dataset saved: data/processed/tc/train\train_data_tc.csv
Dataset saved: data/processed/tc/train\train_labels_tc.csv
Dataset saved: data/processed/tc/valid\valid_data_tc.csv
Dataset saved: data/processed/tc/valid\valid_labels_tc.csv
Dataset saved: data/processed/tc/test\test_data_tc.csv
Dataset saved: data/proce

#### Applying the time series SMOTE balancing

In [44]:
# Function to apply time series SMOTE on train data and train labels
def smote_time_series_balancing(train_data, train_labels, random_state=42):
    """Oversample the minority class with SMOTE and restore the 'Time Phase' order.

    SMOTE appends the synthetic samples after the original rows, so the
    balanced frame is re-sorted by 'Time Phase' afterwards.

    Args:
        train_data: Feature DataFrame; must contain a 'Time Phase' column.
        train_labels: Class labels as a pandas Series, numpy array or list.
        random_state: Seed forwarded to SMOTE so repeated runs produce the same
            synthetic samples. Pass None for unseeded behaviour.

    Returns:
        Tuple ``(balanced_data, balanced_labels)``: the resampled features
        sorted by 'Time Phase' and the matching labels as a Series named
        'Labels' that shares the features' index.

    Raises:
        RuntimeError: Wraps any validation or resampling error; the original
            exception is attached as ``__cause__``.
    """
    try:
        if not isinstance(train_data, pd.DataFrame):
            raise ValueError("train_data must be a pandas DataFrame.")
        if not isinstance(train_labels, (pd.Series, np.ndarray, list)):
            raise ValueError("train_labels must be a pandas Series, numpy array, or list.")
        # Checked before resampling so a missing column fails fast instead of after the expensive step
        if 'Time Phase' not in train_data.columns:
            raise ValueError("The dataset must contain 'Time Phase' column to sort.")

        smote = SMOTE(random_state=random_state)
        balanced_data, balanced_labels = smote.fit_resample(train_data, train_labels)
        balanced_data = pd.DataFrame(balanced_data, columns=train_data.columns)

        # Carry the labels as a temporary column so they are reordered together with the features
        balanced_data['Labels'] = balanced_labels

        # Stable sort keeps rows with an equal 'Time Phase' in their resampled order
        balanced_data = balanced_data.sort_values(by='Time Phase', kind='mergesort')

        balanced_labels = balanced_data.pop('Labels')
        return balanced_data, balanced_labels

    except Exception as e:
        raise RuntimeError(f"An error occurred during SMOTE analysis: {e}") from e

In [45]:
# Checking the class distribution in target variables before applying the time series SMOTE
print("Before time series SMOTE:")
print(f"Class distribution in 'Robot Protective Stop' target variable: \n{trl_rb.value_counts()}")
print(f"\nClass distribution in 'Grip Lost' target variable: \n{trl_gl.value_counts()}")

Before time series SMOTE:
Class distribution in 'Robot Protective Stop' target variable: 
Robot Protective Stop
0.0    4630
1.0     142
Name: count, dtype: int64

Class distribution in 'Grip Lost' target variable: 
Grip Lost
0    4609
1     163
Name: count, dtype: int64


In [46]:
print(f'"Robot Protective Stop" dataset - Features Shape: {trd_rb.shape}, Labels Shape: {trl_rb.shape}')
print(f'"Grip Lost" dataset - Features Shape: {trd_gl.shape}, Labels Shape: {trl_gl.shape}')

# Apply temporal SMOTE to the datasets
trd_rb, trl_rb = smote_time_series_balancing(trd_rb, trl_rb)
trd_gl, trl_gl = smote_time_series_balancing(trd_gl, trl_gl)

print(f'"Robot Protective Stop" dataset - Features Shape: {trd_rb.shape}, Labels Shape: {trl_rb.shape}')
print(f'"Grip Lost" dataset - Features Shape: {trd_gl.shape}, Labels Shape: {trl_gl.shape}')

"Robot Protective Stop" dataset - Features Shape: (4772, 71), Labels Shape: (4772,)
"Grip Lost" dataset - Features Shape: (4772, 71), Labels Shape: (4772,)
"Robot Protective Stop" dataset - Features Shape: (9260, 71), Labels Shape: (9260,)
"Grip Lost" dataset - Features Shape: (9218, 71), Labels Shape: (9218,)


In [47]:
# Checking the class distribution in target variables after applying the temporal SMOTE
print("\nAfter time series SMOTE:")
print(f"Class distribution in 'Robot Protective Stop' target variable: \n{trl_rb.value_counts()}")
print(f"\nClass distribution in 'Grip Lost' target variable: \n{trl_gl.value_counts()}")


After time series SMOTE:
Class distribution in 'Robot Protective Stop' target variable: 
Labels
0.0    4630
1.0    4630
Name: count, dtype: int64

Class distribution in 'Grip Lost' target variable: 
Labels
0    4609
1    4609
Name: count, dtype: int64


In [48]:
# Function to check for duplicates
def check_duplicates(data, labels):
    """Print how many rows of ``data`` are exact duplicates of an earlier row.

    Args:
        data: Feature DataFrame to inspect.
        labels: Accepted for call-site symmetry; not used by the check.

    Returns:
        None. The count and a one-line verdict are printed.
    """
    duplicate_count = data.duplicated().sum()
    verdict = (
        "Duplicates are present in the dataset."
        if duplicate_count > 0
        else "No duplicates found in the dataset."
    )

    print(f"Number of duplicate rows in features: {duplicate_count}")
    print(verdict)

In [49]:
# Checking if duplicates are present in the dataset
check_duplicates(trd_rb, trl_rb)
check_duplicates(trd_gl, trl_gl)

Number of duplicate rows in features: 0
No duplicates found in the dataset.
Number of duplicate rows in features: 0
No duplicates found in the dataset.


In [50]:
# Saving the datasets for later use
datasets = {
    "rb": (trd_rb, trl_rb),
    "gl": (trd_gl, trl_gl)
}

for key, (data, labels) in datasets.items():
    utils.save_data_csv(data, f"data/processed/{key}/train", f"balanced_train_data_{key}.csv")
    utils.save_data_csv(labels, f"data/processed/{key}/train", f"balanced_train_labels_{key}.csv")

Dataset saved: data/processed/rb/train\balanced_train_data_rb.csv
Dataset saved: data/processed/rb/train\balanced_train_labels_rb.csv
Dataset saved: data/processed/gl/train\balanced_train_data_gl.csv
Dataset saved: data/processed/gl/train\balanced_train_labels_gl.csv


#### Reordering the datasets

In [51]:
# Setting the new order for the datasets
# Every column must appear exactly once: the "_rolling_mean" row lists the means of T0 and J1..J5,
# and the "_rolling_std" row lists their standard deviations.
new_order = [
    'Time Phase', 'Cycle Time',
    *[f'Current_J{i}' for i in range(6)],
    *[f'Current_Direction_J{i}' for i in range(6)],
    *[f'Current_J{i}_rolling_mean' for i in range(6)],
    *[f'Current_J{i}_rolling_std' for i in range(6)],
    *[f'Speed_J{i}' for i in range(6)],
    *[f'Speed_Direction_J{i}' for i in range(6)],
    *[f'Speed_J{i}_rolling_mean' for i in range(6)],
    *[f'Speed_J{i}_rolling_std' for i in range(6)],
    'Temperature_T0', *[f'Temperature_J{i}' for i in range(1, 6)],
    'Temperature_T0_rolling_mean', *[f'Temperature_J{i}_rolling_mean' for i in range(1, 6)],
    'Temperature_T0_rolling_std', *[f'Temperature_J{i}_rolling_std' for i in range(1, 6)],
    'Average Temperature', 'Gradient Temperature', 'Load Imbalance'
]

In [52]:
# Reordering the datasets for easy readability
trd_rb = trd_rb[new_order]
vad_rb = vad_rb[new_order]
ted_rb = ted_rb[new_order]

trd_gl = trd_gl[new_order]
vad_gl = vad_gl[new_order]
ted_gl = ted_gl[new_order]

trd_tc = trd_tc[new_order]
vad_tc = vad_tc[new_order]
ted_tc = ted_tc[new_order]

In [53]:
# Saving the datasets for model training and evaluation
# NOTE(review): the files are written with a "scaled_" prefix, but no scaler is applied to these frames in the
# visible notebook (StandardScaler/MinMaxScaler are imported and never called) — TODO confirm whether scaling
# is expected to happen here or elsewhere.
datasets = {
    "rb": {"train": trd_rb, "valid": vad_rb, "test": ted_rb},
    "gl": {"train": trd_gl, "valid": vad_gl, "test": ted_gl},
    "tc": {"train": trd_tc, "valid": vad_tc, "test": ted_tc}
}

for key, splits in datasets.items():
    # The loop variable rebinds the notebook-level name `data` to the frame being saved
    for split, data in splits.items():
        utils.save_data_csv(data, f"data/processed/{key}/{split}", f"scaled_{split}_data_{key}.csv")

Dataset saved: data/processed/rb/train\scaled_train_data_rb.csv
Dataset saved: data/processed/rb/valid\scaled_valid_data_rb.csv
Dataset saved: data/processed/rb/test\scaled_test_data_rb.csv
Dataset saved: data/processed/gl/train\scaled_train_data_gl.csv
Dataset saved: data/processed/gl/valid\scaled_valid_data_gl.csv
Dataset saved: data/processed/gl/test\scaled_test_data_gl.csv
Dataset saved: data/processed/tc/train\scaled_train_data_tc.csv
Dataset saved: data/processed/tc/valid\scaled_valid_data_tc.csv
Dataset saved: data/processed/tc/test\scaled_test_data_tc.csv


#### Creating sequences from the datasets

In [54]:
# Loading the sequence length value which was calculated earlier
with open("data/processed/sequence_length.txt", "r") as file:
    sequence_length = int(file.read().strip())

In [55]:
# Creating sequences for training, validation, and test datasets
trd_rb, trl_rb = utils.create_sequences(trd_rb, trl_rb, sequence_length, "classification")
vad_rb, val_rb = utils.create_sequences(vad_rb, val_rb, sequence_length, "classification")
ted_rb, tel_rb = utils.create_sequences(ted_rb, tel_rb, sequence_length, "classification")

trd_gl, trl_gl = utils.create_sequences(trd_gl, trl_gl, sequence_length, "classification")
vad_gl, val_gl = utils.create_sequences(vad_gl, val_gl, sequence_length, "classification")
ted_gl, tel_gl = utils.create_sequences(ted_gl, tel_gl, sequence_length, "classification")

trd_tc, trl_tc = utils.create_sequences(trd_tc, trl_tc, sequence_length, "regression")
vad_tc, val_tc = utils.create_sequences(vad_tc, val_tc, sequence_length, "regression")
ted_tc, tel_tc = utils.create_sequences(ted_tc, tel_tc, sequence_length, "regression")

In [56]:
# Checking if the sequences are created properly or not
datasets = {
    'Robot Protective Stop': [trd_rb, trl_rb, vad_rb, val_rb, ted_rb, tel_rb],
    'Grip Lost': [trd_gl, trl_gl, vad_gl, val_gl, ted_gl, tel_gl],
    'Tool Current': [trd_tc, trl_tc, vad_tc, val_tc, ted_tc, tel_tc]
}

for label, (x_train, y_train, x_valid, y_valid, x_test, y_test) in datasets.items():
    print(f"Train data \"{label}\": {x_train.shape}, Train labels \"{label}\": {y_train.shape}")
    print(f"Valid data \"{label}\": {x_valid.shape}, Valid labels \"{label}\": {y_valid.shape}")
    print(f"Test data \"{label}\": {x_test.shape}, Test labels \"{label}\": {y_test.shape}")

Train data "Robot Protective Stop": (9230, 30, 71), Train labels "Robot Protective Stop": (9230,)
Valid data "Robot Protective Stop": (1707, 30, 71), Valid labels "Robot Protective Stop": (1707,)
Test data "Robot Protective Stop": (869, 30, 71), Test labels "Robot Protective Stop": (869,)
Train data "Grip Lost": (9188, 30, 71), Train labels "Grip Lost": (9188,)
Valid data "Grip Lost": (1707, 30, 71), Valid labels "Grip Lost": (1707,)
Test data "Grip Lost": (869, 30, 71), Test labels "Grip Lost": (869,)
Train data "Tool Current": (4742, 30, 71), Train labels "Tool Current": (4742,)
Valid data "Tool Current": (1707, 30, 71), Valid labels "Tool Current": (1707,)
Test data "Tool Current": (869, 30, 71), Test labels "Tool Current": (869,)


In [57]:
# Saving the numpy arrays for model training and evaluation
arrays = {
    "rb": {"train": (trd_rb, trl_rb), "valid": (vad_rb, val_rb), "test": (ted_rb, tel_rb)},
    "gl": {"train": (trd_gl, trl_gl), "valid": (vad_gl, val_gl), "test": (ted_gl, tel_gl)},
    "tc": {"train": (trd_tc, trl_tc), "valid": (vad_tc, val_tc), "test": (ted_tc, tel_tc)}
}

for key, splits in arrays.items():
    for split, data in splits.items():
        folder = f"data/processed/{key}/{split}/sequences"
        utils.save_sequences(data, folder, f"seq_{split}_data_{key}.npy", f"seq_{split}_labels_{key}.npy")

Saved sequences to: data\processed\rb\train\sequences
Saved sequences to: data\processed\rb\valid\sequences
Saved sequences to: data\processed\rb\test\sequences
Saved sequences to: data\processed\gl\train\sequences
Saved sequences to: data\processed\gl\valid\sequences
Saved sequences to: data\processed\gl\test\sequences
Saved sequences to: data\processed\tc\train\sequences
Saved sequences to: data\processed\tc\valid\sequences
Saved sequences to: data\processed\tc\test\sequences


In [58]:
# Freeing up memory
gc.collect()

0