In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df: pd.DataFrame = pd.read_csv(
    'data/processed_data_cleaned.csv',
    parse_dates=["Time"],
    index_col=[0],
    sep=';'
)

In [3]:
df['time'] = df.index
df.reset_index(drop=True, inplace=True)

In [10]:
def analyze_motor_breaks(df, motor_column='corrente_motor', threshold=1500):
    """
    Analyze breaks in the motor process based on the 'corrente_do_motor' column.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    motor_column (str): The column name for motor current. Default is 'corrente_do_motor'.
    threshold (float): The threshold below which the motor is considered off. Default is 1500.

    Returns:
    pd.DataFrame: A DataFrame with the start and end times of each motor-off period.
    """
    # Create a boolean mask where the motor is off
    motor_off = df[motor_column] < threshold

    # Find the transitions (where the motor state changes)
    transitions = motor_off.ne(motor_off.shift())

    # Assign a group number to each continuous motor-off period
    group_ids = transitions.cumsum()

    # Filter only the motor-off periods
    motor_off_periods = df[motor_off]

    # Group by the group IDs and get the start and end times of each motor-off period
    motor_off_analysis = motor_off_periods.groupby(group_ids[motor_off]).agg(
        start_index=('time', 'min'),
        end_index=('time', 'max')
    ).reset_index(drop=True)

    return motor_off_analysis

def get_pre_break_windows(df, motor_off_analysis, window_size=20):
    """
    Get windows of data before each motor break.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    motor_off_analysis (pd.DataFrame): The DataFrame with start and end indices of motor-off periods.
    window_size (int): The number of samples to include before each break. Default is 20.

    Returns:
    list: A list of DataFrames, each containing the window of data before a break.
    """
    pre_break_windows = []

    for start_index in motor_off_analysis['start_index']:
        # Calculate the start of the window
        window_start = max(0, start_index - window_size)  # Ensure we don't go below index 0
        window_end = start_index

        # Extract the window of data
        window = df.iloc[window_start:window_end]
        pre_break_windows.append(window)

    return pre_break_windows

In [11]:
def prepare_window_data(windows):
    """
    Prepare the raw window data for clustering.

    Parameters:
    windows (list of pd.DataFrame): List of DataFrames, each representing a window.

    Returns:
    np.array: A 2D array where each row is a flattened window.
    """
    # Flatten each window into a 1D array
    window_vectors = [window.values.flatten() for window in windows]
    return np.array(window_vectors)


def cluster_windows_raw(window_vectors, n_clusters=3):
    """
    Cluster the windows using raw data.

    Parameters:
    window_vectors (np.array): 2D array where each row is a flattened window.
    n_clusters (int): Number of clusters to create. Default is 3.

    Returns:
    np.array: Cluster labels for each window.
    """
    # Standardize the data
    scaler = StandardScaler()
    window_vectors_scaled = scaler.fit_transform(window_vectors)

    # Apply K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(window_vectors_scaled)

    return labels


def visualize_clusters_raw(window_vectors, labels):
    """
    Visualize the clusters using PCA for dimensionality reduction.

    Parameters:
    window_vectors (np.array): 2D array where each row is a flattened window.
    labels (np.array): Cluster labels for each window.
    """
    # Standardize the data
    scaler = StandardScaler()
    window_vectors_scaled = scaler.fit_transform(window_vectors)

    # Reduce dimensionality to 2D using PCA
    pca = PCA(n_components=2)
    window_vectors_2d = pca.fit_transform(window_vectors_scaled)

    # Plot the clusters
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(window_vectors_2d[:, 0], window_vectors_2d[:, 1], c=labels, cmap='viridis', edgecolor='k')
    plt.colorbar(scatter, label='Cluster')
    plt.title('Clustering of Pre-Break Windows (Raw Data)')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.show()


In [None]:
# Analyze motor breaks
motor_off_analysis = analyze_motor_breaks(df)

# Get windows of 20 samples before each break
pre_break_windows = get_pre_break_windows(df, motor_off_analysis, window_size=20)

# Prepare the raw window data for clustering
window_vectors = prepare_window_data(pre_break_windows)

# Cluster the windows using raw data
labels = cluster_windows_raw(window_vectors, n_clusters=3)

# Add cluster labels to the windows
for i, window in enumerate(pre_break_windows):
    window['cluster_label'] = labels[i]

# Visualize the clusters
visualize_clusters_raw(window_vectors, labels)

# Print the windows with cluster labels
for i, window in enumerate(pre_break_windows):
    print(f"Window {i + 1} (Cluster {labels[i]}):")
    print("\n")

TypeError: cannot do positional indexing on RangeIndex with these indexers [2023-08-01 04:53:40] of type Timestamp