In [3]:
# -*- coding: utf-8 -*-
# ============================================================
# SKRIP EKSTRAKSI FITUR MANUAL EMG + EXPORT KE CSV
# - Struktur kolom disesuaikan agar lebih mudah dibaca.
# ============================================================
import numpy as np
import pandas as pd
from scipy import signal
import os
import zipfile
import io

# ------------------------------------------------------------
# MSC (Magnitude-Squared Coherence) antar channel
# ------------------------------------------------------------
def compute_msc(emg_data, channel_names, fs=1000, nperseg=500):
    """
    Compute the mean magnitude-squared coherence (MSC) of every channel pair.

    For each unordered channel pair (first index < second index) the coherence
    spectrum is estimated with ``scipy.signal.coherence`` and averaged over
    all frequency bins, giving one scalar feature per window and pair.

    Parameters
    ----------
    emg_data : array-like, shape (n_samples, n_timesteps, n_channels)
        Windowed signals.
    channel_names : sequence of str
        One name per channel, used to build the feature names.
    fs : float, optional
        Sampling frequency forwarded to SciPy.
    nperseg : int or None, optional
        Welch segment length. It is clamped to ``n_timesteps`` so that short
        windows do not trigger a SciPy warning on every call.
        NOTE: when a window holds only a single segment the coherence
        estimate is 1 at every frequency, so use windows that are clearly
        longer than ``nperseg``.

    Returns
    -------
    msc_features : ndarray, shape (n_samples, n_channels * (n_channels - 1) // 2)
        Mean coherence per window (rows) and channel pair (columns).
    list_of_msc_names : list of str
        Column names of the form ``MSC_<ch1>_<ch2>``, in column order.

    Raises
    ------
    ValueError
        If ``channel_names`` does not contain exactly one name per channel.
    """
    emg_data = np.asarray(emg_data)
    n_samples, n_timesteps, n_channels = emg_data.shape
    if len(channel_names) != n_channels:
        raise ValueError(
            f"channel_names has {len(channel_names)} entries but emg_data "
            f"has {n_channels} channels"
        )

    # Clamp the segment length ourselves instead of letting SciPy warn.
    if nperseg is not None and 0 < n_timesteps < nperseg:
        nperseg = n_timesteps

    pairs = [
        (first, second)
        for first in range(n_channels)
        for second in range(first + 1, n_channels)
    ]
    list_of_msc_names = [
        f"MSC_{channel_names[first]}_{channel_names[second]}"
        for first, second in pairs
    ]
    msc_features = np.zeros((n_samples, len(pairs)))
    if n_samples == 0:
        return msc_features, list_of_msc_names

    for pair_idx, (first, second) in enumerate(pairs):
        # One call handles every window of this pair (axis=1 is time).
        # A flat channel has zero spectral power and produces 0/0 bins;
        # silence the warning and map those bins to 0 below.
        with np.errstate(divide='ignore', invalid='ignore'):
            _, cxy = signal.coherence(
                emg_data[:, :, first],
                emg_data[:, :, second],
                fs=fs,
                nperseg=nperseg,
                axis=1,
            )
        # Undefined coherence (no power) is reported as 0, matching the
        # zero-signal convention of the frequency-domain features.
        cxy = np.nan_to_num(cxy, nan=0.0, posinf=0.0, neginf=0.0)
        msc_features[:, pair_idx] = cxy.mean(axis=1)

    return msc_features, list_of_msc_names

# ------------------------------------------------------------
# TIME-DOMAIN FEATURES (7 fitur per channel)
# ------------------------------------------------------------
def extract_time_domain_features(emg_data, channel_names):
    """
    Extract seven time-domain features per channel from windowed EMG data.

    Features, in column order for each channel:

    * MAV - mean absolute value
    * RMS - root mean square
    * VAR - variance
    * SSI - simple square integral (sum of squares)
    * ZC  - zero crossings, divided by ``n_timesteps``
    * SSC - slope sign changes, divided by ``n_timesteps``
    * WL  - waveform length (sum of absolute first differences)

    Parameters
    ----------
    emg_data : array-like, shape (n_samples, n_timesteps, n_channels)
        Windowed signals. Values are converted to float so that integer
        input cannot overflow when squared.
    channel_names : sequence of str
        One name per channel, used to build the feature names.

    Returns
    -------
    features : ndarray, shape (n_samples, n_channels * 7)
        Channel-major layout: the 7 features of channel 0, then channel 1, ...
    list_of_time_names : list of str
        Column names of the form ``Time_<ABBR>_<channel>``, in column order.

    Raises
    ------
    ValueError
        If ``channel_names`` does not contain exactly one name per channel
        (otherwise names and feature columns would silently disagree).
    """
    emg_data = np.asarray(emg_data, dtype=float)
    n_samples, n_timesteps, n_channels = emg_data.shape
    if len(channel_names) != n_channels:
        raise ValueError(
            f"channel_names has {len(channel_names)} entries but emg_data "
            f"has {n_channels} channels"
        )

    FEATURE_ABBR = ['MAV', 'RMS', 'VAR', 'SSI', 'ZC', 'SSC', 'WL']
    n_feats_per_ch = len(FEATURE_ABBR)
    list_of_time_names = [
        f"Time_{abbr}_{ch_name}"
        for ch_name in channel_names
        for abbr in FEATURE_ABBR
    ]

    # Every statistic is reduced over the time axis (axis=1) for all windows
    # and channels at once; each result has shape (n_samples, n_channels).
    squared = emg_data ** 2
    slopes = np.diff(emg_data, axis=1)

    mav = np.abs(emg_data).mean(axis=1)
    rms = np.sqrt(squared.mean(axis=1))
    var = emg_data.var(axis=1)
    ssi = squared.sum(axis=1)

    # Zero crossings: an exact-zero sample inherits the sign of the last
    # non-zero sample, so touching zero ([1, 0, 1]) is not a crossing and
    # passing through zero ([1, 0, -1]) counts once rather than twice.
    signs = np.sign(emg_data)
    time_index = np.arange(n_timesteps)[np.newaxis, :, np.newaxis]
    last_nonzero = np.maximum.accumulate(
        np.where(signs != 0, time_index, 0), axis=1
    )
    held_signs = np.take_along_axis(signs, last_nonzero, axis=1)
    zc = np.sum(held_signs[:, :-1, :] * held_signs[:, 1:, :] < 0, axis=1) / n_timesteps

    # Slope sign changes: consecutive first differences of opposite sign.
    ssc = np.sum(slopes[:, :-1, :] * slopes[:, 1:, :] < 0, axis=1) / n_timesteps
    wl = np.abs(slopes).sum(axis=1)

    # (n_samples, n_channels, 7) -> (n_samples, n_channels * 7), channel-major.
    stacked = np.stack([mav, rms, var, ssi, zc, ssc, wl], axis=-1)
    features = stacked.reshape(n_samples, n_channels * n_feats_per_ch)

    return features, list_of_time_names

# ------------------------------------------------------------
# FREQUENCY-DOMAIN FEATURES (6 fitur per channel)
# ------------------------------------------------------------
def extract_frequency_domain_features(emg_data, channel_names, fs=1000, nperseg=256, band=(10, 50)):
    """
    Extract six frequency-domain features per channel from windowed EMG data.

    The power spectral density (PSD) of every window/channel is estimated
    with ``scipy.signal.welch``. Features, in column order for each channel:

    * MNF - mean frequency (PSD-weighted average of the frequency bins)
    * MDF - median frequency (first bin where the cumulative PSD reaches
      half of the total)
    * PKF - peak frequency (bin with the largest PSD value)
    * MNP - mean power (mean of the PSD bins)
    * TTP - total power (sum of the PSD bins)
    * BPr - band power ratio (PSD sum inside ``band`` divided by TTP)

    Windows whose PSD sums to exactly zero get 0.0 for all six features.

    Parameters
    ----------
    emg_data : array-like, shape (n_samples, n_timesteps, n_channels)
        Windowed signals.
    channel_names : sequence of str
        One name per channel, used to build the feature names.
    fs : float, optional
        Sampling frequency forwarded to SciPy.
    nperseg : int, optional
        Welch segment length; clamped to ``n_timesteps``. Default 256.
    band : tuple of (low, high), optional
        Inclusive frequency band (same unit as ``fs``) used for BPr.
        Default ``(10, 50)``.

    Returns
    -------
    features : ndarray, shape (n_samples, n_channels * 6)
        Channel-major layout: the 6 features of channel 0, then channel 1, ...
    list_of_freq_names : list of str
        Column names of the form ``Freq_<ABBR>_<channel>``, in column order.

    Raises
    ------
    ValueError
        If ``channel_names`` does not contain exactly one name per channel
        (otherwise names and feature columns would silently disagree).
    """
    emg_data = np.asarray(emg_data)
    n_samples, n_timesteps, n_channels = emg_data.shape
    if len(channel_names) != n_channels:
        raise ValueError(
            f"channel_names has {len(channel_names)} entries but emg_data "
            f"has {n_channels} channels"
        )

    FEATURE_ABBR = ['MNF', 'MDF', 'PKF', 'MNP', 'TTP', 'BPr']
    n_feats_per_ch = len(FEATURE_ABBR)
    list_of_freq_names = [
        f"Freq_{abbr}_{ch_name}"
        for ch_name in channel_names
        for abbr in FEATURE_ABBR
    ]
    if n_samples == 0:
        return np.zeros((0, n_channels * n_feats_per_ch)), list_of_freq_names

    # One Welch call for all windows and channels; time is axis 1, so the
    # PSD has shape (n_samples, n_freqs, n_channels).
    f, pxx = signal.welch(emg_data, fs=fs, nperseg=min(nperseg, n_timesteps), axis=1)
    freqs = f[np.newaxis, :, np.newaxis]

    total_power = pxx.sum(axis=1)
    has_power = total_power != 0
    # Divide by 1 where there is no power; those entries are zeroed below.
    safe_total = np.where(has_power, total_power, 1.0)

    mnf = (freqs * pxx).sum(axis=1) / safe_total

    # argmax returns the first True along the frequency axis, i.e. the first
    # bin at which the cumulative power reaches half of the total.
    half_reached = np.cumsum(pxx, axis=1) >= (total_power / 2)[:, np.newaxis, :]
    mdf = f[half_reached.argmax(axis=1)]

    pkf = f[pxx.argmax(axis=1)]
    mnp = pxx.mean(axis=1)

    band_low, band_high = band
    in_band = (f >= band_low) & (f <= band_high)
    bpr = pxx[:, in_band, :].sum(axis=1) / safe_total

    # (n_samples, n_channels, 6) -> (n_samples, n_channels * 6), channel-major.
    stacked = np.stack(
        [
            np.where(has_power, values, 0.0)
            for values in (mnf, mdf, pkf, mnp, total_power, bpr)
        ],
        axis=-1,
    )
    features = stacked.reshape(n_samples, n_channels * n_feats_per_ch)

    return features, list_of_freq_names

# ------------------------------------------------------------
# PREPARE DATA DENGAN META (untuk windowing)
# ------------------------------------------------------------
def prepare_data_with_meta(df, channels, label_col='group', meta_cols=None, window_size=1000, step_size=1000):
    """
    Cut every recording in ``df`` into fixed-length multi-channel windows.

    Each row of ``df`` is one recording whose ``channels`` columns hold a
    1-D sequence of samples. Channels of one row are truncated to the length
    of the shortest one, then windows of ``window_size`` samples are taken
    every ``step_size`` samples. The row's label and metadata are repeated
    for each of its windows. Rows with an empty channel are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording per row.
    channels : sequence of str
        Column names holding the per-channel sample sequences.
    label_col : str, optional
        Column holding the label copied to every window.
    meta_cols : sequence of str or None, optional
        Columns copied into the per-window metadata dicts.
    window_size : int, optional
        Samples per window; must be positive.
    step_size : int, optional
        Hop between window starts; must be positive.

    Returns
    -------
    X : ndarray, shape (n_windows, window_size, n_channels)
        Also three-dimensional (with ``n_windows == 0``) when no window
        could be formed, so callers can always unpack ``X.shape``.
    y : ndarray, shape (n_windows,)
    meta_list : list of dict
        One ``{column: value}`` dict per window.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is not positive.
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")
    if meta_cols is None:
        meta_cols = []

    X, y = [], []
    meta_list = []

    for _, row in df.iterrows():
        # Keep the channels as a list of 1-D arrays: they may differ in
        # length, and NumPy >= 1.24 refuses to build a ragged 2-D array.
        channel_signals = [np.asarray(row[ch]) for ch in channels]

        if any(len(sig) == 0 for sig in channel_signals):
            continue

        min_len = min(len(sig) for sig in channel_signals)

        for start in range(0, min_len - window_size + 1, step_size):
            stop = start + window_size
            # Stack along axis 1 so the window is (time, channels).
            X.append(np.stack([sig[start:stop] for sig in channel_signals], axis=1))
            y.append(row[label_col])
            meta_list.append({col: row.get(col) for col in meta_cols})

    if not X:
        return np.empty((0, window_size, len(channels))), np.array(y), meta_list
    return np.array(X), np.array(y), meta_list

# ------------------------------------------------------------
# FUNGSI UTAMA EKSTRAKSI + SIMPAN CSV (Disesuaikan)
# ------------------------------------------------------------
def extract_and_save_emg_features(df, channels, fs=1000, label_col='group', meta_cols=None,
                                  window_size=1000, step_size=1000,
                                  out_csv='emg_features_structured.csv', decimals=5):
    """
    Window the recordings, extract all features and write them to a CSV file.

    Pipeline: windowing via ``prepare_data_with_meta``; time-domain,
    frequency-domain and MSC features via the corresponding extractors;
    assembly into one DataFrame whose columns are the metadata columns, the
    label column and then the features; export with ``DataFrame.to_csv``.
    Progress is reported with ``print``.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording per row (see ``prepare_data_with_meta``).
    channels : sequence of str
        Columns holding the per-channel signals. Known long muscle/side
        names are shortened (e.g. ``RightLatissimusDorsi`` -> ``RLD``) for
        the feature column names.
    fs : float, optional
        Sampling frequency passed to the spectral extractors.
    label_col : str, optional
        Label column copied to every window.
    meta_cols : sequence of str or None, optional
        Metadata columns copied to every window.
    window_size, step_size : int, optional
        Window length and hop, in samples.
    out_csv : str, optional
        Destination path of the CSV file.
    decimals : int or None, optional
        Number of decimals the feature columns are rounded to (default 5).
        Pass ``None`` to keep full precision, e.g. when signal amplitudes
        are small enough that variance or power would round to 0. Only
        feature columns are rounded; label and metadata stay untouched.

    Returns
    -------
    pandas.DataFrame
        One row per window, identical to what is written to ``out_csv``.

    Raises
    ------
    ValueError
        If no window could be formed from ``df``.
    """
    # 1. Windowing + metadata
    print("[INFO] Membentuk window EMG dan mengumpulkan metadata...")
    X, y, meta_list = prepare_data_with_meta(
        df, channels, label_col=label_col, meta_cols=meta_cols,
        window_size=window_size, step_size=step_size
    )
    X = np.asarray(X)
    if X.ndim != 3 or X.shape[0] == 0:
        # Fail with a clear message instead of a cryptic shape-unpack error.
        raise ValueError(
            "No EMG windows could be formed: check that the signals are "
            f"non-empty and at least window_size={window_size} samples long"
        )
    n_windows, n_timesteps, n_channels = X.shape
    print(f"[INFO] Total windows: {n_windows}, Time steps: {n_timesteps}, Channels: {n_channels}")

    # 2. Feature extraction; replacements are applied in this order.
    abbreviations = (
        ('Right', 'R'),
        ('Left', 'L'),
        ('LatissimusDorsi', 'LD'),
        ('C4Paraspinal', 'C4P'),
        ('Sternocleidomastoidcaputlateralis', 'SCM'),
        ('Trapeziusdescendens', 'TD'),
    )
    channel_names_abbr = []
    for ch in channels:
        short_name = ch
        for long_form, short_form in abbreviations:
            short_name = short_name.replace(long_form, short_form)
        channel_names_abbr.append(short_name)

    print("[INFO] Ekstraksi fitur time-domain...")
    time_feats, time_feat_names = extract_time_domain_features(X, channel_names_abbr)

    print("[INFO] Ekstraksi fitur frequency-domain...")
    freq_feats, freq_feat_names = extract_frequency_domain_features(X, channel_names_abbr, fs=fs)

    print("[INFO] Ekstraksi fitur MSC (koherensi antar kanal)...")
    # Keep the 500-sample coherence segment but never exceed the window.
    msc_feats, msc_feat_names = compute_msc(
        X, channel_names_abbr, fs=fs, nperseg=min(500, n_timesteps)
    )

    print("[INFO] Menggabungkan semua fitur...")
    all_features = np.concatenate([time_feats, freq_feats, msc_feats], axis=1)
    feature_names = time_feat_names + freq_feat_names + msc_feat_names

    n_total_feats = all_features.shape[1]
    print(f"[INFO] Dimensi fitur akhir: {n_windows} windows x {n_total_feats} fitur")

    # 3. Assemble the result: metadata, label, then features.
    feature_frame = pd.DataFrame(all_features, columns=feature_names)
    if decimals is not None:
        feature_frame = feature_frame.round(decimals)

    if meta_cols is not None and len(meta_cols) > 0:
        lead_frame = pd.DataFrame(meta_list, index=range(n_windows))
    else:
        lead_frame = pd.DataFrame(index=range(n_windows))
    # If the label is also listed in meta_cols this overwrites it in place.
    lead_frame[label_col] = y

    df_features = pd.concat([lead_frame, feature_frame], axis=1)

    # 4. Export
    df_features.to_csv(out_csv, index=False)
    print(f"[INFO] Fitur EMG berhasil tersimpan ke: {out_csv}")

    return df_features

# ============================================================================
# CONTOH PENGGUNAAN / SIMULASI DATA (HARAP DISESUAIKAN)
# ============================================================================

def create_mock_dataframe():
    """
    Build a synthetic 8-channel EMG DataFrame for demonstration runs.

    The global NumPy random generator is seeded with 42, so repeated calls
    return identical data. There are 20 subjects: even loop indices get
    group 1, odd ones group 0; the first ten rows are labelled ``Curv_CCW``
    and the rest ``Rect_RECT``. Each channel holds 20000 samples of Gaussian
    noise (slightly wider for group 1) plus, for group 0 only, a linear
    ramp from 0 to 0.1.

    Returns
    -------
    df_mock : pandas.DataFrame
        Columns ``subject_number``, ``group``, ``condition`` followed by
        one list-valued column per channel.
    channels : list of str
        The eight channel column names.
    """
    np.random.seed(42)
    n_subjects = 20
    signal_length = 20000

    muscles = (
        'LatissimusDorsi',
        'C4Paraspinal',
        'Sternocleidomastoidcaputlateralis',
        'Trapeziusdescendens',
    )
    # Right/left of each muscle are adjacent, right side first.
    channels = [f"{side}{muscle}" for muscle in muscles for side in ('Right', 'Left')]

    ramp = np.linspace(0, 0.1, signal_length)

    records = []
    for subject_idx in range(n_subjects):
        group = 0 if subject_idx % 2 else 1
        record = {
            'subject_number': subject_idx + 1,
            'group': group,
            'condition': 'Rect_RECT' if subject_idx >= 10 else 'Curv_CCW',
        }

        noise_scale = 0.01 + 0.001 * group
        drift = ramp * (1 - group)  # all zeros for group 1
        for ch in channels:
            # One draw per channel, in channel order, keeps the random
            # stream reproducible.
            samples = np.random.normal(0, noise_scale, signal_length) + drift
            record[ch] = samples.tolist()
        records.append(record)

    df_mock = pd.DataFrame(records)
    df_mock['group'] = df_mock['group'].astype(int)
    return df_mock, channels

# --- BLOK EKSEKUSI ---
if __name__ == '__main__':
    # 1. Build the mock dataset (replace with your real data loading) and
    #    keep only the recordings whose condition contains 'Curv'.
    df_raw, channels = create_mock_dataframe()
    curvilinear_mask = df_raw['condition'].str.contains('Curv')
    df_curvilinear = df_raw[curvilinear_mask].reset_index(drop=True)

    # 2. Extraction parameters
    WINDOW_SIZE, STEP_SIZE = 1000, 500
    FS = 1000
    LABEL_COL = 'group'
    META_COLS = ['subject_number', 'condition']
    OUTPUT_CSV = 'emg_features_structured_preview.csv'

    # 3. Run the extraction; the features are also written to OUTPUT_CSV.
    df_features = extract_and_save_emg_features(
        df_curvilinear,
        channels,
        fs=FS,
        label_col=LABEL_COL,
        meta_cols=META_COLS,
        window_size=WINDOW_SIZE,
        step_size=STEP_SIZE,
        out_csv=OUTPUT_CSV,
    )

    print("\n[INFO] Preview 5 Baris Pertama Hasil Fitur (Kolom Terstruktur):")
    # Show the first five rows of the first ten columns only.
    preview = df_features.iloc[:, :10].head()
    print(preview.to_markdown(index=False))


[INFO] Membentuk window EMG dan mengumpulkan metadata...
[INFO] Total windows: 390, Time steps: 1000, Channels: 8
[INFO] Ekstraksi fitur time-domain...
[INFO] Ekstraksi fitur frequency-domain...
[INFO] Ekstraksi fitur MSC (koherensi antar kanal)...
[INFO] Menggabungkan semua fitur...
[INFO] Dimensi fitur akhir: 390 windows x 132 fitur
[INFO] Fitur EMG berhasil tersimpan ke: emg_features_structured_preview.csv

[INFO] Preview 5 Baris Pertama Hasil Fitur (Kolom Terstruktur):
|   subject_number | condition   |   group |   Time_MAV_RLD |   Time_RMS_RLD |   Time_VAR_RLD |   Time_SSI_RLD |   Time_ZC_RLD |   Time_SSC_RLD |   Time_WL_RLD |
|-----------------:|:------------|--------:|---------------:|---------------:|---------------:|---------------:|--------------:|---------------:|--------------:|
|                1 | Curv_CCW    |       1 |        0.00857 |        0.01077 |        0.00012 |        0.11595 |         0.508 |          0.677 |       12.2223 |
|                1 | Curv_CCW    |  