In [2]:
# -*- coding: utf-8 -*-
# ============================================================
# EKSTRAKSI FITUR EMG + EXPORT CSV
# - Menggunakan data asli dari /content/Neck_EMG_Extracted/Neck_EMG_Extracted
# - Output 3 CSV: Curvilinear, Rectilinear, Combined
# - Kelas: CNP vs CONT
# ============================================================
import numpy as np
import pandas as pd
from scipy import signal
import os
import re
# ------------------------------------------------------------
# MSC (Magnitude-Squared Coherence) antar channel
# ------------------------------------------------------------
def compute_msc(emg_data, channel_names, fs=1000, nperseg=500):
    """Return the mean magnitude-squared coherence of every channel pair.

    Args:
        emg_data: Array-like converted with ``np.asarray``; its shape is
            unpacked as (windows, time steps, channels).
        channel_names: Sequence indexed by channel position; used only to
            build the feature names.
        fs: Sampling frequency forwarded to ``scipy.signal.coherence``.
        nperseg: Segment length forwarded to ``scipy.signal.coherence``.

    Returns:
        A tuple ``(msc_features, names)``. ``msc_features`` has one row per
        window and one column per unordered channel pair (first index lower
        than second index), holding the coherence averaged over all frequency
        bins. ``names`` holds the matching ``"MSC_<first>_<second>"`` labels.
    """
    windows = np.asarray(emg_data)
    n_windows, _, n_ch = windows.shape

    # Upper-triangle indices enumerate the pairs (0,1), (0,2), ..., (1,2), ...
    first_idx, second_idx = np.triu_indices(n_ch, k=1)
    pairs = list(zip(first_idx.tolist(), second_idx.tolist()))

    names = [f"MSC_{channel_names[a]}_{channel_names[b]}" for a, b in pairs]

    msc = np.zeros((n_windows, len(pairs)))
    for col, (a, b) in enumerate(pairs):
        for w in range(n_windows):
            _, cxy = signal.coherence(
                windows[w, :, a], windows[w, :, b], fs=fs, nperseg=nperseg
            )
            msc[w, col] = np.mean(cxy)

    return msc, names

# ------------------------------------------------------------
# TIME-DOMAIN FEATURES (7 fitur per channel)
# ------------------------------------------------------------
def extract_time_domain_features(emg_data, channel_names):
    """Compute seven time-domain features for every window and channel.

    Args:
        emg_data: Array-like converted with ``np.asarray``; its shape is
            unpacked as (windows, time steps, channels).
        channel_names: Iterable of channel labels used to build column names.

    Returns:
        A tuple ``(features, names)``. ``features`` has one row per window and
        seven consecutive columns per channel, in the order MAV, RMS, VAR,
        SSI, ZC, SSC, WL. ``names`` lists ``"Time_<abbr>_<channel>"`` labels,
        channel-major, in the same order.
    """
    windows = np.asarray(emg_data)
    n_windows, window_len, n_ch = windows.shape
    abbreviations = ('MAV', 'RMS', 'VAR', 'SSI', 'ZC', 'SSC', 'WL')
    width = len(abbreviations)

    column_names = [
        f"Time_{abbr}_{name}"
        for name in channel_names
        for abbr in abbreviations
    ]

    table = np.zeros((n_windows, n_ch * width))
    for ch in range(n_ch):
        offset = ch * width
        for w in range(n_windows):
            x = windows[w, :, ch]
            squared = x ** 2
            steps = np.diff(x)
            table[w, offset:offset + width] = (
                np.mean(np.abs(x)),                                  # MAV
                np.sqrt(np.mean(squared)),                           # RMS
                np.var(x),                                           # VAR
                np.sum(squared),                                     # SSI
                # ZC: sign changes between neighbours, per time step.
                np.sum(np.diff(np.sign(x)) != 0) / window_len,
                # SSC: consecutive first differences of opposite sign,
                # per time step.
                np.sum((steps[:-1] * steps[1:]) < 0) / window_len,
                np.sum(np.abs(steps)),                               # WL
            )

    return table, column_names

# ------------------------------------------------------------
# FREQUENCY-DOMAIN FEATURES (6 fitur per channel)
# ------------------------------------------------------------
def extract_frequency_domain_features(emg_data, channel_names, fs=1000):
    """Compute six Welch-PSD features for every window and channel.

    The power spectral density is estimated with ``scipy.signal.welch`` using
    a segment length of ``min(256, n_timesteps)``. From the PSD bins the
    following values are derived:

    * MNF - power-weighted mean frequency.
    * MDF - first frequency at which the cumulative power reaches half of
      the total.
    * PKF - frequency of the largest PSD bin.
    * MNP - mean of the PSD bins.
    * TTP - sum of the PSD bins.
    * BPr - share of TTP that lies in the 10-50 Hz band (inclusive).

    Args:
        emg_data: Array-like converted with ``np.asarray``; its shape is
            unpacked as (windows, time steps, channels).
        channel_names: Iterable of channel labels used to build column names.
        fs: Sampling frequency forwarded to ``scipy.signal.welch``.

    Returns:
        A tuple ``(features, names)``. ``features`` has one row per window and
        six consecutive columns per channel in the order listed above.
        ``names`` lists ``"Freq_<abbr>_<channel>"`` labels, channel-major.
        A window whose total power is exactly zero yields all zeros; a window
        whose total power is not finite (for example because the signal
        contains NaN) yields all NaN instead of raising.
    """
    emg_data = np.asarray(emg_data)
    n_samples, n_timesteps, n_channels = emg_data.shape
    FEATURE_ABBR = ['MNF', 'MDF', 'PKF', 'MNP', 'TTP', 'BPr']
    n_feats_per_ch = len(FEATURE_ABBR)
    features = np.zeros((n_samples, n_channels * n_feats_per_ch))

    list_of_freq_names = [
        f"Freq_{abbr}_{ch_name}"
        for ch_name in channel_names
        for abbr in FEATURE_ABBR
    ]

    # The segment length does not depend on the window or channel.
    nperseg = min(256, n_timesteps)

    for ch_idx in range(n_channels):
        base = ch_idx * n_feats_per_ch
        for i in range(n_samples):
            sig = emg_data[i, :, ch_idx]
            f, Pxx = signal.welch(sig, fs=fs, nperseg=nperseg)
            total_power = np.sum(Pxx)

            if not np.isfinite(total_power):
                # A NaN/inf PSD has no median bin; looking it up would raise
                # IndexError and abort the whole extraction. Mark the window
                # as invalid instead so the caller can filter it.
                features[i, base:base + n_feats_per_ch] = np.nan
                continue

            if total_power == 0:
                # Row is already zero-initialised.
                continue

            cumulative = np.cumsum(Pxx)
            median_idx = np.where(cumulative >= total_power / 2)[0][0]
            band_mask = (f >= 10) & (f <= 50)

            features[i, base + 0] = np.sum(f * Pxx) / total_power   # MNF
            features[i, base + 1] = f[median_idx]                   # MDF
            features[i, base + 2] = f[np.argmax(Pxx)]               # PKF
            features[i, base + 3] = np.mean(Pxx)                    # MNP
            features[i, base + 4] = total_power                     # TTP
            features[i, base + 5] = np.sum(Pxx[band_mask]) / total_power  # BPr

    return features, list_of_freq_names

# ------------------------------------------------------------
# PREPARE DATA DENGAN META (untuk windowing)
# ------------------------------------------------------------
def prepare_data_with_meta(df, channels, label_col='group', meta_cols=None, window_size=1000, step_size=1000):
    """Cut every row's channel signals into fixed-length windows.

    Each row of ``df`` is expected to hold one sequence per entry of
    ``channels``. The channels of a row may differ in length; windows are
    taken only from the common prefix (the shortest channel). Rows in which
    any channel is empty are skipped.

    Args:
        df: DataFrame with one signal sequence per channel column.
        channels: Column names of the signals, in output channel order.
        label_col: Column whose value is copied to every window of the row.
        meta_cols: Optional column names copied into the per-window metadata.
        window_size: Number of time steps per window.
        step_size: Offset between the starts of consecutive windows.

    Returns:
        A tuple ``(X, y, meta_list)``. ``X`` stacks the windows as
        (windows, time steps, channels), ``y`` holds the label of each window
        and ``meta_list`` holds one ``{column: value}`` dict per window. When
        no window is produced, ``X`` and ``y`` are empty 1-D arrays.
    """
    if meta_cols is None:
        meta_cols = []
    X, y, meta_list = [], [], []

    for _, row in df.iterrows():
        # Keep the channels as a list of 1-D arrays. Stacking them into a
        # single array here fails on recent NumPy when the channels differ in
        # length, which is exactly the case the min_len logic below handles.
        channel_signals = [np.asarray(row[ch]) for ch in channels]

        if any(len(sig) == 0 for sig in channel_signals):
            continue

        min_len = min(len(sig) for sig in channel_signals)

        for start in range(0, min_len - window_size + 1, step_size):
            stop = start + window_size
            window = np.array([sig[start:stop] for sig in channel_signals])
            X.append(window.T)  # (time, channels)
            y.append(row[label_col])
            meta_list.append({col: row.get(col) for col in meta_cols})

    return np.array(X), np.array(y), meta_list

# ------------------------------------------------------------
# FUNGSI UTAMA EKSTRAKSI + SIMPAN CSV
# ------------------------------------------------------------
def _abbreviate_channel_name(channel):
    """Shorten a channel name by applying the fixed substring replacements in order."""
    replacements = (
        ('Right', 'R'),
        ('Left', 'L'),
        ('LatissimusDorsi', 'LD'),
        ('C4Paraspinal', 'C4P'),
        ('Sternocleidomastoidcaputlateralis', 'SCM'),
        ('Trapeziusdescendens', 'TD'),
    )
    for long_form, short_form in replacements:
        channel = channel.replace(long_form, short_form)
    return channel


def extract_and_save_emg_features(
    df, channels, fs=1000, label_col='group', meta_cols=None,
    window_size=1000, step_size=1000, out_csv='emg_features_structured.csv', class_map=None,
    round_decimals=5
):
    """Window the signals, extract all features and write them to a CSV file.

    The output has one row per window. Columns are, in order: the metadata
    columns, ``label_col`` (as ``int``), ``class_label`` (only when
    ``class_map`` is given), then the time-domain, frequency-domain and MSC
    features.

    Args:
        df: DataFrame with one signal sequence per channel column.
        channels: Channel column names; abbreviated for the feature names.
        fs: Sampling frequency passed to the frequency and MSC extractors.
        label_col: Column holding the class label; must be int-convertible.
        meta_cols: Optional columns copied to every window's output row.
        window_size: Number of time steps per window.
        step_size: Offset between the starts of consecutive windows.
        out_csv: Path of the CSV file to write.
        class_map: Optional mapping from the integer label to a text label;
            labels missing from the mapping become ``'UNKNOWN'``.
        round_decimals: Number of decimals every numeric column is rounded to
            before saving, or ``None`` to keep full precision. Note that
            rounding to 5 decimals turns any magnitude below 5e-6 into 0, so
            power-type features of low-amplitude signals can collapse to zero.

    Returns:
        The feature DataFrame that was written, or an empty DataFrame (and no
        file) when no window could be formed.
    """
    print(f"[INFO] Membentuk window EMG dan mengumpulkan metadata untuk: {out_csv}...")
    X, y, meta_list = prepare_data_with_meta(
        df, channels, label_col=label_col, meta_cols=meta_cols,
        window_size=window_size, step_size=step_size
    )

    if X.size == 0:
        print(f"[WARNING] Tidak ada data yang diproses untuk {out_csv}. Mengembalikan DataFrame kosong.")
        return pd.DataFrame()

    n_windows, n_timesteps, n_channels = X.shape
    print(f"[INFO] Total windows: {n_windows}, Time steps: {n_timesteps}, Channels: {n_channels}")

    # Short channel names keep the feature column names readable.
    channel_names_abbr = [_abbreviate_channel_name(ch) for ch in channels]

    print("[INFO] Ekstraksi fitur time-domain...")
    time_feats, time_feat_names = extract_time_domain_features(X, channel_names_abbr)
    print("[INFO] Ekstraksi fitur frequency-domain...")
    freq_feats, freq_feat_names = extract_frequency_domain_features(X, channel_names_abbr, fs=fs)
    print("[INFO] Ekstraksi fitur MSC (koherensi antar kanal)...")
    # NOTE(review): compute_msc is called with its default nperseg; confirm
    # that this is still appropriate when window_size is smaller than it.
    msc_feats, msc_feat_names = compute_msc(X, channel_names_abbr, fs=fs)

    print("[INFO] Menggabungkan semua fitur...")
    all_features = np.concatenate([time_feats, freq_feats, msc_feats], axis=1)
    feature_names = time_feat_names + freq_feat_names + msc_feat_names
    n_total_feats = all_features.shape[1]
    print(f"[INFO] Dimensi fitur akhir: {n_windows} windows x {n_total_feats} fitur")

    # Build one dict per window: metadata, label, optional text label, features.
    rows = []
    for i in range(n_windows):
        label = int(y[i])
        row_dict = {}

        if meta_cols is not None:
            row_dict.update(meta_list[i])

        row_dict[label_col] = label

        if class_map is not None:
            row_dict['class_label'] = class_map.get(label, 'UNKNOWN')

        row_dict.update(zip(feature_names, all_features[i]))
        rows.append(row_dict)

    df_features = pd.DataFrame(rows)

    # Optional rounding of every numeric column before saving.
    if round_decimals is not None:
        num_cols = df_features.select_dtypes(include=np.number).columns
        df_features[num_cols] = df_features[num_cols].round(round_decimals)

    df_features.to_csv(out_csv, index=False)
    print(f"[INFO] Fitur EMG berhasil tersimpan ke: {out_csv}")
    return df_features

# ============================================================================
# LOADER DATA ASLI DARI FOLDER (Disesuaikan dengan struktur file Anda)
# ============================================================================
def _load_subject_signals(path_top, subject_id, muscle_map):
    """Read the right/left signals of every muscle for one subject, or return None if incomplete."""
    signals = {}
    for muscle_folder, (file_base, muscle_key) in muscle_map.items():
        csv_path = os.path.join(path_top, muscle_folder, f"{file_base}{subject_id}.csv")

        if not os.path.isfile(csv_path):
            # Report the dropped subject instead of skipping it silently.
            print(f"[WARNING] File {csv_path} tidak ditemukan untuk subject {subject_id}.")
            return None

        df_muscle = pd.read_csv(csv_path)

        # Signal column names inside the CSV, e.g. RightTrapeziusdescendens.
        r_col_name = f"Right{muscle_key}"
        l_col_name = f"Left{muscle_key}"

        if r_col_name not in df_muscle.columns or l_col_name not in df_muscle.columns:
            print(f"[ERROR] Kolom {r_col_name} atau {l_col_name} tidak ditemukan dalam {csv_path}")
            return None

        signals[r_col_name] = df_muscle[r_col_name].to_numpy().tolist()
        signals[l_col_name] = df_muscle[l_col_name].to_numpy().tolist()

    return signals


def create_real_dataframe(root_dir):
    """Load the EMG recordings found under ``root_dir`` into one DataFrame.

    Only sub-folders whose name starts with ``CNP`` (group 1) or ``CONT``
    (group 0) are read; the text after the first underscore of the folder
    name becomes the condition. Inside each of them, the CSV files of the
    ``Trapezius`` folder define the subject numbers (first run of digits in
    the file name). For every subject the files
    ``<muscle folder>/<file base><subject number>.csv`` of all four muscles
    must exist and contain the ``Right<muscle>`` and ``Left<muscle>``
    columns; otherwise the subject is reported and skipped. A subject number
    that appears in several reference file names is loaded only once.

    Args:
        root_dir: Directory that contains the CNP*/CONT* folders.

    Returns:
        A tuple ``(df_all, channels)``. ``df_all`` has one row per subject and
        folder with the columns ``subject_number``, ``group``, ``condition``
        and one list-valued column per channel; it is empty when nothing
        complete was found. ``channels`` is the fixed list of the eight
        channel column names.

    Raises:
        OSError: If ``root_dir`` itself cannot be listed.
    """
    # Column names of the resulting DataFrame.
    channels = [
        'RightLatissimusDorsi', 'LeftLatissimusDorsi',
        'RightC4Paraspinal', 'LeftC4Paraspinal',
        'RightSternocleidomastoidcaputlateralis', 'LeftSternocleidomastoidcaputlateralis',
        'RightTrapeziusdescendens', 'LeftTrapeziusdescendens'
    ]

    # Muscle folder on disk -> (file name base, muscle part of the column name).
    muscle_map = {
        'Latissi': ('latissi', 'LatissimusDorsi'),
        'Paraspinal': ('paraspinal', 'C4Paraspinal'),
        'Sterno': ('sterno', 'Sternocleidomastoidcaputlateralis'),
        'Trapezius': ('trapezius', 'Trapeziusdescendens')
    }

    all_rows = []

    # 1. Walk the top-level folders (CNP_Curv_CCW, CONT_Rect_RECT, ...).
    for folder_name in sorted(os.listdir(root_dir)):
        path_top = os.path.join(root_dir, folder_name)
        if not os.path.isdir(path_top):
            continue

        if folder_name.startswith('CNP'):
            group = 1
        elif folder_name.startswith('CONT'):
            group = 0
        else:
            continue

        parts = folder_name.split('_', 1)
        condition = parts[1] if len(parts) > 1 else folder_name

        print(f"[INFO] Memproses folder: {folder_name} (group={group}, condition={condition})")

        # 2. The Trapezius folder is the reference for the subject numbers.
        acuan_dir = os.path.join(path_top, 'Trapezius')
        try:
            acuan_entries = sorted(os.listdir(acuan_dir))
        except (FileNotFoundError, NotADirectoryError):
            print(f"[WARNING] Tidak ditemukan folder Trapezius di {path_top}, skip.")
            continue

        # Collect each subject number once, in first-seen order, so that two
        # reference files carrying the same number do not duplicate a subject.
        subject_ids = []
        seen_ids = set()
        for fn in acuan_entries:
            if not fn.lower().endswith('.csv'):
                continue
            match = re.search(r'\d+', fn)
            if match is None:
                continue
            subject_id = int(match.group())
            if subject_id not in seen_ids:
                seen_ids.add(subject_id)
                subject_ids.append(subject_id)

        # 3. Load every muscle file of each subject; keep complete ones only.
        for subject_id in subject_ids:
            signals = _load_subject_signals(path_top, subject_id, muscle_map)
            if signals is None:
                continue
            all_rows.append({
                'subject_number': subject_id,
                'group': group,
                'condition': condition,
                **signals,
            })

    df_all = pd.DataFrame(all_rows)

    if df_all.empty:
        print("[CRITICAL] DataFrame hasil load kosong. Tidak ada data subjek yang lengkap.")
        return df_all, channels

    # The dtype conversion is only possible when the column exists.
    df_all['group'] = df_all['group'].astype(int)

    return df_all, channels

# --- BLOK EKSEKUSI ---
if __name__ == '__main__':
    # Folder that holds every CNP*/CONT* sub-folder.
    ROOT_DIR = '/content/Neck_EMG_Extracted/Neck_EMG_Extracted'

    # 1. Load the raw recordings.
    print("[INFO] Membaca data EMG dari folder:", ROOT_DIR)
    df_raw, channels = create_real_dataframe(ROOT_DIR)

    if not df_raw.empty:
        print("[INFO] Total baris (subjek x kondisi) dalam df_raw:", len(df_raw))

        # 2. Split by condition: curvilinear, rectilinear, and both together.
        is_curv = df_raw['condition'].str.contains('Curv', case=False, na=False)
        is_rect = df_raw['condition'].str.contains('Rect', case=False, na=False)
        df_curvilinear = df_raw[is_curv].reset_index(drop=True)
        df_rectilinear = df_raw[is_rect].reset_index(drop=True)
        df_combined = df_raw.reset_index(drop=True)

        # 3. Extraction parameters.
        WINDOW_SIZE = 1000
        STEP_SIZE = 500
        FS = 1000
        META_COLS = ['subject_number', 'condition']
        LABEL_COL = 'group'
        CLASS_MAP = {0: 'CONT', 1: 'CNP'}  # numeric class -> text label

        # Arguments shared by all three extraction runs.
        shared_kwargs = dict(
            channels=channels, fs=FS, label_col=LABEL_COL, meta_cols=META_COLS,
            window_size=WINDOW_SIZE, step_size=STEP_SIZE, class_map=CLASS_MAP,
        )

        # 4. Extract the features and write one CSV per subset.
        print("\n===== CURVILINEAR =====")
        df_curv_features = extract_and_save_emg_features(
            df=df_curvilinear, out_csv='emg_features_curvilinear_CNP_CONT.csv', **shared_kwargs
        )

        print("\n===== RECTILINEAR =====")
        df_rect_features = extract_and_save_emg_features(
            df=df_rectilinear, out_csv='emg_features_rectilinear_CNP_CONT.csv', **shared_kwargs
        )

        print("\n===== COMBINED (CURV + RECT) =====")
        df_comb_features = extract_and_save_emg_features(
            df=df_combined, out_csv='emg_features_combined_CNP_CONT.csv', **shared_kwargs
        )

        # 5. Short preview: first five rows and first ten columns of each result.
        previews = (
            ("\n[INFO] Preview Curvilinear (5 baris pertama):", df_curv_features),
            ("\n[INFO] Preview Rectilinear (5 baris pertama):", df_rect_features),
            ("\n[INFO] Preview Combined (5 baris pertama):", df_comb_features),
        )
        for heading, frame in previews:
            print(heading)
            if not frame.empty:
                print(frame.iloc[:5, :10].to_string(index=False))
    else:
        print("[EXECUTION FAILED] Tidak ada data yang berhasil dimuat. Cek path dan kelengkapan file.")


[INFO] Membaca data EMG dari folder: /content/Neck_EMG_Extracted/Neck_EMG_Extracted
[INFO] Memproses folder: CNP_Curv_CCW (group=1, condition=Curv_CCW)
[INFO] Memproses folder: CNP_Rect_RECT (group=1, condition=Rect_RECT)
[INFO] Memproses folder: CONT_Curv_CCW (group=0, condition=Curv_CCW)
[INFO] Memproses folder: CONT_Rect_RECT (group=0, condition=Rect_RECT)
[INFO] Total baris (subjek x kondisi) dalam df_raw: 80

===== CURVILINEAR =====
[INFO] Membentuk window EMG dan mengumpulkan metadata untuk: emg_features_curvilinear_CNP_CONT.csv...
[INFO] Total windows: 2296, Time steps: 1000, Channels: 8
[INFO] Ekstraksi fitur time-domain...
[INFO] Ekstraksi fitur frequency-domain...
[INFO] Ekstraksi fitur MSC (koherensi antar kanal)...
[INFO] Menggabungkan semua fitur...
[INFO] Dimensi fitur akhir: 2296 windows x 132 fitur
[INFO] Fitur EMG berhasil tersimpan ke: emg_features_curvilinear_CNP_CONT.csv

===== RECTILINEAR =====
[INFO] Membentuk window EMG dan mengumpulkan metadata untuk: emg_featur