In [None]:
#Working

import pandas as pd

# 📂 Define File Paths
eeg_features_path = r"C:\Users\Kevin Tran\Documents\Project Data\feature extractions\eeg_features.csv"
train_path = r"C:\Users\Kevin Tran\Documents\GitHub ED1\hms-harmful-brain-activity-classificationtrain_eegs\train.csv"
merged_eeg_path = r"C:\Users\Kevin Tran\Documents\Project Data\merged_eeg_data.csv"

# ✅ Load train.csv
train_df = pd.read_csv(train_path)

# 🔄 Convert `expert_consensus` into binary labels (Seizure = 1, Non-Seizure = 0)
train_df["label"] = train_df["expert_consensus"].apply(lambda x: 1 if x == "Seizure" else 0)

# 🎯 Keep only necessary columns
train_df = train_df[["eeg_id", "label"]]

# 🔄 Ensure `eeg_id` is a string for accurate merging
train_df["eeg_id"] = train_df["eeg_id"].astype(str)

# 🛠 Define chunk size for memory efficiency
chunk_size = 100000

# 🚀 Initialize CSV file (overwrite if exists)
with open(merged_eeg_path, "w", newline="") as f:
    pass  # Just creating/clearing the file

# 🔄 Process EEG features in chunks and save incrementally
for i, chunk in enumerate(pd.read_csv(eeg_features_path, chunksize=chunk_size, dtype={"file": str})):
    print(f"🔄 Processing chunk {i + 1}")

    # 🏷 Extract `eeg_id` from 'file' column (remove `.parquet` extension)
    chunk["eeg_id"] = chunk["file"].str.replace(".parquet", "", regex=False)

    # 🔄 Merge chunk with `train.csv` labels
    merged_chunk = chunk.merge(train_df, on="eeg_id", how="left")

    # ✅ Append to CSV (without keeping everything in memory)
    merged_chunk.to_csv(merged_eeg_path, mode="a", index=False, header=(i == 0))  # Write header only for first chunk

    print(f"✅ Chunk {i + 1} saved.")

print(f"🎉 Merging completed. Data saved at: {merged_eeg_path}")


In [2]:
#WORKING it makes feature columns.pkl as well
# Machine Learning - Random Forest Classifier (Handles NaN Labels & Saves Feature Columns)
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# 📂 Define File Paths
merged_eeg_path = r"C:\Users\Kevin Tran\Documents\Project Data\merged_eeg_data.csv"
model_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\random_forest_model.pkl"
feature_columns_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\feature_columns.pkl"

# ✅ Chunk Size (Adjust based on available memory)
chunk_size = 50000

# ✅ Initialize lists to store chunks
X_chunks = []
y_chunks = []

print("📂 Loading EEG data in chunks...")

# Read merged EEG data in chunks
for i, chunk in enumerate(pd.read_csv(merged_eeg_path, chunksize=chunk_size)):
    print(f"🔄 Processing chunk {i + 1}...")

    # 🔍 Ignore non-feature columns
    feature_cols = [col for col in chunk.columns if col not in ["eeg_id", "file", "channel", "window", "label"]]
    X_chunk = chunk[feature_cols]
    y_chunk = chunk["label"]

    # 🔄 Handle missing values in features (Fill NaN with column median)
    X_chunk.fillna(X_chunk.median(), inplace=True)

    # 🔄 Handle missing labels (Fill NaN with most common label)
    y_chunk.fillna(y_chunk.mode()[0], inplace=True)  # Replace NaN labels with the most common class

    # ✅ Append to lists
    X_chunks.append(X_chunk)
    y_chunks.append(y_chunk)

    # ⏸ Stop early to prevent memory issues (optional)
    if i == 10:  
        print("⏸ Stopping early to avoid memory issues (Adjust if needed)")
        break

# ✅ Combine processed chunks
X = pd.concat(X_chunks, ignore_index=True)
y = pd.concat(y_chunks, ignore_index=True)

# 🚀 Final Check: Ensure `X` and `y` have the same number of rows
print(f"✅ Final Data Shape: X={X.shape}, y={y.shape}")

# 🔄 Split into training (80%) and testing (20%)
print("🔄 Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🚀 Train and Save Model
print("🚀 Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# ✅ Save trained model
joblib.dump(rf_model, model_path)
print(f"✅ Model saved to: {model_path}")

# ✅ Save expected feature columns
joblib.dump(list(X_train.columns), feature_columns_path)
print(f"✅ Saved expected feature columns: {feature_columns_path}")

# 🔍 Predict on test set
y_pred = rf_model.predict(X_test)

# ✅ Evaluate Performance
print("✅ Model training completed. Evaluating performance...")
print(f"🎯 Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\n🔍 Classification Report:\n", classification_report(y_test, y_pred))



📂 Loading EEG data in chunks...
🔄 Processing chunk 1...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)


🔄 Processing chunk 2...
🔄 Processing chunk 3...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)


🔄 Processing chunk 4...
🔄 Processing chunk 5...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)


🔄 Processing chunk 6...
🔄 Processing chunk 7...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)


🔄 Processing chunk 8...
🔄 Processing chunk 9...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)


🔄 Processing chunk 10...
🔄 Processing chunk 11...
⏸ Stopping early to avoid memory issues (Adjust if needed)
✅ Final Data Shape: X=(550000, 19), y=(550000,)
🔄 Splitting data into training and testing sets...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_chunk.fillna(X_chunk.median(), inplace=True)


🚀 Training Random Forest model...
✅ Model saved to: C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\random_forest_model.pkl
✅ Saved expected feature columns: C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\feature_columns.pkl
✅ Model training completed. Evaluating performance...
🎯 Accuracy: 0.9825

🔍 Classification Report:
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99     79653
         1.0       0.99      0.95      0.97     30347

    accuracy                           0.98    110000
   macro avg       0.99      0.97      0.98    110000
weighted avg       0.98      0.98      0.98    110000



In [9]:
#WORKING

#new process for eeg prediction 

# EEG Prediction Using Trained Random Forest Model (With Feature Extraction)
import pandas as pd
import numpy as np
import os
import joblib
from scipy.stats import skew, kurtosis
from scipy.signal import butter, filtfilt, iirnotch
from tqdm import tqdm

# 📌 Define Paths
input_folder = r"C:\Users\Kevin Tran\Documents\Project Data\Input files"
output_folder = r"C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs"
model_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\random_forest_model.pkl"
feature_columns_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\feature_columns.pkl"

# ✅ Ensure Output Folder Exists
os.makedirs(output_folder, exist_ok=True)

# ✅ Load Trained Model and Expected Feature Names
print("🧠 Loading trained Random Forest model...")
rf_model = joblib.load(model_path)
expected_features = joblib.load(feature_columns_path)  # Load expected feature names

# 📌 EEG Signal Processing Functions
def apply_notch_filter(signal, fs=400, freq=60.0, quality_factor=30):
    """Apply a notch filter to remove 60Hz noise."""
    b, a = iirnotch(w0=freq, Q=quality_factor, fs=fs)
    return filtfilt(b, a, signal)

def apply_bandpass_filter(signal, fs=400, lowcut=0.5, highcut=40.0, order=5):
    """Apply a bandpass filter to keep frequencies between 0.5Hz and 40Hz."""
    nyquist = 0.5 * fs
    low, high = lowcut / nyquist, highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    return filtfilt(b, a, signal)

def normalize_signal(signal):
    """Normalize EEG signal to have zero mean and unit variance."""
    return (signal - np.mean(signal)) / np.std(signal)

# 📌 Feature Extraction Functions
def extract_time_features(signal):
    return {
        "mean": np.mean(signal),
        "variance": np.var(signal),
        "skewness": skew(signal),
        "kurtosis": kurtosis(signal),
        "rms": np.sqrt(np.mean(signal**2)),
        "zero_crossing_rate": np.sum(np.diff(np.sign(signal)) != 0) / len(signal),
        "mean_abs": np.mean(np.abs(signal)),   # Added missing feature
        "diff_rms1": np.sqrt(np.mean(np.diff(signal) ** 2)),  # Added missing feature
        "diff_rms2": np.sqrt(np.mean(np.diff(signal, n=2) ** 2))  # Added missing feature
    }

def extract_frequency_features(signal, fs):
    """Extract frequency-based features using FFT."""
    L = len(signal)
    Y = np.fft.fft(signal)
    P2 = np.abs(Y / L)
    P1 = P2[:L // 2 + 1]
    P1[1:-1] *= 2
    freqs = fs * np.arange(L // 2 + 1) / L

    # EEG frequency bands
    bands = {
        "delta": (1, 3),
        "theta": (4, 7),
        "alpha1": (8, 9),
        "alpha2": (10, 12),
        "beta1": (13, 17),
        "beta2": (18, 30),
        "gamma1": (31, 40),
        "gamma2": (41, 50),
        "higher": (51, 250),
    }

    band_powers = {name: np.sum(P1[(freqs >= low) & (freqs <= high)]) for name, (low, high) in bands.items()}
    band_powers["spectral_entropy"] = -np.sum(P1 * np.log(P1 + 1e-10))
    return band_powers

# 📌 Function to Extract Features from EEG Data
def extract_features_from_eeg(file_path):
    data = pd.read_parquet(file_path)

    # Process each EEG channel
    all_features = []
    for channel in data.columns:
        signal = data[channel].values
        signal = apply_notch_filter(signal)
        signal = apply_bandpass_filter(signal)
        signal = normalize_signal(signal)

        # Extract time and frequency features
        time_features = extract_time_features(signal)
        frequency_features = extract_frequency_features(signal, fs=400)

        # Combine into one dictionary
        combined_features = {**time_features, **frequency_features}
        all_features.append(combined_features)

    # Convert list of feature dicts into a DataFrame
    feature_df = pd.DataFrame(all_features).mean(axis=0).to_frame().T  # Aggregate across channels

    return feature_df

# 📌 Function to Process & Predict a Single EEG File
def process_and_predict(file_path):
    try:
        print(f"\n📂 Processing: {os.path.basename(file_path)}")

        # Extract Features
        feature_df = extract_features_from_eeg(file_path)

        # Ensure Columns Match Expected Features
        missing_features = set(expected_features) - set(feature_df.columns)
        extra_features = set(feature_df.columns) - set(expected_features)

        if missing_features:
            print(f"⚠️ Missing features: {missing_features}")
            for feat in missing_features:
                feature_df[feat] = 0  # Fill missing features with 0

        if extra_features:
            print(f"⚠️ Extra features found: {extra_features}")
            feature_df = feature_df[expected_features]  # Keep only expected features

        # Reorder features to match model training order
        feature_df = feature_df[expected_features]

        # Predict using trained model
        print(f"🔮 Predicting seizures for {os.path.basename(file_path)}...")
        prediction = rf_model.predict(feature_df)

        # Determine seizure presence
        seizure_detected = 1 in prediction  # If any row has 1, seizure is detected

        # Display Final Result
        if seizure_detected:
            print(f"🚨 SEIZURE DETECTED in {os.path.basename(file_path)}! 🚨")
            seizure_status = "Seizure Detected"
        else:
            print(f"✅ No seizure detected in {os.path.basename(file_path)}.")
            seizure_status = "No Seizure Detected"

        # Save predictions
        output_file = os.path.join(output_folder, f"predicted_{os.path.basename(file_path)}")
        feature_df["predicted_label"] = prediction
        feature_df.to_parquet(output_file)

        return f"✅ Prediction saved: {output_file} | Status: {seizure_status}"

    except Exception as e:
        return f"❌ Failed: {os.path.basename(file_path)} | Error: {e}"

# 📌 Run EEG Processing and Prediction
if __name__ == "__main__":
    print(f"📂 Scanning for EEG `.parquet` files in {input_folder}...")
    eeg_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".parquet")]

    if len(eeg_files) == 0:
        print("🚨 No `.parquet` files found in the input folder! Please check your folder.")
    else:
        print(f"✅ Found {len(eeg_files)} EEG files. Starting predictions...\n")
        for file in tqdm(eeg_files, desc="Processing EEG Files"):
            result = process_and_predict(file)
            print(result)  # Show final prediction status
        print(f"✅ All predictions completed! Files saved in: {output_folder}")



🧠 Loading trained Random Forest model...
📂 Scanning for EEG `.parquet` files in C:\Users\Kevin Tran\Documents\Project Data\Input files...
✅ Found 2 EEG files. Starting predictions...



Processing EEG Files:   0%|          | 0/2 [00:00<?, ?it/s]


📂 Processing: 1628180742.parquet


Processing EEG Files: 100%|██████████| 2/2 [00:00<00:00, 14.28it/s]

🔮 Predicting seizures for 1628180742.parquet...
🚨 SEIZURE DETECTED in 1628180742.parquet! 🚨
✅ Prediction saved: C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs\predicted_1628180742.parquet | Status: Seizure Detected

📂 Processing: 2237447621.parquet
🔮 Predicting seizures for 2237447621.parquet...
✅ No seizure detected in 2237447621.parquet.
✅ Prediction saved: C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs\predicted_2237447621.parquet | Status: No Seizure Detected
✅ All predictions completed! Files saved in: C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs





**BUILD 2**

In [3]:
import pandas as pd

# 📂 Define File Paths
eeg_features_path = r"C:\Users\Kevin Tran\Documents\Project Data\feature extractions\eeg_features.csv"
train_path = r"C:\Users\Kevin Tran\Documents\GitHub ED1\hms-harmful-brain-activity-classificationtrain_eegs\train.csv"
merged_eeg_path = r"C:\Users\Kevin Tran\Documents\Project Data\merged_eeg_data2.csv"

# ✅ Load train.csv
train_df = pd.read_csv(train_path)

# 🔄 Convert `expert_consensus` into multi-class labels
label_mapping = {
    "Seizure": 1,
    "GPD": 2,
    "GRDA": 3,
    "LPD": 4,
    "LRDA": 5,
    "Normal": 0  # Treat all non-harmful cases as "Normal"
}

train_df["label"] = train_df["expert_consensus"].map(label_mapping)

# 🎯 Keep only necessary columns
train_df = train_df[["eeg_id", "label"]]

# 🔄 Ensure `eeg_id` is a string for accurate merging
train_df["eeg_id"] = train_df["eeg_id"].astype(str)

# 🛠 Define chunk size for memory efficiency
chunk_size = 100000

# 🚀 Initialize CSV file (overwrite if exists)
with open(merged_eeg_path, "w", newline="") as f:
    pass  # Just creating/clearing the file

# 🔄 Process EEG features in chunks and save incrementally
for i, chunk in enumerate(pd.read_csv(eeg_features_path, chunksize=chunk_size, dtype={"file": str})):
    print(f"🔄 Processing chunk {i + 1}")

    # 🏷 Extract `eeg_id` from 'file' column (remove `.parquet` extension)
    chunk["eeg_id"] = chunk["file"].str.replace(".parquet", "", regex=False)

    # 🔄 Merge chunk with `train.csv` labels
    merged_chunk = chunk.merge(train_df, on="eeg_id", how="left")

    # ✅ Append to CSV (without keeping everything in memory)
    merged_chunk.to_csv(merged_eeg_path, mode="a", index=False, header=(i == 0))  # Write header only for first chunk

    print(f"✅ Chunk {i + 1} saved.")

print(f"🎉 Merging completed. Data saved at: {merged_eeg_path}")


🔄 Processing chunk 1
✅ Chunk 1 saved.
🔄 Processing chunk 2
✅ Chunk 2 saved.
🔄 Processing chunk 3
✅ Chunk 3 saved.
🔄 Processing chunk 4
✅ Chunk 4 saved.
🔄 Processing chunk 5
✅ Chunk 5 saved.
🔄 Processing chunk 6
✅ Chunk 6 saved.
🔄 Processing chunk 7
✅ Chunk 7 saved.
🔄 Processing chunk 8
✅ Chunk 8 saved.
🔄 Processing chunk 9
✅ Chunk 9 saved.
🔄 Processing chunk 10
✅ Chunk 10 saved.
🔄 Processing chunk 11
✅ Chunk 11 saved.
🔄 Processing chunk 12
✅ Chunk 12 saved.
🔄 Processing chunk 13
✅ Chunk 13 saved.
🔄 Processing chunk 14
✅ Chunk 14 saved.
🔄 Processing chunk 15
✅ Chunk 15 saved.
🔄 Processing chunk 16
✅ Chunk 16 saved.
🔄 Processing chunk 17
✅ Chunk 17 saved.
🔄 Processing chunk 18
✅ Chunk 18 saved.
🔄 Processing chunk 19
✅ Chunk 19 saved.
🔄 Processing chunk 20
✅ Chunk 20 saved.
🔄 Processing chunk 21
✅ Chunk 21 saved.
🔄 Processing chunk 22
✅ Chunk 22 saved.
🔄 Processing chunk 23
✅ Chunk 23 saved.
🔄 Processing chunk 24
✅ Chunk 24 saved.
🔄 Processing chunk 25
✅ Chunk 25 saved.
🔄 Processing chunk

In [6]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# 📂 Define File Paths
merged_eeg_path = r"C:\Users\Kevin Tran\Documents\Project Data\merged_eeg_data2.csv"
model_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\random_forest_model2.pkl"
feature_columns_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\feature_columns2.pkl"

# ✅ Chunk Size (Adjust based on available memory)
chunk_size = 50000

# ✅ Initialize lists to store chunks
X_chunks = []
y_chunks = []

print("📂 Loading EEG data in chunks...")

# Read merged EEG data in chunks
for i, chunk in enumerate(pd.read_csv(merged_eeg_path, chunksize=chunk_size)):
    print(f"🔄 Processing chunk {i + 1}...")

    # 🔍 Ignore non-feature columns
    feature_cols = [col for col in chunk.columns if col not in ["eeg_id", "file", "channel", "window", "label"]]
    X_chunk = chunk[feature_cols]
    y_chunk = chunk["label"]

    # 🔄 Handle missing values in features (Fill NaN with column median) - Fixing SettingWithCopyWarning
    X_chunk = X_chunk.fillna(X_chunk.median())

    # 🔄 Handle missing labels (Fill NaN with most common label)
    if y_chunk.isna().sum() > 0:
        y_chunk = y_chunk.fillna(y_chunk.mode()[0])  # Replace NaN labels with the most common class

    # ✅ Append to lists
    X_chunks.append(X_chunk)
    y_chunks.append(y_chunk)

    # ⏸ Stop early to prevent memory issues (adjust if needed)
    if i == 10:
        print("⏸ Stopping early to avoid memory issues (Adjust if needed)")
        break

# ✅ Combine processed chunks
X = pd.concat(X_chunks, ignore_index=True)
y = pd.concat(y_chunks, ignore_index=True)

# 🚀 Final Check: Ensure `X` and `y` have the same number of rows
print(f"✅ Final Data Shape: X={X.shape}, y={y.shape}")

# 🔄 Split into training (80%) and testing (20%)
print("🔄 Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🚀 Train and Save Model
print("🚀 Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# ✅ Save trained model
joblib.dump(rf_model, model_path)
print(f"✅ Model saved to: {model_path}")

# ✅ Save expected feature columns
joblib.dump(list(X_train.columns), feature_columns_path)
print(f"✅ Saved expected feature columns: {feature_columns_path}")

# 🔍 Predict on test set
y_pred = rf_model.predict(X_test)

# ✅ Define Condition Mapping
condition_mapping = {
    0: "Normal",
    1: "Seizure",
    2: "GPD",
    3: "GRDA",
    4: "LPD",
    5: "LRDA"
}

# ✅ Check unique labels before classification report
unique_labels = np.unique(y_test)
print(f"Unique classes in y_test: {unique_labels}")

# ✅ Adjust target names dynamically to avoid ValueError
filtered_target_names = [condition_mapping[label] for label in unique_labels]

# ✅ Generate classification report without mismatched labels
print("\n🔍 Classification Report:\n", classification_report(y_test, y_pred, target_names=filtered_target_names))
print("\n🔍 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


📂 Loading EEG data in chunks...
🔄 Processing chunk 1...
🔄 Processing chunk 2...
🔄 Processing chunk 3...
🔄 Processing chunk 4...
🔄 Processing chunk 5...
🔄 Processing chunk 6...
🔄 Processing chunk 7...
🔄 Processing chunk 8...
🔄 Processing chunk 9...
🔄 Processing chunk 10...
🔄 Processing chunk 11...
⏸ Stopping early to avoid memory issues (Adjust if needed)
✅ Final Data Shape: X=(550000, 19), y=(550000,)
🔄 Splitting data into training and testing sets...
🚀 Training Random Forest model...
✅ Model saved to: C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\random_forest_model2.pkl
✅ Saved expected feature columns: C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\feature_columns2.pkl
Unique classes in y_test: [1. 2. 3. 4. 5.]

🔍 Classification Report:
               precision    recall  f1-score   support

     Seizure       0.98      0.97      0.97     38400
         GPD       0.97      0.99      0.98     41423
        GRDA       0.98      0.97      0.97      47

In [10]:
import pandas as pd
import numpy as np
import os
import joblib
from scipy.stats import skew, kurtosis
from scipy.signal import butter, filtfilt, iirnotch
from tqdm import tqdm

# 📌 Define Paths
input_folder = r"C:\Users\Kevin Tran\Documents\Project Data\Input files"
output_folder = r"C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs"
model_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\random_forest_model.pkl"
feature_columns_path = r"C:\Users\Kevin Tran\Documents\Project Data\For Machine Learning\feature_columns.pkl"

# ✅ Ensure Output Folder Exists
os.makedirs(output_folder, exist_ok=True)

# ✅ Load Trained Model and Expected Feature Names
print("🧠 Loading trained Random Forest model...")
rf_model = joblib.load(model_path)
expected_features = joblib.load(feature_columns_path)  # Load expected feature names

# ✅ Define Multi-Class Condition Mapping (Same as in Training)
condition_mapping = {
    0: "Normal",
    1: "Seizure",
    2: "GPD",
    3: "GRDA",
    4: "LPD",
    5: "LRDA"
}

# 📌 EEG Signal Processing Functions
def apply_notch_filter(signal, fs=400, freq=60.0, quality_factor=30):
    """Apply a notch filter to remove 60Hz noise."""
    b, a = iirnotch(w0=freq, Q=quality_factor, fs=fs)
    return filtfilt(b, a, signal)

def apply_bandpass_filter(signal, fs=400, lowcut=0.5, highcut=40.0, order=5):
    """Apply a bandpass filter to keep frequencies between 0.5Hz and 40Hz."""
    nyquist = 0.5 * fs
    low, high = lowcut / nyquist, highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    return filtfilt(b, a, signal)

def normalize_signal(signal):
    """Normalize EEG signal to have zero mean and unit variance."""
    return (signal - np.mean(signal)) / np.std(signal)

# 📌 Feature Extraction Functions
def extract_time_features(signal):
    return {
        "mean": np.mean(signal),
        "variance": np.var(signal),
        "skewness": skew(signal),
        "kurtosis": kurtosis(signal),
        "rms": np.sqrt(np.mean(signal**2)),
        "zero_crossing_rate": np.sum(np.diff(np.sign(signal)) != 0) / len(signal),
        "mean_abs": np.mean(np.abs(signal)),
        "diff_rms1": np.sqrt(np.mean(np.diff(signal) ** 2)),
        "diff_rms2": np.sqrt(np.mean(np.diff(signal, n=2) ** 2))
    }

def extract_frequency_features(signal, fs):
    """Extract frequency-based features using FFT."""
    L = len(signal)
    Y = np.fft.fft(signal)
    P2 = np.abs(Y / L)
    P1 = P2[:L // 2 + 1]
    P1[1:-1] *= 2
    freqs = fs * np.arange(L // 2 + 1) / L

    # EEG frequency bands
    bands = {
        "delta": (1, 3),
        "theta": (4, 7),
        "alpha1": (8, 9),
        "alpha2": (10, 12),
        "beta1": (13, 17),
        "beta2": (18, 30),
        "gamma1": (31, 40),
        "gamma2": (41, 50),
        "higher": (51, 250),
    }

    band_powers = {name: np.sum(P1[(freqs >= low) & (freqs <= high)]) for name, (low, high) in bands.items()}
    band_powers["spectral_entropy"] = -np.sum(P1 * np.log(P1 + 1e-10))
    return band_powers

# 📌 Function to Extract Features from EEG Data
def extract_features_from_eeg(file_path):
    data = pd.read_parquet(file_path)

    # Process each EEG channel
    all_features = []
    for channel in data.columns:
        signal = data[channel].values
        signal = apply_notch_filter(signal)
        signal = apply_bandpass_filter(signal)
        signal = normalize_signal(signal)

        # Extract time and frequency features
        time_features = extract_time_features(signal)
        frequency_features = extract_frequency_features(signal, fs=400)

        # Combine into one dictionary
        combined_features = {**time_features, **frequency_features}
        all_features.append(combined_features)

    # Convert list of feature dicts into a DataFrame
    feature_df = pd.DataFrame(all_features).mean(axis=0).to_frame().T  # Aggregate across channels

    return feature_df

# 📌 Function to Process & Predict a Single EEG File
def process_and_predict(file_path):
    try:
        print(f"\n📂 Processing: {os.path.basename(file_path)}")

        # Extract Features
        feature_df = extract_features_from_eeg(file_path)

        # Ensure Columns Match Expected Features
        missing_features = set(expected_features) - set(feature_df.columns)
        extra_features = set(feature_df.columns) - set(expected_features)

        if missing_features:
            print(f"⚠️ Missing features: {missing_features}")
            for feat in missing_features:
                feature_df[feat] = 0  # Fill missing features with 0

        if extra_features:
            print(f"⚠️ Extra features found: {extra_features}")
            feature_df = feature_df[expected_features]  # Keep only expected features

        # Reorder features to match model training order
        feature_df = feature_df[expected_features]

        # Predict using trained model
        print(f"🔮 Predicting condition for {os.path.basename(file_path)}...")
        prediction = rf_model.predict(feature_df)

        # Convert numeric prediction to condition label
        predicted_condition = condition_mapping[int(prediction[0])]

        print(f"🔮 Predicted Condition: {predicted_condition}")

        # Save predictions
        output_file = os.path.join(output_folder, f"predicted_{os.path.basename(file_path)}")
        feature_df["predicted_label"] = prediction
        feature_df.to_parquet(output_file)

        return f"✅ Prediction saved: {output_file} | Status: {predicted_condition}"

    except Exception as e:
        return f"❌ Failed: {os.path.basename(file_path)} | Error: {e}"

# 📌 Run EEG Processing and Prediction
if __name__ == "__main__":
    print(f"📂 Scanning for EEG `.parquet` files in {input_folder}...")
    eeg_files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith(".parquet")]

    if len(eeg_files) == 0:
        print("🚨 No `.parquet` files found in the input folder! Please check your folder.")
    else:
        print(f"✅ Found {len(eeg_files)} EEG files. Starting predictions...\n")
        for file in tqdm(eeg_files, desc="Processing EEG Files"):
            result = process_and_predict(file)
            print(result)  # Show final prediction status
        print(f"✅ All predictions completed! Files saved in: {output_folder}")


🧠 Loading trained Random Forest model...
📂 Scanning for EEG `.parquet` files in C:\Users\Kevin Tran\Documents\Project Data\Input files...
✅ Found 2 EEG files. Starting predictions...



Processing EEG Files:   0%|          | 0/2 [00:00<?, ?it/s]


📂 Processing: 1628180742.parquet


Processing EEG Files: 100%|██████████| 2/2 [00:00<00:00, 12.37it/s]

🔮 Predicting condition for 1628180742.parquet...
🔮 Predicted Condition: Seizure
✅ Prediction saved: C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs\predicted_1628180742.parquet | Status: Seizure

📂 Processing: 2237447621.parquet
🔮 Predicting condition for 2237447621.parquet...
🔮 Predicted Condition: Normal
✅ Prediction saved: C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs\predicted_2237447621.parquet | Status: Normal
✅ All predictions completed! Files saved in: C:\Users\Kevin Tran\Documents\Project Data\Predicted_EEGs



