# Line Length & Power

In [1]:
import os, pickle
import numpy as np
import pandas as pd
from scipy.fft import fft

from ieeg_utils import *
from signal_processing import *

IEEG_DIRECTORY = "../../../Data/ieeg/all/2_min"
LINE_LENGTH_DIRECTORY = "../../../Data/line_length"
ENERGY_DIRECTORY = "../../../Data/energy"
TEAGER_ENERGY_DIRECTORY = "../../../Data/teager_energy"
KURAMOTO_DIRECTORY = "../../../Data/plv/kuramoto"

In [2]:
nina_patient_hup_ids = pd.read_excel("../../../Data/HUP_implant_dates.xlsx")
nina_patient_hup_ids = nina_patient_hup_ids["hup_id"].to_numpy()
nina_patient_hup_ids

array([225, 224, 223, 221, 219, 217, 216, 215, 214, 213, 211, 210, 209,
       208, 207, 206, 205, 204, 202, 201, 199, 197, 196, 195, 194, 193,
       192, 191, 190, 189, 188, 187, 186, 185, 184, 182, 181, 180, 179,
       178, 177, 175, 174, 173, 172, 171, 170, 169, 168, 167, 166, 165,
       164, 163, 162, 161, 160, 159, 158, 157, 156, 155, 154, 153, 152,
       151, 150, 149, 148, 147, 146, 145, 144, 143, 142, 141, 140, 139,
       138, 137])

In [3]:
# Create a mapping between patient ids and the index of the patient in the patients_df dataframe
patient_hup_id_to_index = {}
for i, patient_id in enumerate(nina_patient_hup_ids):
    patient_hup_id_to_index[patient_id] = i
# patient_hup_id_to_index

In [4]:
ieeg_offset_row1_df = pd.read_excel("../../../Data/ieeg_offset/row_1.xlsx", header=None)
ieeg_offset_row2_df = pd.read_excel("../../../Data/ieeg_offset/row_2.xlsx", header=None)
ieeg_offset_row3_df = pd.read_excel("../../../Data/ieeg_offset/row_3.xlsx", header=None)

In [5]:
# Load master_elecs.csv from ./data/
master_elecs_df = pd.read_csv("../../../Data/master_elecs.csv")

# only take the numbers in rid column
master_elecs_df["rid"] = master_elecs_df["rid"].str.extract("(\d+)", expand=False)
master_elecs_df["rid"] = master_elecs_df["rid"].astype(int)

# Drop mni_x, mni_y, mni_z, mm_x, mm_y, mm_z columns
master_elecs_df = master_elecs_df.drop(
    columns=["mni_x", "mni_y", "mni_z", "mm_x", "mm_y", "mm_z"]
)

master_elecs_df

Unnamed: 0,rid,name,vox_x,vox_y,vox_z,label,soz,resected,spike_rate,engel
0,13,LST01,80.6116,106.5480,64.5941,left inferior temporal,False,False,1.091902,1.0
1,13,LST02,72.0779,109.4150,63.1223,left inferior temporal,False,False,1.091902,1.0
2,13,LST03,64.9060,112.3760,68.7455,EmptyLabel,False,False,1.419472,1.0
3,13,LST04,65.0210,114.6600,78.2339,left middle temporal,False,False,0.655141,1.0
4,13,MST01,131.7410,64.3756,70.4205,right lingual,True,False,3.439490,1.0
...,...,...,...,...,...,...,...,...,...,...
14212,785,RB08,154.2550,114.2730,136.7560,EmptyLabel,False,,0.369914,1.0
14213,785,RB09,159.1350,111.9920,136.6960,EmptyLabel,False,,0.665845,1.0
14214,785,RB10,164.7520,109.9030,137.7640,right middle temporal,False,,4.586930,1.0
14215,785,RB11,169.6320,107.6220,137.7040,right middle temporal,False,,2.071517,1.0


In [6]:
# Load rid_hup_table.csv from ./data/
rid_hup_table_df = pd.read_csv("../../../Data/rid_hup_table.csv")
# Drop the t3_subject_id and ieegportalsubjno columns
rid_hup_table_df = rid_hup_table_df.drop(columns=["t3_subject_id", "ieegportalsubjno"])
rid_hup_table_df

Unnamed: 0,record_id,hupsubjno
0,623,35
1,624,36
2,625,37
3,626,38
4,627,39
...,...,...
212,534,250
213,923,251
214,918,252
215,864,253


In [7]:
def bipolar_montage(data: np.ndarray, ch_types: pd.DataFrame) -> np.ndarray:
    """_summary_

    Args:
        data (np.ndarray): _description_
        ch_types (pd.DataFrame): _description_

    Returns:
        np.ndarray: _description_
    """

    n_ch = len(ch_types)
    new_ch_types = []
    for ind, row in ch_types.iterrows():
        # do only if type is ecog or seeg
        if row["type"] not in ["ecog", "seeg"]:
            continue

        ch1 = row["name"]

        ch2 = ch_types.loc[
            (ch_types["lead"] == row["lead"])
            & (ch_types["contact"] == row["contact"] + 1),
            "name",
        ]
        if len(ch2) > 0:
            ch2 = ch2.iloc[0]
            entry = {
                "name": ch1 + "-" + ch2,
                "type": row["type"],
                "idx1": ind,
                "idx2": ch_types.loc[ch_types["name"] == ch2].index[0],
            }
            new_ch_types.append(entry)

    new_ch_types = pd.DataFrame(new_ch_types)
    # apply montage to data
    new_data = np.empty((len(new_ch_types), data.shape[1]))
    for ind, row in new_ch_types.iterrows():
        new_data[ind, :] = data[row["idx1"], :] - data[row["idx2"], :]

    return new_data, new_ch_types

In [8]:
# Create an empty dictionary to store all the data
data_dict = {"dataset_name": [], "max_hour": [], "sample_rate": [], "hup_id": []}

# Iterate through the directory
for filename in os.listdir(IEEG_DIRECTORY):
    if filename.endswith(".pkl"):  # Only process .pkl files
        # Split the filename to get the dataset_name, hour, and sample_rate
        parts = filename.split("_")
        dataset_name = "_".join(parts[:-4])  # Exclude the '_hr' from the dataset_name
        hour = int(parts[-3])
        sample_rate = int(parts[-1].split(".")[0])

        # Extract hup_id from dataset_name
        hup_id = dataset_name.split("_")[0].split("HUP")[1]

        # If the dataset_name is already in the dictionary, update the max_hour
        if dataset_name in data_dict["dataset_name"]:
            index = data_dict["dataset_name"].index(dataset_name)
            data_dict["max_hour"][index] = max(data_dict["max_hour"][index], hour)
        else:
            # Else, add the dataset_name, hour, sample_rate and hup_id to the dictionary
            data_dict["dataset_name"].append(dataset_name)
            data_dict["max_hour"].append(hour)
            data_dict["sample_rate"].append(sample_rate)
            data_dict["hup_id"].append(hup_id)

# Create a DataFrame from the dictionary
datasets_df = pd.DataFrame(data_dict)
# Make max_hour and sample_rate and hup_id integers
datasets_df["max_hour"] = datasets_df["max_hour"].astype(int)
datasets_df["sample_rate"] = datasets_df["sample_rate"].astype(int)
datasets_df["hup_id"] = datasets_df["hup_id"].astype(int)
# Sort by hup_id
datasets_df = datasets_df.sort_values(by=["hup_id"])
# Reset the index
datasets_df = datasets_df.reset_index(drop=True)
# Create a column called max_hour_count that is the max_hour + 1
datasets_df["max_hour_count"] = datasets_df["max_hour"] + 1
datasets_df

Unnamed: 0,dataset_name,max_hour,sample_rate,hup_id,max_hour_count
0,HUP138_phaseII,172,1024,138,173
1,HUP140_phaseII_D02,128,1024,140,129
2,HUP140_phaseII_D01,19,1024,140,20
3,HUP141_phaseII,146,512,141,147
4,HUP142_phaseII,311,512,142,312
...,...,...,...,...,...
81,HUP215_phaseII_D01,14,2048,215,15
82,HUP216_phaseII_D01,143,512,216,144
83,HUP216_phaseII_D02,144,512,216,145
84,HUP223_phaseII,135,1024,223,136


In [9]:
frequency_bands = {
    "delta": (0.5, 4),
    "theta": (4, 8),
    "alpha": (8, 12),
    "beta": (12, 30),
    "gamma": (30, 100),
    "high": (60, 100),
}

In [10]:
def LineLength(x):
    return np.sum(np.absolute(np.ediff1d(x)))


def Energy(x):
    return np.sum(np.square(x))


def LineLengthVectorized(x):
    return np.sum(np.abs(np.diff(x, axis=1)), axis=1)


def EnergyVectorized(x):
    return np.sum(np.square(x), axis=1)


def TeagerEnergy(x):
    return x[1:-1] ** 2 - x[2:] * x[:-2]


def TeagerEnergyVectorized(x):
    return x[:, 1:-1] ** 2 - x[:, :-2] * x[:, 2:]

In [11]:
for patient_hup_id in datasets_df["hup_id"].unique():
    # Find the value of record_id in rid_hup_table_df where hupsubjno == patient_hup_id
    patient_rid = rid_hup_table_df[rid_hup_table_df["hupsubjno"] == patient_hup_id][
        "record_id"
    ].values[0]
    # Get the row in datasets_df corresponding to the patient_hup_id
    rows_df = datasets_df[datasets_df["hup_id"] == patient_hup_id]
    # Sort rows_df by dataset_name
    rows_df = rows_df.sort_values(by=["dataset_name"])
    rows_df = rows_df.reset_index(drop=True)
    patient_electrodes_df = master_elecs_df.loc[master_elecs_df["rid"] == patient_rid]
    print(f"HUP {patient_hup_id}, rid {patient_rid}")

    # Add up all the max_hours for rows_df
    total_max_hour_count = rows_df["max_hour_count"].sum()

    ##########################################
    # Create empty vectors to save the data
    ##########################################
    plv_vector_to_save = np.zeros(total_max_hour_count)
    teager_energy_vector_to_save = np.zeros(total_max_hour_count)
    line_length_vector_to_save = np.zeros(total_max_hour_count)
    energy_vector_to_save = np.zeros(total_max_hour_count)
    current_hour = 0

    for dataset_idx, dataset_row in rows_df.iterrows():
        # Get the dataset_name, max_hour, and sample_rate
        dataset_name = dataset_row["dataset_name"]
        max_hour_count = dataset_row["max_hour_count"]
        sample_rate = dataset_row["sample_rate"]
        print(dataset_name)

        for hour in range(max_hour_count):
            # Get the filename
            filename = f"{dataset_name}_hr_{hour}_fs_{sample_rate}.pkl"
            # Get the full path to the file
            full_path = os.path.join(IEEG_DIRECTORY, filename)

            # Load the data
            try:
                with open(full_path, "rb") as f:
                    eeg_segment_df = pickle.load(f)
            except FileNotFoundError:
                print(f"Skipping {hour} for {dataset_name}")
                teager_energy_vector_to_save[current_hour] = np.nan
                line_length_vector_to_save[current_hour] = np.nan
                energy_vector_to_save[current_hour] = np.nan
                plv_vector_to_save[current_hour] = np.nan
                current_hour += 1
                continue

            print(
                f"Processing hour {hour} in {dataset_name}, that's hour {current_hour} out of {total_max_hour_count} for HUP {patient_hup_id}"
            )
            channel_labels = eeg_segment_df.columns.values.tolist()
            channel_indices = np.arange(len(channel_labels))

            # Get the labels of the good indicies
            good_channel_labels = channel_labels

            # Skip this hour if there are less than 2 good channels, i.e., unable to create a bipolar montage
            if len(good_channel_labels) < 2:
                print(
                    f"Skipping {hour} for {dataset_name} because there are less than 2 good channels"
                )
                teager_energy_vector_to_save[current_hour] = np.nan
                line_length_vector_to_save[current_hour] = np.nan
                energy_vector_to_save[current_hour] = np.nan
                plv_vector_to_save[current_hour] = np.nan
                current_hour += 1
                continue

            good_channel_types_df = check_channel_types(good_channel_labels)

            try:
                bipolar_eeg_data, bipolar_channel_types_df = bipolar_montage(
                    ((eeg_segment_df[good_channel_labels]).T).to_numpy(),
                    good_channel_types_df,
                )
            except Exception as error:
                print(
                    f"Skipping {hour} for {dataset_name} because of an error in bipolar_montage"
                )
                print(error)
                continue

            if len(bipolar_eeg_data) == 0:
                print(
                    f"Skipping {hour} for {dataset_name} because there's less than 1 good channel after bipolar_montage"
                )
                teager_energy_vector_to_save[current_hour] = np.nan
                line_length_vector_to_save[current_hour] = np.nan
                energy_vector_to_save[current_hour] = np.nan
                plv_vector_to_save[current_hour] = np.nan
                current_hour += 1
                continue

            bipolar_eeg_data_filtered = process_eeg_data(
                data=bipolar_eeg_data,
                sample_rate=sample_rate,
                band_pass_freq=frequency_bands["high"],
                notch_freq=60,
            )

            ##########################################
            # Compute hourly features
            ##########################################

            teager_energy_vector_to_save[current_hour] = np.mean(
                TeagerEnergyVectorized(bipolar_eeg_data_filtered)
            )
            plv_vector_to_save[current_hour] = np.mean(
                calculate_kuramoto_order_parameter(bipolar_eeg_data_filtered)
            )
            current_hour += 1

    ##########################################
    # Save files
    ##########################################
    np.save(
        f"{TEAGER_ENERGY_DIRECTORY}/high_no_afr/HUP_{patient_hup_id}.npy",
        teager_energy_vector_to_save,
    )
    np.save(
        f"{KURAMOTO_DIRECTORY}/high_no_afr/HUP_{patient_hup_id}.npy",
        plv_vector_to_save,
    )

HUP 138, rid 278
HUP138_phaseII
Processing hour 0 in HUP138_phaseII, that's hour 0 out of 173 for HUP 138
Processing hour 1 in HUP138_phaseII, that's hour 1 out of 173 for HUP 138
Processing hour 2 in HUP138_phaseII, that's hour 2 out of 173 for HUP 138
Processing hour 3 in HUP138_phaseII, that's hour 3 out of 173 for HUP 138
Processing hour 4 in HUP138_phaseII, that's hour 4 out of 173 for HUP 138
Processing hour 5 in HUP138_phaseII, that's hour 5 out of 173 for HUP 138
Processing hour 6 in HUP138_phaseII, that's hour 6 out of 173 for HUP 138
Processing hour 7 in HUP138_phaseII, that's hour 7 out of 173 for HUP 138
Processing hour 8 in HUP138_phaseII, that's hour 8 out of 173 for HUP 138
Processing hour 9 in HUP138_phaseII, that's hour 9 out of 173 for HUP 138
Skipping 10 for HUP138_phaseII
Processing hour 11 in HUP138_phaseII, that's hour 11 out of 173 for HUP 138
Processing hour 12 in HUP138_phaseII, that's hour 12 out of 173 for HUP 138
Processing hour 13 in HUP138_phaseII, that's 

KeyboardInterrupt: 