# Extração de Características

Features:
- Variance
- Skewness
- Kurtosis
- Shannon-Entropy

In [1]:
from os import walk, environ
import pandas as pd
from pyspark.rdd import RDD
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (col, variance, skewness, kurtosis, desc)
from pyspark.sql.types import (StructType, StructField, StringType, FloatType)
from functools import reduce
from typing import List
from joblib import Parallel, delayed


In [2]:
import logging

# Module-level logger; its verbosity comes from the LOGGER_LEVEL environment
# variable and falls back to INFO when that variable is not set.
logger = logging.getLogger(__name__)
logging.basicConfig()
logger.setLevel(level=environ.get("LOGGER_LEVEL", "INFO"))


In [4]:
# Create (or reuse) the Spark session for this notebook: local master,
# application name 'Doutorado'.
spark = SparkSession.builder.master('local').appName('Doutorado').getOrCreate()


# Root directory that is walked for per-recording sub-directories, and the
# JSON file holding the recording metadata.
path_data = '/media/davi/6A81-05CF/physionet.org/files/siena-scalp-eeg/1.0.0/PN00/'
infos_path = '/home/davi/Documentos/doutorado_ppgee_v2/data/siena_infos.json'

# Recording metadata, loaded into a pandas DataFrame.
infos = pd.read_json(infos_path)

# Every directory found under path_data; the first entry yielded by walk()
# is path_data itself, so it is dropped.
files = [root for root, _subdirs, _filenames in walk(path_data)][1:]


## Extraindo de EKG

In [5]:
def get_intervals(df: DataFrame, freq: int, type_seizure: str) -> list:
    """Return window boundary ids, spaced ``freq`` apart, that cover ``df``.

    The boundaries start at the first sample id taken from the
    ``__index_level_0__`` column and advance by ``freq`` until one of them
    reaches or passes the last sample id, so the final boundary may lie
    beyond the last id present in the data.

    Args:
        df: Spark DataFrame exposing a ``__index_level_0__`` column.
        freq: Step between consecutive boundaries; must be positive.
        type_seizure: ``'ICTAL'`` derives the id range by sorting the index
            column; any other value takes the first and last rows in the
            order Spark returns them.

    Returns:
        List of boundary ids; it always contains at least the first id.

    Raises:
        ValueError: If ``freq`` is not positive (the boundary loop would
            otherwise never terminate).
        IndexError: If ``df`` has no rows.
    """
    if freq <= 0:
        raise ValueError(f'freq must be a positive integer, got {freq!r}')

    index_col = '__index_level_0__'

    if type_seizure == 'ICTAL':
        # limit(1) on an unsorted DataFrame may return any row, so sort
        # ascending explicitly, mirroring the descending sort used below.
        first_id = (df.orderBy(index_col)
                    .select(index_col)
                    .limit(1)
                    .collect())[0][0]

        last_id = (df.orderBy(desc(index_col))
                   .select(index_col)
                   .limit(1)
                   .collect())[0][0]
    else:
        # NOTE(review): this collects the whole index column on the driver
        # and relies on the read order of the rows — confirm that the
        # non-ICTAL data is stored sorted by id.
        rows = df.select(index_col).collect()
        first_id, last_id = rows[0][index_col], rows[-1][index_col]

    logger.info(f'Range ids: {first_id}:{last_id}')
    logger.info(f'Seconds of intervals: {first_id//freq}:{last_id//freq}')

    # Step from first_id in increments of freq; stop at the first boundary
    # that is >= last_id.
    ids = [first_id]
    while ids[-1] < last_id:
        ids.append(ids[-1] + freq)

    return ids


def get_features_ekg(df: DataFrame, start: int, end: int) -> DataFrame:
    """Aggregate EKG statistics per label over one window of sample ids.

    Args:
        df: DataFrame with ``__index_level_0__``, ``label`` and ``EKG_EKG``
            columns.
        start: Lower bound of the window on ``__index_level_0__``.
        end: Upper bound of the window on ``__index_level_0__``.

    Returns:
        DataFrame with one row per ``label`` and the columns ``var``,
        ``skew`` and ``kur`` computed from ``EKG_EKG``.
    """
    # Column.between keeps rows whose id lies in [start, end], both inclusive.
    in_window = col('__index_level_0__').between(start, end)

    statistics = (
        variance('EKG_EKG').alias('var'),
        skewness('EKG_EKG').alias('skew'),
        kurtosis('EKG_EKG').alias('kur'),
    )

    return df.filter(in_window).groupBy('label').agg(*statistics)


In [6]:
def pipeline_ekg(info: dict, seizure: str) -> None:
    """Compute EKG features window by window for one recording and store them.

    Reads the parquet data under ``{path_data}{name}/{seizure}/EKG``, splits
    it into windows with ``get_intervals`` and appends the output of
    ``get_features_ekg`` for every window to the ``features`` sub-directory
    of that same path.

    Args:
        info: Recording metadata; ``info['name']`` (with any ``.edf`` removed)
            selects the directory and ``info['sfreq']`` is used as the
            window step.
        seizure: Segment directory name; also forwarded to ``get_intervals``
            as ``type_seizure``.
    """
    record = info['name'].replace('.edf', '')
    step = int(info['sfreq'])
    type_signal = 'EKG'
    path_file = f"{path_data}{record}/{seizure}/{type_signal}"

    logger.info(f"Path file: {path_file}")

    ekg = spark.read.parquet(path_file)
    ids = get_intervals(df=ekg, freq=step, type_seizure=seizure)

    logger.info('Starting writing')

    # NOTE(review): the output directory sits inside the directory that is
    # read above, and writes are appended — confirm that re-running this
    # does not re-read or duplicate previously written features.
    features_path = f"{path_file}/features"

    # One aggregation and one write per pair of consecutive boundaries.
    # NOTE(review): the end of each window is the start of the next one —
    # confirm that sharing the boundary id between windows is intended.
    for start, end in zip(ids, ids[1:]):
        features = get_features_ekg(df=ekg, start=start, end=end)
        features.write.mode("append").parquet(features_path)

    logger.info('Starting complete')


In [None]:

# Run the EKG feature pipeline on the ICTAL segment of every recording
# listed in `infos`.
for index, row in infos.iterrows():
    info = dict(row)
    seizure = 'ICTAL'
    # pipeline_ekg only takes `info` and `seizure`; the former `empty` and
    # `schema` arguments were undefined names and are not part of its
    # signature, so passing them made this cell fail.
    pipeline_ekg(info=info, seizure=seizure)


In [7]:
# Run the EKG feature pipeline on the N_ICTAL segment of every recording
# listed in `infos`.
for index, row in infos.iterrows():
    seizure = 'N_ICTAL'
    info = dict(row)
    pipeline_ekg(seizure=seizure, info=info)


INFO:__main__:Path file: /media/davi/6A81-05CF/physionet.org/files/siena-scalp-eeg/1.0.0/PN00/PN00-1/N_ICTAL/EKG
INFO:__main__:Range ids: 666624:702464
INFO:__main__:Seconds of intervals: 1302:1372
INFO:__main__:Starting writing
INFO:__main__:Starting complete
INFO:__main__:Path file: /media/davi/6A81-05CF/physionet.org/files/siena-scalp-eeg/1.0.0/PN00/PN00-2/N_ICTAL/EKG
INFO:__main__:Range ids: 478208:505856
INFO:__main__:Seconds of intervals: 934:988
INFO:__main__:Starting writing
INFO:__main__:Starting complete
INFO:__main__:Path file: /media/davi/6A81-05CF/physionet.org/files/siena-scalp-eeg/1.0.0/PN00/PN00-3/N_ICTAL/EKG
INFO:__main__:Range ids: 811008:841728
INFO:__main__:Seconds of intervals: 1584:1644
INFO:__main__:Starting writing
INFO:__main__:Starting complete
INFO:__main__:Path file: /media/davi/6A81-05CF/physionet.org/files/siena-scalp-eeg/1.0.0/PN00/PN00-4/N_ICTAL/EKG
INFO:__main__:Range ids: 464896:502784
INFO:__main__:Seconds of intervals: 908:982
INFO:__main__:Starting 

f"