# Agregando Informações de Pacientes

In [47]:
import pandas as pd
from datetime import datetime
import json

In [29]:
# Variáveis de ambiente
import os
from os.path import join, dirname
from dotenv import load_dotenv

# dirname() is applied to the literal string '__file__' (not the __file__
# variable), which yields '', so the path below is simply '.env' relative to
# the current working directory.
dotenv_path = join(dirname('__file__'), '.env')
load_dotenv(dotenv_path)

# Base directories used by the rest of the notebook; each one is None when
# the corresponding environment variable is not set.
ROOT_PATH = os.getenv("ROOT_PATH")
SOURCE_PATH = os.getenv("SOURCE_PATH")
DATALAKE_PATH = os.getenv("DATALAKE_PATH")

## 1° Pipeline

In [30]:
# String patterns
def get_patterns(file):
    """Extract the labelled fields found in an iterable of text lines.

    Every line is checked against each known label. For each label the line
    contains, a single-entry dict ``{key: value}`` is produced, where *value*
    is the line with the label text removed and surrounding whitespace
    stripped.

    Args:
        file: Iterable of strings (e.g. the lines of a seizure-list file).

    Returns:
        A list of single-entry dicts, in the order the fields were found.
    """
    labelled_fields = (
        ('name', 'File name:'),
        ('registration_start', 'Registration start time:'),
        ('registration_end', 'Registration end time:'),
        ('seizure_start', 'Seizure start time:'),
        ('seizure_end', 'Seizure end time:'),
    )

    found = []

    for line in file:
        found.extend(
            {key: line.replace(marker, '').strip()}
            for key, marker in labelled_fields
            if marker in line
        )

    return found


# Grouping the extracted fields
def set_intervals(data):
    """Split a flat list of field dicts into groups that each start at a 'name' entry.

    A new group begins at every dict containing the key ``'name'`` and runs
    up to (not including) the next such dict; the last group runs to the end
    of the list. Entries that come before the first ``'name'`` dict are not
    included in any group.

    Args:
        data: List of dicts, as produced by ``get_patterns``.

    Returns:
        A list of sub-lists of *data*; empty when no dict has a ``'name'`` key.
    """
    starts = [pos for pos, entry in enumerate(data) if 'name' in entry]

    # Each group ends where the next one starts; None lets the final slice
    # extend to the end of the list.
    stops = starts[1:] + [None]

    return [data[begin:end] for begin, end in zip(starts, stops)]




def dict_update(data):
    """Merge an iterable of dicts into one new dict.

    When the same key appears more than once, the value from the later dict
    wins.

    Args:
        data: Iterable of dicts.

    Returns:
        A new dict holding the combined key/value pairs.
    """
    return {key: value for part in data for key, value in part.items()}


def pipeline(file):
    """Turn the lines of a seizure-list file into one dict per 'name' group.

    Chains ``get_patterns`` (field extraction), ``set_intervals`` (grouping
    by 'name' entry) and ``dict_update`` (merging each group).

    Args:
        file: Iterable of text lines.

    Returns:
        A list of dicts, one per group.
    """
    groups = set_intervals(get_patterns(file))

    return list(map(dict_update, groups))

## 2° Pipeline

In [39]:
def split_time(label, data):
    """Return the integer parts of the dot-separated time string in ``data[label]``.

    Args:
        label: Key to look up in *data*.
        data: Mapping whose value under *label* is a string such as
            ``'19.39.33'``.

    Returns:
        A list of ints, one per dot-separated component.

    Raises:
        KeyError: If *label* is not in *data*.
        ValueError: If a component is not an integer.
    """
    return [int(x) for x in data[label].split('.')]


def set_hour(label, data, base=None):
    """Build a datetime whose time of day comes from ``data[label]``.

    Args:
        label: Key to look up in *data*.
        data: Mapping holding an ``H.M.S`` string under *label*.
        base: Datetime supplying the date part. Defaults to the current
            local date/time. Pass the same *base* for every timestamp that
            will be compared, so that all of them share one date.

    Returns:
        *base* with hour, minute and second replaced and microseconds zeroed.

    Raises:
        KeyError: If *label* is not in *data*.
        ValueError: If the string does not hold exactly three integer parts
            or they do not form a valid time of day.
    """
    h, m, s = split_time(label, data)

    if base is None:
        base = datetime.now()

    return base.replace(hour=h, minute=m, second=s, microsecond=0)


def pipeline2(d: dict) -> dict:
    """Add recording duration and seizure offsets, in seconds, to *d*.

    Reads the ``H.M.S`` strings under ``'registration_start'``,
    ``'registration_end'``, ``'seizure_start'`` and ``'seizure_end'`` and
    stores three integers back into *d*:

    * ``'total'``: seconds from registration start to registration end;
    * ``'seizure_start_sec'``: seconds from registration start to seizure
      start;
    * ``'seizure_end_sec'``: seconds from registration start to seizure end.

    Args:
        d: Record dict; it is modified in place.

    Returns:
        The same dict *d*.

    Raises:
        KeyError: If any of the four time fields is missing.
        ValueError: If a time field cannot be parsed.
    """
    seconds_per_day = 24 * 60 * 60

    # One shared anchor so all four timestamps carry the same date, even if
    # this function happens to run across a date change.
    anchor = datetime.now()

    registration_start = set_hour('registration_start', d, anchor)
    registration_end = set_hour('registration_end', d, anchor)
    seizure_start = set_hour('seizure_start', d, anchor)
    seizure_end = set_hour('seizure_end', d, anchor)

    def elapsed(moment):
        # Only the time of day is known, so a moment that is "earlier" than
        # the registration start must belong to the following day; the
        # modulo wraps such negative differences into the 0..86399 range.
        delta = moment - registration_start
        return int(delta.total_seconds()) % seconds_per_day

    # Offsets are measured forward from the start of the registration.
    d['total'] = elapsed(registration_end)
    d['seizure_start_sec'] = elapsed(seizure_start)
    d['seizure_end_sec'] = elapsed(seizure_end)

    return d

In [58]:
def format_source_types(file):
    """Cast the numeric fields of a subject record to built-in ``int``.

    The values under ``'age_years'``, ``'eeg_channel'``,
    ``'number_seizures'`` and ``'rec_time_minutes'`` are converted in that
    order; every other entry is left untouched.

    Args:
        file: Subject record dict; it is modified in place.

    Returns:
        The same dict, with the four fields converted.

    Raises:
        KeyError: If one of the four fields is missing.
        ValueError: If a value cannot be converted by ``int``.
    """
    int_fields = ('age_years', 'eeg_channel', 'number_seizures', 'rec_time_minutes')

    for field in int_fields:
        file[field] = int(file[field])

    return file


## Executando

In [32]:
# Load the subject table from <ROOT_PATH>/docs/subject_info.csv into a DataFrame.
sb_infos = pd.read_csv(f"{ROOT_PATH}/docs/subject_info.csv")


In [33]:
# Normalize the column labels by removing every space character from them
sb_infos.columns = sb_infos.columns.map(lambda label: label.replace(' ', ''))

In [59]:
# Build one record per subject: the subject's CSV row plus the list of
# seizure collections parsed from that subject's seizure-list file.
data = []

for idx in range(len(sb_infos)):
    patient = dict(sb_infos.iloc[idx])

    patient = format_source_types(patient)

    try:
        file = f"{SOURCE_PATH}/{patient['patient_id']}/Seizures-list-{patient['patient_id']}.txt"

        # Read every line up front inside a context manager so the file
        # handle is closed as soon as the lines are in memory.
        with open(file, "r") as handle:
            f = handle.readlines()

        data1 = pipeline(f)

        data2 = [pipeline2(d) for d in data1]

    except KeyError as err:
        # Best effort: a subject whose record lacks an expected field is
        # left out of the output, but the skip is reported instead of
        # passing silently.
        print(f"Skipping patient {patient.get('patient_id')}: missing field {err}")
        continue

    patient['collections'] = data2

    data.append(patient)

In [60]:
# Serialize the collected subject records to <ROOT_PATH>/docs/infos.json,
# replacing any existing file.
with open(f"{ROOT_PATH}/docs/infos.json", mode="w") as outfile:
    json.dump(obj=data, fp=outfile)