# Environment Setup

In [1]:
# !pip install tsfresh

# Bibliotecas

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
# from tsfresh import extract_features
# from tsfresh.feature_extraction import MinimalFCParameters, ComprehensiveFCParameters, EfficientFCParameters

# Funções

## calculate_metrics_specific

In [3]:
def calculate_metrics_specific(df):
    """Summarise one sensor recording into a single row of features.

    Missing values are forward-filled and then back-filled. The back fill
    covers gaps at the very start of the recording, which a forward fill
    alone cannot reach; without it a leading NaN turns ``enmo_auc`` into NaN
    and makes the sums (which skip NaN) inconsistent with ``n_samples``.
    A column that is missing entirely stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'X', 'Y', 'Z', 'enmo', 'anglez' and 'light'.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with 33 columns: six metrics for each of X, Y, Z
        and five each for enmo, anglez and light.

    Raises
    ------
    KeyError
        If one of the required columns is absent from ``df``.
    """
    results = {}

    # Fill gaps: forward fill first, then back fill for leading gaps.
    df = df.ffill().bfill()

    # Number of samples, used to normalise the cumulative metrics.
    n_samples = len(df)

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use whichever one this NumPy version provides.
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Metrics for 'X', 'Y', 'Z'
    for var in ['X', 'Y', 'Z']:
        series = df[var]
        results[f'{var}_mean'] = series.mean()  # Mean
        results[f'{var}_std'] = series.std()  # Standard deviation
        results[f'{var}_amplitude'] = series.max() - series.min()  # Range (max - min)
        results[f'{var}_energy'] = (series**2).sum() / n_samples  # Mean squared value
        results[f'{var}_inactivity'] = (abs(series) < 0.1).sum() / n_samples  # Share of samples with |value| < 0.1
        results[f'{var}_autocorr_lag1'] = series.autocorr(lag=1)  # Lag-1 autocorrelation

    # Metrics for 'enmo'
    enmo = df['enmo']
    results['enmo_mean'] = enmo.mean()  # Mean
    results['enmo_skew'] = skew(enmo, nan_policy='omit')  # Skewness
    results['enmo_kurtosis'] = kurtosis(enmo, nan_policy='omit')  # Kurtosis
    results['enmo_auc'] = trapezoid(enmo) / n_samples  # Trapezoidal area (unit spacing) per sample
    results['enmo_above_threshold'] = (enmo > 0.5).sum() / n_samples  # Share of samples above 0.5

    # Metrics for 'anglez'
    anglez = df['anglez']
    results['anglez_mean'] = anglez.mean()  # Mean
    results['anglez_variance'] = anglez.var()  # Variance
    results['anglez_stability'] = ((anglez > -5) & (anglez < 5)).sum() / n_samples  # Share of samples strictly inside (-5, 5)
    results['anglez_mean_change'] = anglez.diff().abs().mean()  # Mean absolute step-to-step change
    # A sign change between consecutive samples gives a negative product.
    results['anglez_zero_crossings'] = ((anglez.shift(1) * anglez) < 0).sum() / n_samples

    # Metrics for 'light'
    light = df['light']
    results['light_mean'] = light.mean()  # Mean
    results['light_max'] = light.max()  # Maximum
    results['light_cv'] = light.std() / light.mean()  # Coefficient of variation
    # Entropy of the readings normalised to sum to 1; the 1e-10 keeps log2
    # finite for zero readings, and nansum ignores NaN terms.
    light_share = light / light.sum()
    results['light_entropy'] = -np.nansum(light_share * np.log2(light_share + 1e-10))
    results['light_above_500_lux'] = (light > 500).sum() / n_samples  # Share of samples above 500

    # Return the metrics as a one-row DataFrame.
    return pd.DataFrame([results])

# Dados

In [5]:
# Load the series of a single participant (folder id=00115b9f) to inspect its structure.
file = '/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=00115b9f/part-0.parquet'
df = pd.read_parquet(file)
df

Unnamed: 0,step,X,Y,Z,enmo,anglez,non-wear_flag,light,battery_voltage,time_of_day,weekday,quarter,relative_date_PCIAT
0,0,0.021536,0.022214,-1.022370,0.022853,-88.280762,0.0,53.000000,4188.000000,56940000000000,4,3,41.0
1,1,0.022005,0.022187,-1.019740,0.020231,-88.241707,0.0,51.666668,4188.166504,56945000000000,4,3,41.0
2,2,0.022240,0.022005,-1.019401,0.019893,-88.170067,0.0,50.333332,4188.333496,56950000000000,4,3,41.0
3,3,0.021589,0.022578,-1.018177,0.018667,-88.250031,0.0,50.500000,4188.500000,56955000000000,4,3,41.0
4,4,0.022005,0.023763,-1.014323,0.016848,-88.130775,0.0,33.166668,4181.000000,57235000000000,4,3,41.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43325,43325,-0.008333,-0.023620,-1.006901,0.007224,-88.595741,0.0,0.000000,3824.000000,57580000000000,6,3,85.0
43326,43326,0.048730,-0.076725,-0.953776,0.056839,-85.352219,0.0,0.000000,3824.000000,57640000000000,6,3,85.0
43327,43327,0.387370,0.793151,-0.402214,0.069961,-24.097908,0.0,0.000000,3824.000000,57645000000000,6,3,85.0
43328,43328,0.801953,0.501589,-0.040937,0.045489,-2.113776,0.0,0.000000,3824.000000,57650000000000,6,3,85.0


# Extração de Características das Séries Temporais

In [6]:
import pandas as pd
import os

# Directory where the series files are located.
base_dir = "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/"

# Never populated in this cell; kept only so the name stays defined.
data_by_id = {}

# One single-row feature frame per parquet file. They are concatenated once
# after the loop: calling pd.concat on every iteration re-copies all the rows
# gathered so far, which is quadratic in the number of files.
metrics_frames = []

# Walk the directory tree and process every parquet file found.
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if not file.endswith(".parquet"):
            continue

        # Full path of the file.
        file_path = os.path.join(root, file)

        # The id comes from the parent directory name.
        id_name = os.path.basename(os.path.dirname(file_path))

        # Read the parquet file and summarise it into one row of features.
        data = pd.read_parquet(file_path)
        metrics = calculate_metrics_specific(data[['X','Y','Z','enmo','anglez','light']])

        # Strip the "id=" prefix only when it is really there, instead of
        # blindly dropping the first three characters.
        metrics['id'] = id_name[3:] if id_name.startswith('id=') else id_name

        metrics_frames.append(metrics)

# Each row keeps its original index label (0), exactly as the incremental
# concat produced. With no files found the result is an empty DataFrame
# (pd.concat raises on an empty list).
df_metrics = pd.concat(metrics_frames, axis='rows') if metrics_frames else pd.DataFrame()
df_metrics

Unnamed: 0,X_mean,X_std,X_amplitude,X_energy,X_inactivity,X_autocorr_lag1,Y_mean,Y_std,Y_amplitude,Y_energy,...,anglez_variance,anglez_stability,anglez_mean_change,anglez_zero_crossings,light_mean,light_max,light_cv,light_entropy,light_above_500_lux,id
0,-0.054638,0.633126,3.662422,0.403825,0.226763,0.963399,-0.163923,0.513286,6.211563,0.290328,...,1219.257812,0.125689,7.031084,0.097229,46.009533,2626.199951,4.474338,12.730483,0.020274,0745c390
0,0.113277,0.507897,3.736724,0.270790,0.181952,0.936537,0.093139,0.541129,6.122276,0.301495,...,1939.056641,0.092705,5.770268,0.079933,56.437958,2628.199951,3.661102,15.839968,0.027371,eaab7a96
0,-0.499738,0.454021,2.924792,0.455867,0.065045,0.896022,0.046381,0.510668,4.166693,0.262926,...,748.980774,0.136115,9.227910,0.126591,77.305130,2618.199951,3.555367,12.421968,0.045947,8ec2cc63
0,0.007430,0.586100,7.592625,0.343567,0.085698,0.916840,0.007583,0.542189,4.489431,0.294025,...,1059.687500,0.111900,10.424760,0.133720,9.369678,2502.000000,5.774415,15.131376,0.002340,b2987a65
0,0.086653,0.509845,4.907422,0.267434,0.203372,0.945121,-0.115162,0.494897,2.104661,0.258169,...,2297.641846,0.087419,6.566788,0.066861,5.049157,1046.800049,3.087797,12.704145,0.000195,7b8842c3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-0.067798,0.591072,2.899334,0.353962,0.099914,0.944850,0.006292,0.481551,4.074034,0.231930,...,1517.174072,0.065892,7.050254,0.057309,8.857800,2576.399902,6.001390,15.987559,0.002055,cd68643b
0,0.097154,0.269882,2.418437,0.082214,0.241841,0.795383,-0.356072,0.530971,2.007578,0.408482,...,2621.728027,0.089540,5.506832,0.060251,35.192787,1526.599976,2.207614,8.814626,0.005021,f8ff0bc8
0,-0.147508,0.478085,2.507981,0.250323,0.236342,0.958017,-0.047232,0.499994,4.001310,0.252225,...,2305.686279,0.062761,4.585949,0.063325,10.387013,2592.199951,7.288898,15.184383,0.003647,db23fbe4
0,-0.441574,0.502446,2.077995,0.447433,0.059000,0.943245,-0.080691,0.457471,2.436732,0.215786,...,1062.122681,0.148410,7.069418,0.120020,11.325677,1875.199951,3.091885,13.928528,0.000674,687c85e7


In [7]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index instead of keeping it as a column.
df_metrics = df_metrics.reset_index(drop=True)
df_metrics

Unnamed: 0,X_mean,X_std,X_amplitude,X_energy,X_inactivity,X_autocorr_lag1,Y_mean,Y_std,Y_amplitude,Y_energy,...,anglez_variance,anglez_stability,anglez_mean_change,anglez_zero_crossings,light_mean,light_max,light_cv,light_entropy,light_above_500_lux,id
0,-0.054638,0.633126,3.662422,0.403825,0.226763,0.963399,-0.163923,0.513286,6.211563,0.290328,...,1219.257812,0.125689,7.031084,0.097229,46.009533,2626.199951,4.474338,12.730483,0.020274,0745c390
1,0.113277,0.507897,3.736724,0.270790,0.181952,0.936537,0.093139,0.541129,6.122276,0.301495,...,1939.056641,0.092705,5.770268,0.079933,56.437958,2628.199951,3.661102,15.839968,0.027371,eaab7a96
2,-0.499738,0.454021,2.924792,0.455867,0.065045,0.896022,0.046381,0.510668,4.166693,0.262926,...,748.980774,0.136115,9.227910,0.126591,77.305130,2618.199951,3.555367,12.421968,0.045947,8ec2cc63
3,0.007430,0.586100,7.592625,0.343567,0.085698,0.916840,0.007583,0.542189,4.489431,0.294025,...,1059.687500,0.111900,10.424760,0.133720,9.369678,2502.000000,5.774415,15.131376,0.002340,b2987a65
4,0.086653,0.509845,4.907422,0.267434,0.203372,0.945121,-0.115162,0.494897,2.104661,0.258169,...,2297.641846,0.087419,6.566788,0.066861,5.049157,1046.800049,3.087797,12.704145,0.000195,7b8842c3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,-0.067798,0.591072,2.899334,0.353962,0.099914,0.944850,0.006292,0.481551,4.074034,0.231930,...,1517.174072,0.065892,7.050254,0.057309,8.857800,2576.399902,6.001390,15.987559,0.002055,cd68643b
992,0.097154,0.269882,2.418437,0.082214,0.241841,0.795383,-0.356072,0.530971,2.007578,0.408482,...,2621.728027,0.089540,5.506832,0.060251,35.192787,1526.599976,2.207614,8.814626,0.005021,f8ff0bc8
993,-0.147508,0.478085,2.507981,0.250323,0.236342,0.958017,-0.047232,0.499994,4.001310,0.252225,...,2305.686279,0.062761,4.585949,0.063325,10.387013,2592.199951,7.288898,15.184383,0.003647,db23fbe4
994,-0.441574,0.502446,2.077995,0.447433,0.059000,0.943245,-0.080691,0.457471,2.436732,0.215786,...,1062.122681,0.148410,7.069418,0.120020,11.325677,1875.199951,3.091885,13.928528,0.000674,687c85e7


In [8]:
# Write the feature table to a CSV file in the working directory.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# consider index=False if downstream readers do not need it — confirm first.
df_metrics.to_csv('time_series_features.csv')

# Rascunho

In [9]:
import os

# Root directory that is scanned for series files.
base_dir = "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/"

# One {"id", "file_path"} record per parquet file; "id" is the name of the
# directory that directly contains the file.
id_file_list = []

for folder, _subdirs, names in os.walk(base_dir):
    parquet_paths = [
        os.path.join(folder, name) for name in names if name.endswith(".parquet")
    ]
    id_file_list.extend(
        {"id": os.path.basename(os.path.dirname(path)), "file_path": path}
        for path in parquet_paths
    )

In [10]:
# Load the tabular training data.
# NOTE: this rebinds `df`, which earlier held a single participant's series.
dados_tabulares = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
df = pd.read_csv(dados_tabulares)
df

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,Winter,0.0,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0.0


In [11]:
# Number of distinct ids in the tabular data.
df['id'].nunique()

3960

In [12]:
# Display the tabular data again.
df

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,Winter,0.0,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0.0


In [13]:
# The left join keeps every row of the tabular data; ids with no matching row
# in df_metrics get NaN in the time-series feature columns.
# validate='one_to_one' makes pandas raise MergeError if an id is repeated on
# either side, instead of silently duplicating rows in the result.
df_completo = pd.merge(df, df_metrics, on='id', how='left', validate='one_to_one')
df_completo

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,anglez_mean,anglez_variance,anglez_stability,anglez_mean_change,anglez_zero_crossings,light_mean,light_max,light_cv,light_entropy,light_above_500_lux
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,-10.580416,1844.459473,0.101339,8.598380,0.106324,42.296310,2633.250000,4.921682,12.206059,0.018463
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,,,,,,,,,,
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,,,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,-12.374899,1512.892700,0.099581,6.391167,0.085407,22.480034,2605.750000,4.908380,16.011845,0.007757
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,27.160025,4547.101562,0.054459,1.327870,0.020386,7.501198,1186.599976,1.912873,17.542433,0.000020


In [14]:
# Print the index range, column count, dtypes and memory usage of the merged frame.
df_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Columns: 115 entries, id to light_above_500_lux
dtypes: float32(17), float64(84), int64(2), object(12)
memory usage: 3.2+ MB
