In [3]:
import pandas as pd
import numpy as np
import os


def load_dataset(folder=None, filename=None, parse_to_sktime=True):
    """Load a processed dataset CSV and prepare it for analysis.

    If ``filename`` is given, that file is read directly. Otherwise
    ``folder`` is scanned for entries whose name starts with
    ``processed_dataset_`` and the one with the greatest
    ``os.path.getctime`` is read.

    After loading, the rows are shuffled with a fixed seed, ``target_cat``
    and every column whose name starts with ``datapoint`` are cast to
    ``float64``, and a ``datapoints_list`` column is added that holds each
    row's datapoint values as a Python list.

    Args:
        folder: Directory searched when ``filename`` is not provided.
            A falsy value means the current working directory.
        filename: Path of the CSV to read. Takes precedence over ``folder``.
        parse_to_sktime: Not used by this function; kept so existing
            callers that pass it keep working.

    Returns:
        pandas.DataFrame: the shuffled frame with the extra
        ``datapoints_list`` column.

    Raises:
        FileNotFoundError: if ``filename`` is not given and ``folder``
            contains no ``processed_dataset_*`` entry.
    """
    if not filename:
        search_dir = folder if folder else os.curdir
        # os.listdir yields bare entry names, so they must be joined back to
        # the directory before getctime / read_csv can resolve them.
        candidates = [
            os.path.join(search_dir, name)
            for name in os.listdir(search_dir)
            if name.startswith("processed_dataset_")
        ]
        if not candidates:
            raise FileNotFoundError(
                "no 'processed_dataset_*' file found in {!r}".format(search_dir)
            )
        #end if
        filename = max(candidates, key=os.path.getctime)
    #end if
    df = pd.read_csv(filename)

    # Shuffle the rows reproducibly (the original index labels are kept).
    df = df.sample(frac=1., random_state=1234)

    df.target_cat = df.target_cat.astype(np.float64)
    data_cols = [col for col in df if col.startswith('datapoint')]
    df[data_cols] = df[data_cols].astype(np.float64)
    df["datapoints_list"] = df[data_cols].values.tolist()
    #df = df.drop(columns=data_cols)
    return df
#end load_dataset

In [4]:
import itertools,operator

def get_0s_sequence_indexes(ll):
    """Return ``(start, end)`` positions of the longest run of zeros in ``ll``.

    Both positions are 0-based and inclusive. When several runs share the
    maximum length, the earliest one is returned. If ``ll`` contains no
    element equal to 0, the ``ValueError`` raised by ``max`` on an empty
    sequence propagates.
    """
    zero_runs = []
    # groupby splits the (position, value) pairs into runs of equal values.
    for value, members in itertools.groupby(enumerate(ll), key=operator.itemgetter(1)):
        if value == 0:
            positions = [pos for pos, _ in members]
            zero_runs.append((positions[0], positions[-1]))
        #end
    #end
    # max keeps the first maximal element, i.e. the earliest longest run.
    first, last = max(zero_runs, key=lambda span: span[1] - span[0])
    return first, last
#end

def index_longest_seq(A):
    """Return the 1-based position where the trailing run of zeros in ``A`` starts.

    The function counts how many consecutive elements at the end of ``A``
    are equal to 0 and returns ``len(A) + 1 - count``. If the last element
    is not 0 (or ``A`` is empty) the result is therefore ``len(A) + 1``.
    Despite the name, only the trailing run is considered, not the longest
    run anywhere in the sequence.
    """
    trailing_zeros = 0
    for value in reversed(A):
        if value == 0:
            trailing_zeros += 1
            continue
        #end
        break
    #end
    return len(A) + 1 - trailing_zeros
#end

In [5]:
# Load one specific processed dataset by explicit path, then show the dtype
# of every column as the cell output.
df = load_dataset(filename="../../datasets/processed_dataset_23-08-2023-18-46.csv")
df.dtypes


  df["datapoints_list"] = df[data_cols].values.tolist()


id_group            object
id_subject          object
key                 object
datapoint1         float64
datapoint2         float64
                    ...   
datapoint139       float64
datapoint140       float64
target             float64
target_cat         float64
datapoints_list     object
Length: 146, dtype: object

In [6]:
# Preview the first rows of the loaded frame.
# NOTE(review): the `key` column in the saved output appears to hold personal
# names and ID numbers — consider clearing outputs or hiding that column
# before sharing the notebook.
df.head()

Unnamed: 0,id_group,id_subject,key,datapoint1,datapoint2,datapoint3,datapoint4,datapoint5,datapoint6,datapoint7,...,datapoint134,datapoint135,datapoint136,datapoint137,datapoint138,datapoint139,datapoint140,target,target_cat,datapoints_list
71,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,1a9b9fe6-2f12-4890-993e-2e3d3ba91699,Kauê Viana de Souza (20102031),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.5,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
51,fe968227-cc73-4ffa-87d1-ca8843c63f99,8de0e2da-bd7c-4d51-ae32-701cc3cd1053,Julio Alberto Dahm Lima (20207355),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.5,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
67,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,f9de4534-bea1-4600-adc9-891a191e147c,Gilson Ricardo da Silva (19105505),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0, 0.0,..."
59,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,30e02485-80dc-4d6d-af36-7ba750fff4dc,Bruno Marcolino Luiz (20150482),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
35,106ecbe7-a885-4dbb-8d8b-871370201ce8,0f0bef41-cf5b-4e20-ae32-0700fb856b6b,Marco Antonio Accorsi Lisboa Filho (18204422),0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [7]:
# Derive the survival-analysis columns from each row's datapoints:
#   event         - 1 when target == 0 (treated as a drop-out), else 0
#   time_drop     - 0-based start of the longest run of zeros for drop-outs,
#                   otherwise the number of datapoints
#   time_drop_mod - 1-based start of the trailing run of zeros for drop-outs,
#                   otherwise the number of datapoints
time_col = []
time_col_mod = []
event_col = []

# Only two columns are needed per row, so iterate them directly instead of
# materialising a Series for every row with iterrows().
for target, datapoints in zip(df["target"], df["datapoints_list"]):
    if target == 0:
        # Index instead of unpacking into `_`, which would overwrite
        # IPython's last-output variable.
        time_col.append(get_0s_sequence_indexes(datapoints)[0])
        time_col_mod.append(index_longest_seq(datapoints))
        event_col.append(1)
    else: #the subject did not drop out
        time_col.append(len(datapoints))
        time_col_mod.append(len(datapoints))
        event_col.append(0)
    #end if
#end for

# Attach all three columns in a single concat rather than three separate
# insertions, which is what pandas recommends for wide frames. Dropping any
# previous copies first keeps the cell safe to re-run.
survival_cols = pd.DataFrame(
    {"event": event_col, "time_drop": time_col, "time_drop_mod": time_col_mod},
    index=df.index,
)
df = pd.concat(
    [df.drop(columns=list(survival_cols.columns), errors="ignore"), survival_cols],
    axis=1,
)

  df["event"] = event_col
  df["time_drop"] = time_col
  df["time_drop_mod"] = time_col_mod


In [8]:
# Preview again to check the appended event / time_drop / time_drop_mod columns.
df.head()

Unnamed: 0,id_group,id_subject,key,datapoint1,datapoint2,datapoint3,datapoint4,datapoint5,datapoint6,datapoint7,...,datapoint137,datapoint138,datapoint139,datapoint140,target,target_cat,datapoints_list,event,time_drop,time_drop_mod
71,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,1a9b9fe6-2f12-4890-993e-2e3d3ba91699,Kauê Viana de Souza (20102031),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.5,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,140,140
51,fe968227-cc73-4ffa-87d1-ca8843c63f99,8de0e2da-bd7c-4d51-ae32-701cc3cd1053,Julio Alberto Dahm Lima (20207355),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.5,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,140,140
67,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,f9de4534-bea1-4600-adc9-891a191e147c,Gilson Ricardo da Silva (19105505),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0, 0.0,...",0,140,140
59,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,30e02485-80dc-4d6d-af36-7ba750fff4dc,Bruno Marcolino Luiz (20150482),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,14,15
35,106ecbe7-a885-4dbb-8d8b-871370201ce8,0f0bef41-cf5b-4e20-ae32-0700fb856b6b,Marco Antonio Accorsi Lisboa Filho (18204422),0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,1.0,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,140,140


In [9]:
import datetime
# Timestamp (day-month-year-hour-minute) used to give each export a distinct file name.
today_ts = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M")

# Write the frame, including the derived survival columns, without the index.
df.to_csv(
    "../../datasets/sa_ready_dataset_{}.csv".format(today_ts),
    sep=',',
    encoding='utf-8',
    index=False,
)

In [10]:
# Show the first 10 rows with target == 0, i.e. the rows given event = 1,
# to eyeball their time_drop / time_drop_mod values.
df.query("target ==  0 ").head(10)

Unnamed: 0,id_group,id_subject,key,datapoint1,datapoint2,datapoint3,datapoint4,datapoint5,datapoint6,datapoint7,...,datapoint137,datapoint138,datapoint139,datapoint140,target,target_cat,datapoints_list,event,time_drop,time_drop_mod
59,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,30e02485-80dc-4d6d-af36-7ba750fff4dc,Bruno Marcolino Luiz (20150482),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,14,15
55,fe968227-cc73-4ffa-87d1-ca8843c63f99,feab185c-5309-4c82-847e-8075512eaf02,Suelen Luiza Constante de Estefani (20206424),0.0,0.0,0.0,0.0,0.0,0.0,27.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 27.0, 35.0, 34....",1,75,76
66,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,4384cf1e-37fb-46d2-803b-4717cc5d291b,Fernando dos Santos Dornelles (18150481),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,36,37
44,fe968227-cc73-4ffa-87d1-ca8843c63f99,6b08dff9-eb5d-4525-a0c4-6d66242c9a55,Gabriel Cardoso da Silva (20205233),0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 11.0,...",1,49,50
64,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,9cb1137c-f8f7-48a2-b0f9-fa8fac061dfc,Erick Raimundo Oliveira Athaydes (17206693),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...",1,64,65
79,a2ccb8b9-1324-41e9-88a5-4bab83a2671c,1eaa44d7-f100-4b4a-928a-f82acc985954,Natalia Bortoli Vieira (17203199),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,47,108
9,83758b93-431a-4def-b625-71d8ee9c4520,9bcb17f8-db08-4eac-8fd8-321b019829bc,Jair Filipe Peruchi Cardoso (20102387),0.0,0.0,0.0,0.0,0.0,0.0,16.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16.0, 0.0, 0.0,...",1,77,135
56,fe968227-cc73-4ffa-87d1-ca8843c63f99,4298c0d6-12cc-4f85-95e3-c8fd1151128b,Viviane de Vargas Pereira (19102131),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,45,114
5,83758b93-431a-4def-b625-71d8ee9c4520,04d00228-a766-4cbb-abd3-06164919f1ad,Everton Cruz Soares (20104573),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,57,58
32,106ecbe7-a885-4dbb-8d8b-871370201ce8,d4d20a9c-efb8-432f-9d05-25dd52e58f24,João Eduardo de Aquino Silveira (19207028),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,0,74


In [11]:
# Total number of rows flagged as events (event == 1).
df.event.sum()

19

In [12]:
# Shape of the target == 0 subset; its row count is the cross-check for the event total.
df.query("target ==  0.0 ").shape

(19, 149)

In [13]:
# Dump every target value as a plain Python list.
# NOTE(review): this prints one line per row; a summary such as
# value_counts() or describe() would be more compact — confirm intent.
df.target.tolist()

[6.5,
 8.5,
 8.0,
 0.0,
 2.0,
 7.0,
 8.5,
 8.5,
 8.0,
 9.5,
 0.5,
 9.0,
 0.0,
 6.0,
 8.0,
 2.0,
 9.5,
 0.0,
 6.0,
 7.0,
 1.0,
 9.5,
 9.0,
 9.0,
 0.0,
 7.5,
 7.5,
 0.5,
 0.0,
 0.0,
 6.0,
 0.0,
 0.0,
 6.5,
 9.0,
 2.0,
 9.0,
 0.0,
 8.5,
 0.0,
 9.5,
 10.0,
 0.5,
 6.5,
 8.0,
 9.5,
 0.0,
 9.5,
 9.5,
 0.5,
 0.5,
 0.0,
 6.0,
 6.5,
 6.5,
 8.5,
 6.0,
 0.0,
 9.0,
 9.0,
 9.0,
 10.0,
 9.0,
 8.0,
 9.0,
 10.0,
 9.0,
 6.0,
 7.5,
 3.0,
 6.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.0,
 9.5,
 6.0,
 0.0,
 6.5,
 0.0,
 9.5,
 7.0,
 10.0,
 6.5]