In [None]:
# stdlib
import sys
import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesSurvivalDataLoader

log.add(sink=sys.stderr, level="INFO")
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Ejemplo

In [2]:
# stdlib
import sys
import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader

log.add(sink=sys.stderr, level="INFO")
warnings.filterwarnings("ignore")

In [84]:
# stdlib
import datetime
import uuid

# third party
# import libraries for generating simulated data
import numpy as np
import pandas as pd

# set the number of individuals and observations per individual you want to generate
num_subj = 200
num_obs = 10

# generate static data
ids = [uuid.uuid4().hex[:6].upper() for i in range(num_subj)]
static_data = pd.DataFrame(
    {
        "id": ids,
        "var_a": np.random.randint(2, size=(num_subj)),
        "var_b": np.random.normal(loc=2, scale=0.5, size=(num_subj)),
        "outcome": np.random.binomial(1, 0.7, size=(num_subj)),
    }
)

# generate temporal data
temp_len = num_obs * len(ids)
temp_ids = ids * num_obs
timepoints = [i for i in range(num_obs)] * num_subj

temporal_data = pd.DataFrame(
    {
        "id": temp_ids,
        "temp_a": np.random.normal(loc=0, scale=0.2, size=(temp_len)),
        "temp_b": np.random.normal(loc=5, scale=1, size=(temp_len)),
        "temp_c": np.random.binomial(1, 0.5, size=(temp_len)),
        "timepoint": timepoints,
    }
)

In [85]:
# rearrange static data
outcome_data = static_data[["outcome"]]
static_data = static_data.drop(columns=["outcome"])

# rearrange temporal data
observation_data, temporal_dataframes = ([] for i in range(2))
for id in static_data["id"].unique():
    temp_df = temporal_data[temporal_data["id"] == id]
    observations = temp_df["timepoint"].tolist()
    temp_df.set_index("timepoint", inplace=True)
    temp_df = temp_df.drop(columns=["id"])
    # add each to list
    observation_data.append(observations)
    temporal_dataframes.append(temp_df)


In [86]:
outcome_data

Unnamed: 0,outcome
0,1
1,1
2,0
3,1
4,1
...,...
195,1
196,1
197,0
198,1


In [88]:
len(static_data)
static_data

Unnamed: 0,id,var_a,var_b
0,615A6E,1,2.197626
1,FE2233,0,1.872536
2,E174D9,0,2.182496
3,A637A2,0,2.671294
4,E0180B,0,1.518642
...,...,...,...
195,1A8EE5,1,2.181504
196,85BFF6,1,2.004823
197,016A0A,0,2.285084
198,D217E6,1,2.450679


In [7]:
static_data

Unnamed: 0,id,var_a,var_b
0,9A2F5B,0,1.828560
1,E1318C,1,1.902427
2,D3AC60,1,2.895921
3,4DD2CE,1,2.243258
4,FD231E,1,2.083909
...,...,...,...
195,F9A12E,1,1.313767
196,9BDE9F,1,1.619937
197,8687CE,1,2.225457
198,B8C65A,1,0.879029


In [93]:
observation_data[2]

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

In [94]:
temporal_dataframes[2]

Unnamed: 0_level_0,temp_a,temp_b,temp_c
timepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.284381,4.571139,0
2,-0.258637,5.929148,0
2,0.131625,4.362734,1
2,0.461439,6.015419,0
2,0.377672,5.081388,1
2,0.371049,4.056295,1
2,0.308486,2.784473,0
2,0.193602,6.075643,0
2,0.049525,7.094383,0
2,-0.122024,6.380099,0


In [90]:
temporal_dataframes

[             temp_a    temp_b  temp_c
 timepoint                            
 0          0.097274  4.205577       1
 0          0.159255  5.848780       1
 0          0.067188  6.267811       0
 0         -0.073607  5.809085       0
 0         -0.167332  4.865429       1
 0         -0.020659  6.031925       1
 0          0.248652  5.959658       1
 0          0.061587  4.863666       1
 0         -0.082816  6.580555       1
 0          0.301776  4.385487       1,
              temp_a    temp_b  temp_c
 timepoint                            
 1          0.149184  4.890551       0
 1         -0.154776  4.260784       0
 1         -0.043498  5.264660       1
 1          0.014324  5.828754       1
 1          0.011502  4.932455       0
 1         -0.093241  3.110291       1
 1         -0.195266  4.811769       0
 1         -0.054741  5.591359       0
 1          0.164412  5.109744       1
 1         -0.012904  4.863962       0,
              temp_a    temp_b  temp_c
 timepoint             

In [8]:
print(len(temporal_dataframes))
for i in temporal_dataframes:
    print(len(i))

200
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10


In [9]:
temporal_data

Unnamed: 0,id,temp_a,temp_b,temp_c,timepoint
0,9A2F5B,0.588447,6.396517,1,0
1,E1318C,0.027274,3.886206,0,1
2,D3AC60,0.098256,5.251266,0,2
3,4DD2CE,-0.129277,3.623033,0,3
4,FD231E,0.237149,3.852978,0,4
...,...,...,...,...,...
1995,F9A12E,0.495580,4.662043,1,5
1996,9BDE9F,0.222206,5.764439,1,6
1997,8687CE,0.042065,5.651513,1,7
1998,B8C65A,0.081813,5.485687,1,8


In [10]:
print(len(temporal_data))
for i in temporal_dataframes:
    print(i.shape)

2000
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10

Mi modelo

# Preprocesamiento de los datos

In [2]:
import pandas as pd
df =  pd.read_csv('/Users/cgarciay/Desktop/Laval_Master_Computer/research/generative_input/input_onehot_encoding.csv')
adm = pd.read_csv('./data/data_preprocess_nonfilteres.csv')
res = pd.merge(adm[["HOSPITAL_EXPIRE_FLAG","SUBJECT_ID","HADM_ID","DOB"]],df, on=["SUBJECT_ID","HADM_ID"], how='right')
# Limitar las fechas a un rango permitido

# Ahora deberías poder restar las fechas sin problemas

res['ADMITTIME'] = pd.to_datetime(res['ADMITTIME'])
#res['DOB'] = pd.to_datetime(res['DOB'], format='%Y-%m-%d %H:%M:%S')
#res['DOB'] = pd.to_datetime(res['DOB'])


res = res.fillna(0)
outcome =  res.groupby('SUBJECT_ID')['HOSPITAL_EXPIRE_FLAG'].max()

print(outcome.shape)

(44952,)


In [3]:


print(res.shape, adm.shape, df.shape)
# Assuming df is your DataFrame

# Find columns that contain 'unnamed' in their name
cols_to_drop = res.filter(like='Unnamed', axis=1).columns
res.drop(cols_to_drop, axis=1, inplace=True)
print(res.shape, adm.shape, df.shape)

print(res.isnull().sum().sum())
res = res.fillna(0)
print(res.isnull().sum().sum())

(56678, 691) (58976, 41) (56678, 689)
(56678, 690) (58976, 41) (56678, 689)
0
0


loader = TimeSeriesSurvivalDataLoader(
    temporal_data=temporal_surv,
    observation_times=temporal_surv_horizons,
    static_data=static_surv,
    T=T,
    E=E,
    time_horizons=time_horizons,
)

loader.dataframe()

In [4]:
# Lista de nombres de columnas a buscar
static_data = ['INSURANCE',  'RELIGION',
                'MARITAL_STATUS', 'ETHNICITY','GENDER', 'SUBJECT_ID','LOSRD_sum',

 'LOSRD_avg',
 'L_1s_last_p1',
 ]


# Filtra las columnas

# Filtra las columnas
filtered_df = pd.concat([df.filter(like=term) for term in static_data], axis=1)

In [5]:
cols_s = filtered_df.columns

In [6]:
static = res.groupby('SUBJECT_ID')[cols_s].last()


In [7]:
# crear visit ranks


# Ordenar el DataFrame por 'SUBJECT_ID' y 'ADMITTIME' para asegurar el orden correcto
res = res.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])

# Agregar una nueva columna 'VISIT_NUMBER' que indica el número de visita para cada 'SUBJECT_ID'
res['visit_rank'] = res.groupby('SUBJECT_ID').cumcount() + 1
# Crear una nueva columna 'visit_rank' que represente el número de la visita para cada paciente


# Ahora, vamos a separar las visitas en DataFrames individuales y guardarlos en una lista
max_visits = res['visit_rank'].max()
# Asegúrate de que 'ADMITTIME' es una fecha
res['ADMITTIME'] = pd.to_datetime(res['ADMITTIME'], errors='coerce')

# Ordena los datos por 'SUBJECT_ID' y 'ADMITTIME'
res = res.sort_values(['SUBJECT_ID', 'ADMITTIME'])

# Calcula la diferencia en tiempo entre las visitas para cada paciente
res['horizons'] = res.groupby('SUBJECT_ID')['ADMITTIME'].diff().fillna(pd.Timedelta(seconds=0))
res['horizons'] =[int(i) for i in res['horizons'].dt.total_seconds()]
# Para la primera visita de cada paciente, establece 'TIME_BETWEEN_ADM' en 0
res.loc[res['visit_rank'] == 1, 'horizons'] = 0

temporal_surv = [res[res['visit_rank'] == i] for i in range(1, max_visits + 1)]


In [8]:
#horizons = [df.filter(like = ["horizon","SUBJECT_ID"]) for df in temporal_surv]
horizons = [df.filter(regex='horizon|SUBJECT_ID') for df in temporal_surv]
unique_subjects = pd.DataFrame(temporal_surv[0]['SUBJECT_ID'].unique(), columns=['SUBJECT_ID'])

# Inicializar una lista vacía para almacenar los DataFrames modificados
horizons_list = []

# Para cada DataFrame en df_list, hacer un 'merge' con 'unique_subjects' para asegurar que todos los 'SUBJECT_ID' de df_list[0] estén presentes
for df in horizons:
    new_df = pd.merge(unique_subjects, df, on='SUBJECT_ID', how='outer')
    #new_df = new_df.fillna({col: pd.Timedelta(0) for col in new_df.select_dtypes(include=['timedelta'])})
    new_df.fillna(0, inplace=True)  # Llenar los valores faltantes con 0
    horizons_list.append(new_df)

horizons = pd.concat(horizons_list)
horizons = horizons.sort_values('SUBJECT_ID') 
horizons = [group for _, group in horizons.groupby('SUBJECT_ID')]    
#horizons = [df.filter(like = "horizon") for df in horizons_list]
for i in range(10):
    print(horizons[i].shape)

(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)


In [9]:
len(horizons)

44952

In [10]:
horizons_list_final = [i["horizons"].to_list() for i in horizons]

In [11]:
not_considet_temporal = [
    'HOSPITAL_EXPIRE_FLAG',
 
    'HADM_ID',
    'DOB',
    'ADMITTIME',
    'L_1s_last',
    'INSURANCE_Medicare',
    'INSURANCE_Otra',
    'RELIGION_CATHOLIC',
    'RELIGION_NOT SPECIFIED',
    'RELIGION_Otra',
    'RELIGION_UNOBTAINABLE',
    'MARITAL_STATUS_MARRIED',
    'MARITAL_STATUS_Otra',
    'MARITAL_STATUS_SINGLE',
    'ETHNICITY_BLACK/AFRICAN AMERICAN',
    'ETHNICITY_Otra',
    'ETHNICITY_WHITE',
    'GENDER_M',
    'GENDER_Otra',
    'visit_rank',
    'horizons'
]
temporal_surv =[df.drop(columns=not_considet_temporal) for df in temporal_surv]



In [12]:
for i in range(10):
    print(temporal_surv[i].shape)

(44952, 671)
(7075, 671)
(2260, 671)
(976, 671)
(493, 671)
(261, 671)
(156, 671)
(114, 671)
(84, 671)
(58, 671)


In [13]:
import pandas as pd

# Asumiendo que df_list es tu lista de DataFrames y que df_list[0] es el DataFrame que contiene todos los 'SUBJECT_ID' que quieres mantener

# Crear un DataFrame que solo contiene los 'SUBJECT_ID' únicos de df_list[0]
unique_subjects = pd.DataFrame(temporal_surv[0]['SUBJECT_ID'].unique(), columns=['SUBJECT_ID'])

# Inicializar una lista vacía para almacenar los DataFrames modificados
new_df_list = []

# Para cada DataFrame en df_list, hacer un 'merge' con 'unique_subjects' para asegurar que todos los 'SUBJECT_ID' de df_list[0] estén presentes
for df in temporal_surv:
    new_df = pd.merge(unique_subjects, df, on='SUBJECT_ID', how='outer')
    #new_df = new_df.fillna({col: pd.Timedelta(0) for col in new_df.select_dtypes(include=['timedelta'])})
    new_df.fillna(0, inplace=True)  # Llenar los valores faltantes con 0
    new_df_list.append(new_df)

In [14]:

df = pd.concat(new_df_list)
df = df.sort_values('SUBJECT_ID') 
dfs = [group for _, group in df.groupby('SUBJECT_ID')]

In [15]:
len(dfs)

44952

In [25]:
len(dfs)
print(len(dfs))
for i in dfs:
    print(len(i))

44952
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
4

In [16]:
static = static[[i for i in static.columns if i != "SUBJECT_ID"]].reset_index().sort_values('SUBJECT_ID')

In [27]:
static

Unnamed: 0,SUBJECT_ID,INSURANCE_Medicare,INSURANCE_Otra,RELIGION_CATHOLIC,RELIGION_NOT SPECIFIED,RELIGION_Otra,RELIGION_UNOBTAINABLE,MARITAL_STATUS_MARRIED,MARITAL_STATUS_Otra,MARITAL_STATUS_SINGLE,ETHNICITY_BLACK/AFRICAN AMERICAN,ETHNICITY_Otra,ETHNICITY_WHITE,GENDER_M,GENDER_Otra,LOSRD_sum,LOSRD_avg,L_1s_last_p1
0,2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,9.0,3.0,0.0
1,3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,90.0,10.0,0.0
2,4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,63.0,7.0,0.0
3,5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,6.0,2.0,0.0
4,6,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,128.0,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44947,99985,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,208.0,16.0,0.0
44948,99991,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,204.0,12.0,0.0
44949,99992,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,24.0,2.0,0.0
44950,99995,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,51.0,3.0,0.0


# Entrenamiento

In [17]:
len(outcome)
out = pd.DataFrame(outcome)
df_out = out.rename(columns={'HOSPITAL_EXPIRE_FLAG': 'outcome'})
df_out.index.name = None

In [19]:
df_out_train = df_out.iloc[:int(len(static)*0.2)]
df_out_test = df_out.iloc[int(len(static)*0.2):]
len(df_out_train)

8990

In [20]:
static_train = static.iloc[:int(len(static)*0.2)]
static_test = static.iloc[int(len(static)*0.2):]

In [21]:
dfs_test = dfs[int(len(dfs)*0.2):] 
dfs_train = dfs[:int(len(dfs)*0.2)]

In [22]:
len(dfs_train)

8990

In [68]:
horizons_lists = [i.reset_index()["horizons"].values.tolist() for i in horizons]
 # Reemplaza 10 con el número total de listas que deseas crear
horizons_lists = [[i for i in range(1, 43)] for _ in range(len(dfs))]
horizons_lists_train=horizons_lists[:int(len(horizons_lists)*0.2)]
horizons_lists_test=horizons_lists[int(len(horizons_lists)*0.2):]

In [65]:
len(dfs[100])

42

In [67]:
len(dfs)

44952

In [None]:

import pandas as pd

# Assuming lst is your list
lst = [0 if x == pd.Timedelta(days=0) else x for x in horizons_lists_train]
lst

In [24]:
static_train
print(len(static_train))
for i in static_train:
    print(len(i))

8990
10
18
14
17
22
13
21
22
19
21
32
14
15
8
11
9
9
12


In [69]:
dfs_train
len(dfs)
print(len(dfs_train))
for i in dfs_train:
    print(len(i))

8990
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42

In [70]:
dfs_train[0]

Unnamed: 0,SUBJECT_ID,1_diagnosis,2_diagnosis,3_diagnosis,4_diagnosis,5_diagnosis,6_diagnosis,7_diagnosis,8_diagnosis,9_diagnosis,...,231_procedures,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_Otra,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_Otra,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_HOME HEALTH CARE,DISCHARGE_LOCATION_Otra,DISCHARGE_LOCATION_SNF
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
horizons_lists_train[1000]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42]

In [72]:
horizons_lists_train[2]
horizons_lists_train = [
    [0 if isinstance(item, pd.Timedelta) and item == pd.Timedelta(0) else item for item in sublist]
    for sublist in horizons_lists_train
]

In [42]:
static_train

Unnamed: 0_level_0,SUBJECT_ID,INSURANCE_Medicare,INSURANCE_Otra,RELIGION_CATHOLIC,RELIGION_NOT SPECIFIED,RELIGION_Otra,RELIGION_UNOBTAINABLE,MARITAL_STATUS_MARRIED,MARITAL_STATUS_Otra,MARITAL_STATUS_SINGLE,ETHNICITY_BLACK/AFRICAN AMERICAN,ETHNICITY_Otra,ETHNICITY_WHITE,GENDER_M,GENDER_Otra,LOSRD_sum,LOSRD_avg,L_1s_last_p1
SUBJECT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,9.0,3.0,0.0
3,3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,90.0,10.0,0.0
4,4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,63.0,7.0,0.0
5,5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,6.0,2.0,0.0
6,6,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,128.0,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9802,9802,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,24.0,4.0,0.0
9803,9803,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,42.0,6.0,0.0
9804,9804,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,48.0,3.0,0.0
9805,9805,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,24.0,6.0,0.0


In [75]:
static_train.index = static_train.SUBJECT_ID

In [76]:
dfs_train[0]

Unnamed: 0,SUBJECT_ID,1_diagnosis,2_diagnosis,3_diagnosis,4_diagnosis,5_diagnosis,6_diagnosis,7_diagnosis,8_diagnosis,9_diagnosis,...,231_procedures,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_Otra,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_Otra,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_HOME HEALTH CARE,DISCHARGE_LOCATION_Otra,DISCHARGE_LOCATION_SNF
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
len(dfs_train[0])

42

In [105]:

print(len(dfs_train))
len(horizons_lists_train)

8990


8990

In [79]:
len(horizons_lists_train[0])

42

In [96]:


# Suponiendo que dfs es tu lista de DataFrames
for i in range(len(dfs_train)):
    # Crear un nuevo índice del 1 al 42
    new_index = pd.RangeIndex(start=1, stop=43, step=1)
    
    # Establecer el nuevo índice para el DataFrame actual
    dfs[i].set_index(new_index, inplace=True)

In [97]:
dfs[899]

Unnamed: 0,SUBJECT_ID,1_diagnosis,2_diagnosis,3_diagnosis,4_diagnosis,5_diagnosis,6_diagnosis,7_diagnosis,8_diagnosis,9_diagnosis,...,231_procedures,ADMISSION_TYPE_EMERGENCY,ADMISSION_TYPE_Otra,ADMISSION_LOCATION_EMERGENCY ROOM ADMIT,ADMISSION_LOCATION_Otra,ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI,DISCHARGE_LOCATION_HOME,DISCHARGE_LOCATION_HOME HEALTH CARE,DISCHARGE_LOCATION_Otra,DISCHARGE_LOCATION_SNF
1,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
horizons_lists_train[899]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42]

In [81]:
help(TimeSeriesDataLoader)

Help on class TimeSeriesDataLoader in module synthcity.plugins.core.dataloader:

class TimeSeriesDataLoader(DataLoader)
 |  TimeSeriesDataLoader(temporal_data: List[pandas.core.frame.DataFrame], observation_times: List, outcome: Optional[pandas.core.frame.DataFrame] = None, static_data: Optional[pandas.core.frame.DataFrame] = None, sensitive_features: List[str] = [], important_features: List[str] = [], fairness_column: Optional[str] = None, random_state: int = 0, train_size: float = 0.8, seq_offset: int = 0, **kwargs: Any) -> None
 |  
 |  .. inheritance-diagram:: synthcity.plugins.core.dataloader.TimeSeriesDataLoader
 |      :parts: 1
 |  
 |  Data Loader for Time Series Data
 |  
 |  Constructor Args:
 |      temporal data: List[pd.DataFrame]
 |          The temporal data. A list of pandas DataFrames
 |      observation times: List
 |          List of arrays mapping directly to index of each dataframe in temporal_data
 |      outcome: Optional[pd.DataFrame] = None
 |          pandas 

In [None]:
static_data=static_train,

In [103]:
horizons_lists_train[100]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42]

In [102]:
horizons_lists_train[100]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42]

In [107]:
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader


loader = TimeSeriesDataLoader(
    temporal_data=dfs_train,
    observation_times=horizons_lists_train,
    

)



: 

In [None]:
plugin = Plugins().get("timegan", n_iter = 1)
plugin.fit(loader)

plugin.generate(count = 10)

NameError: name 'Plugins' is not defined

In [None]:
# synthcity absolute
from synthcity.benchmark import Benchmarks

score = Benchmarks.evaluate(
    [
        (f"test_{model}", model, {})
        for model in ["timegan"]
    ],
    loader,
    synthetic_size=1000,
    repeats=2,
    task_type="time_series",  # time_series_survival or time_series
)

NameError: name 'loader' is not defined

In [None]:

Benchmarks.print(score)

In [None]:
# third party
import matplotlib.pyplot as plt

syn_model.plot(plt, loader)

plt.show()