In [2]:
# stdlib
import sys
import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesSurvivalDataLoader

log.add(sink=sys.stderr, level="INFO")
warnings.filterwarnings("ignore")

# Ejemplo

In [3]:
# stdlib
import sys
import warnings

# synthcity absolute
import synthcity.logger as log
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader

log.add(sink=sys.stderr, level="INFO")
warnings.filterwarnings("ignore")

In [4]:
# stdlib
import datetime
import uuid

# third party
# import libraries for generating simulated data
import numpy as np
import pandas as pd

# set the number of individuals and observations per individual you want to generate
num_subj = 200
num_obs = 10

# generate static data
ids = [uuid.uuid4().hex[:6].upper() for i in range(num_subj)]
static_data = pd.DataFrame(
    {
        "id": ids,
        "var_a": np.random.randint(2, size=(num_subj)),
        "var_b": np.random.normal(loc=2, scale=0.5, size=(num_subj)),
        "outcome": np.random.binomial(1, 0.7, size=(num_subj)),
    }
)

# generate temporal data
temp_len = num_obs * len(ids)
temp_ids = ids * num_obs
timepoints = [i for i in range(num_obs)] * num_subj

temporal_data = pd.DataFrame(
    {
        "id": temp_ids,
        "temp_a": np.random.normal(loc=0, scale=0.2, size=(temp_len)),
        "temp_b": np.random.normal(loc=5, scale=1, size=(temp_len)),
        "temp_c": np.random.binomial(1, 0.5, size=(temp_len)),
        "timepoint": timepoints,
    }
)

In [5]:
# rearrange static data
outcome_data = static_data[["outcome"]]
static_data = static_data.drop(columns=["outcome"])

# rearrange temporal data
observation_data, temporal_dataframes = ([] for i in range(2))
for id in static_data["id"].unique():
    temp_df = temporal_data[temporal_data["id"] == id]
    observations = temp_df["timepoint"].tolist()
    temp_df.set_index("timepoint", inplace=True)
    temp_df = temp_df.drop(columns=["id"])
    # add each to list
    observation_data.append(observations)
    temporal_dataframes.append(temp_df)


In [6]:
outcome_data

Unnamed: 0,outcome
0,0
1,0
2,1
3,1
4,1
...,...
195,0
196,1
197,1
198,1


In [7]:
len(static_data)

200

In [8]:
static_data

Unnamed: 0,id,var_a,var_b
0,34F537,1,2.004348
1,F35470,1,2.763034
2,80BA14,0,2.179110
3,B8BC44,1,2.164328
4,26A860,0,2.036782
...,...,...,...
195,752D83,1,2.595383
196,7D5449,1,0.998102
197,AA9901,1,1.696953
198,7C4053,0,2.775053


In [9]:
print(len(temporal_dataframes))
for i in temporal_dataframes:
    print(len(i))

200
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10


In [10]:
temporal_data

Unnamed: 0,id,temp_a,temp_b,temp_c,timepoint
0,34F537,0.073934,5.562959,0,0
1,F35470,0.003243,6.491283,1,1
2,80BA14,0.240913,6.210841,0,2
3,B8BC44,-0.444746,4.096361,0,3
4,26A860,-0.085669,4.566329,0,4
...,...,...,...,...,...
1995,752D83,-0.308136,4.872962,0,5
1996,7D5449,0.183912,5.670612,0,6
1997,AA9901,0.204516,5.852654,1,7
1998,7C4053,0.048169,5.676983,0,8


In [11]:
print(len(temporal_data))
for i in temporal_dataframes:
    print(i.shape)

2000
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10, 3)
(10

Mi modelo

# Preprocesamiento de los datos

In [12]:
import pandas as pd
df =  pd.read_csv('generative_input/input_onehot_encoding.csv')
adm = pd.read_csv('./data/data_preprocess_nonfilteres.csv')
res = pd.merge(adm[["HOSPITAL_EXPIRE_FLAG","SUBJECT_ID","HADM_ID","DOB"]],df, on=["SUBJECT_ID","HADM_ID"], how='right')
# Limitar las fechas a un rango permitido

# Ahora deberías poder restar las fechas sin problemas

res['ADMITTIME'] = pd.to_datetime(res['ADMITTIME'])
#res['DOB'] = pd.to_datetime(res['DOB'], format='%Y-%m-%d %H:%M:%S')
#res['DOB'] = pd.to_datetime(res['DOB'])


res = res.fillna(0)
outcome =  res.groupby('SUBJECT_ID')['HOSPITAL_EXPIRE_FLAG'].max()

print(outcome.shape)

(44952,)


In [13]:


print(res.shape, adm.shape, df.shape)
# Assuming df is your DataFrame

# Find columns that contain 'unnamed' in their name
cols_to_drop = res.filter(like='Unnamed', axis=1).columns
res.drop(cols_to_drop, axis=1, inplace=True)
print(res.shape, adm.shape, df.shape)

print(res.isnull().sum().sum())
res = res.fillna(0)
print(res.isnull().sum().sum())

(56678, 691) (58976, 41) (56678, 689)
(56678, 690) (58976, 41) (56678, 689)
0


0


loader = TimeSeriesSurvivalDataLoader(
    temporal_data=temporal_surv,
    observation_times=temporal_surv_horizons,
    static_data=static_surv,
    T=T,
    E=E,
    time_horizons=time_horizons,
)

loader.dataframe()

In [14]:
# Lista de nombres de columnas a buscar
static_data = ['INSURANCE',  'RELIGION',
                'MARITAL_STATUS', 'ETHNICITY','GENDER', 'SUBJECT_ID','LOSRD_sum',

 'LOSRD_avg',
 'L_1s_last_p1',
 ]


# Filtra las columnas

# Filtra las columnas
filtered_df = pd.concat([df.filter(like=term) for term in static_data], axis=1)

In [15]:
cols_s = filtered_df.columns

In [16]:
static = res.groupby('SUBJECT_ID')[cols_s].last()


In [17]:
# crear visit ranks


# Ordenar el DataFrame por 'SUBJECT_ID' y 'ADMITTIME' para asegurar el orden correcto
res = res.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])

# Agregar una nueva columna 'VISIT_NUMBER' que indica el número de visita para cada 'SUBJECT_ID'
res['visit_rank'] = res.groupby('SUBJECT_ID').cumcount() + 1
# Crear una nueva columna 'visit_rank' que represente el número de la visita para cada paciente


# Ahora, vamos a separar las visitas en DataFrames individuales y guardarlos en una lista
max_visits = res['visit_rank'].max()
# Asegúrate de que 'ADMITTIME' es una fecha
res['ADMITTIME'] = pd.to_datetime(res['ADMITTIME'], errors='coerce')

# Ordena los datos por 'SUBJECT_ID' y 'ADMITTIME'
res = res.sort_values(['SUBJECT_ID', 'ADMITTIME'])

# Calcula la diferencia en tiempo entre las visitas para cada paciente
res['horizons'] = res.groupby('SUBJECT_ID')['ADMITTIME'].diff().fillna(pd.Timedelta(seconds=0))
res['horizons'] =[int(i) for i in res['horizons'].dt.total_seconds()]
# Para la primera visita de cada paciente, establece 'TIME_BETWEEN_ADM' en 0
res.loc[res['visit_rank'] == 1, 'horizons'] = pd.Timedelta(0)

temporal_surv = [res[res['visit_rank'] == i] for i in range(1, max_visits + 1)]


In [18]:
#horizons = [df.filter(like = ["horizon","SUBJECT_ID"]) for df in temporal_surv]
horizons = [df.filter(regex='horizon|SUBJECT_ID') for df in temporal_surv]
unique_subjects = pd.DataFrame(temporal_surv[0]['SUBJECT_ID'].unique(), columns=['SUBJECT_ID'])

# Inicializar una lista vacía para almacenar los DataFrames modificados
horizons_list = []

# Para cada DataFrame en df_list, hacer un 'merge' con 'unique_subjects' para asegurar que todos los 'SUBJECT_ID' de df_list[0] estén presentes
for df in horizons:
    new_df = pd.merge(unique_subjects, df, on='SUBJECT_ID', how='outer')
    #new_df = new_df.fillna({col: pd.Timedelta(0) for col in new_df.select_dtypes(include=['timedelta'])})
    new_df.fillna(0, inplace=True)  # Llenar los valores faltantes con 0
    horizons_list.append(new_df)

horizons = pd.concat(horizons_list)
horizons = horizons.sort_values('SUBJECT_ID') 
horizons = [group for _, group in horizons.groupby('SUBJECT_ID')]    
#horizons = [df.filter(like = "horizon") for df in horizons_list]
for i in range(10):
    print(horizons[i].shape)

(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)
(42, 2)


In [19]:
len(horizons)

44952

In [20]:
horizons_list_final = [i["horizons"].to_list() for i in horizons]

In [21]:
not_considet_temporal = [
    'HOSPITAL_EXPIRE_FLAG',
 
    'HADM_ID',
    'DOB',
    'ADMITTIME',
    'L_1s_last',
    'INSURANCE_Medicare',
    'INSURANCE_Otra',
    'RELIGION_CATHOLIC',
    'RELIGION_NOT SPECIFIED',
    'RELIGION_Otra',
    'RELIGION_UNOBTAINABLE',
    'MARITAL_STATUS_MARRIED',
    'MARITAL_STATUS_Otra',
    'MARITAL_STATUS_SINGLE',
    'ETHNICITY_BLACK/AFRICAN AMERICAN',
    'ETHNICITY_Otra',
    'ETHNICITY_WHITE',
    'GENDER_M',
    'GENDER_Otra',
    'visit_rank',
    'horizons'
]
temporal_surv =[df.drop(columns=not_considet_temporal) for df in temporal_surv]



In [22]:
for i in range(10):
    print(temporal_surv[i].shape)

(44952, 671)
(7075, 671)
(2260, 671)
(976, 671)
(493, 671)
(261, 671)
(156, 671)
(114, 671)
(84, 671)
(58, 671)


In [23]:
import pandas as pd

# Asumiendo que df_list es tu lista de DataFrames y que df_list[0] es el DataFrame que contiene todos los 'SUBJECT_ID' que quieres mantener

# Crear un DataFrame que solo contiene los 'SUBJECT_ID' únicos de df_list[0]
unique_subjects = pd.DataFrame(temporal_surv[0]['SUBJECT_ID'].unique(), columns=['SUBJECT_ID'])

# Inicializar una lista vacía para almacenar los DataFrames modificados
new_df_list = []

# Para cada DataFrame en df_list, hacer un 'merge' con 'unique_subjects' para asegurar que todos los 'SUBJECT_ID' de df_list[0] estén presentes
for df in temporal_surv:
    new_df = pd.merge(unique_subjects, df, on='SUBJECT_ID', how='outer')
    #new_df = new_df.fillna({col: pd.Timedelta(0) for col in new_df.select_dtypes(include=['timedelta'])})
    new_df.fillna(0, inplace=True)  # Llenar los valores faltantes con 0
    new_df_list.append(new_df)

In [24]:

df = pd.concat(new_df_list)
df = df.sort_values('SUBJECT_ID') 
dfs = [group for _, group in df.groupby('SUBJECT_ID')]

In [25]:
len(dfs)

44952

In [26]:
len(dfs)
print(len(dfs))
for i in dfs:
    print(len(i))

44952
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
4


42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42
42


In [27]:
static = static[[i for i in static.columns if i != "SUBJECT_ID"]].reset_index().sort_values('SUBJECT_ID')

In [28]:
static

Unnamed: 0,SUBJECT_ID,INSURANCE_Medicare,INSURANCE_Otra,RELIGION_CATHOLIC,RELIGION_NOT SPECIFIED,RELIGION_Otra,RELIGION_UNOBTAINABLE,MARITAL_STATUS_MARRIED,MARITAL_STATUS_Otra,MARITAL_STATUS_SINGLE,ETHNICITY_BLACK/AFRICAN AMERICAN,ETHNICITY_Otra,ETHNICITY_WHITE,GENDER_M,GENDER_Otra,LOSRD_sum,LOSRD_avg,L_1s_last_p1
0,2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,9.0,3.0,0.0
1,3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,90.0,10.0,0.0
2,4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,63.0,7.0,0.0
3,5,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,6.0,2.0,0.0
4,6,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,128.0,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44947,99985,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,208.0,16.0,0.0
44948,99991,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,204.0,12.0,0.0
44949,99992,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,24.0,2.0,0.0
44950,99995,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,51.0,3.0,0.0


# Entrenamiento

In [29]:
len(outcome)
out = pd.DataFrame(outcome)
df_out = out.rename(columns={'HOSPITAL_EXPIRE_FLAG': 'outcome'})
df_out.index.name = None

In [30]:
from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import TimeSeriesDataLoader
#static, temporal, horizons, outcome = GoogleStocksDataloader().load()

# Ensure that the number of rows in the static data matches the number of rows in the temporal data


loader = TimeSeriesDataLoader(
            temporal_data=dfs,
            observation_times=horizons,
            static_data=static,
            outcome=df_out,
)

plugin = Plugins().get("timegan", n_iter = 1)
plugin.fit(loader)

plugin.generate(count = 10)

In [None]:
# synthcity absolute
from synthcity.benchmark import Benchmarks

score = Benchmarks.evaluate(
    [
        (f"test_{model}", model, {})
        for model in ["timegan"]
    ],
    loader,
    synthetic_size=1000,
    repeats=2,
    task_type="time_series",  # time_series_survival or time_series
)

NameError: name 'loader' is not defined

In [None]:

Benchmarks.print(score)

In [None]:
# third party
import matplotlib.pyplot as plt

syn_model.plot(plt, loader)

plt.show()