In [1]:
import sys
sys.path.append("./..")

import tsdb
import pandas as pd
from sklearn.model_selection import train_test_split

from src.data.datatool import DataProcessor

In [None]:
import tsdb
import pandas as pd
import numpy as np
import torch
from dataclasses import dataclass



@dataclass
class DataProcessor:
    """Pipeline that turns a per-timestamp measurement table into padded tensors.

    The default pipeline (see ``get_data``) sorts the frame, blanks repeated
    global features, z-scores the measurements, melts the frame to a long
    ``(id, time, event, value)`` layout, merges the labels, tokenizes the event
    names and finally stacks one fixed-length sequence per id into a 3D tensor.
    """

    # Measurement table; must contain ``id_col`` and ``time_col``.
    df: pd.DataFrame
    # Label table; must contain ``id_col`` (and ``taget_name`` when that is set).
    df_label: pd.DataFrame
    # Name of the column identifying a case / sequence.
    id_col: str
    # Name of the timestamp column used for ordering.
    time_col: str
    # Columns that are blanked on every row but the first one of each id.
    global_features: list
    # Columns that are skipped by the z-score normalization.
    cols_exclude_z_norm: list
    # Upper bound on the sequence length of the resulting tensor.
    max_wanted_len: int
    # When True, sequences shorter than the target length are zero-padded.
    use_padding: bool = True
    # Filled by ``df_to_3dtensor``.
    data_tensor: torch.Tensor = None
    label_tensor: torch.Tensor = None
    # Name of the target column in ``df_label`` (spelling kept for backward
    # compatibility with existing keyword callers).
    taget_name: str = None
    # Filled by ``tokenizer``: maps each event name to its integer token.
    event_to_token: dict = None

    def replace_global_features_with_nan(self):
        """
        Sort by id and time, then keep the global features only on the first row of each id.

        All later rows of an id get NaN in the ``global_features`` columns.
        The frame is replaced by a sorted copy with a fresh index.
        """
        self.df = self.df.sort_values(by=[self.id_col, self.time_col]).reset_index(drop=True)

        # Nothing to blank when no global features are configured.
        if not self.global_features:
            return

        first_row_of_id = self.df.groupby(self.id_col).cumcount() == 0
        self.df.loc[~first_row_of_id, self.global_features] = np.nan

    def zscore_transformation(self):
        """
        Apply z-score normalization to all columns except those in ``cols_exclude_z_norm``.

        Columns with zero standard deviation are only centered (result 0)
        instead of being divided by zero, which used to turn them into NaN.
        The normalization is written to a copy so that a frame still shared
        with the caller is not modified.
        """
        feature_columns = [col for col in self.df.columns if col not in self.cols_exclude_z_norm]
        if not feature_columns:
            return

        normalized = self.df.copy()
        values = normalized[feature_columns]
        # Replace a zero spread by 1 so constant columns become 0 rather than 0/0 = NaN.
        std = values.std().replace(0, 1)
        normalized[feature_columns] = (values - values.mean()) / std
        self.df = normalized

    def melt_dataframe(
            self,
            feature_name="event",
            value_name="value"
                       ):
        """
        Reshape the frame to long format and drop rows without a value.

        Args:
            feature_name: name of the column receiving the former column names.
            value_name: name of the column receiving the measurements.

        The result is sorted by id and time.
        """
        self.df = (
            pd.melt(
                self.df,
                id_vars=[self.id_col, self.time_col],
                var_name=feature_name,
                value_name=value_name,
            )
            # Use the requested column name; a hard-coded "value" broke custom names.
            .dropna(subset=[value_name])
            .sort_values(by=[self.id_col, self.time_col])
        )

    def merge_label(
        self,
    ):
        """
        Inner-join the labels onto the frame on ``id_col``.

        When ``taget_name`` is set and present in ``df_label`` only the id and
        that target column are merged, so the target ends up as the last column
        (which ``df_to_3dtensor`` relies on) and no other label column is
        carried along as a feature. Otherwise the whole ``df_label`` is merged.
        """
        labels = self.df_label
        if self.taget_name is not None and self.taget_name in labels.columns:
            labels = labels[[self.id_col, self.taget_name]]
        self.df = pd.merge(self.df, labels, on=self.id_col, how='inner')

    def tokenizer(
            self,
            col_event='event',
            start_index=0
        ):
        """
        Replace the values of ``col_event`` by integer tokens.

        Tokens are assigned in order of first appearance, starting at
        ``start_index``; the mapping is stored in ``event_to_token``.

        NOTE: ``df_to_3dtensor`` pads with 0, so with the default
        ``start_index=0`` the first event shares its token with the padding.
        Pass ``start_index=1`` to reserve 0 for padding.
        """
        unique_events = self.df[col_event].unique()
        self.event_to_token = {
            event: token for token, event in enumerate(unique_events, start=start_index)
        }
        self.df[col_event] = self.df[col_event].map(self.event_to_token)

    def df_to_3dtensor(
            self,
            drop_id=False,
    ):
        """
        Stack one sequence per id into ``data_tensor`` and derive ``label_tensor``.

        The sequence length is ``min(max_wanted_len, longest sequence)``.
        Longer sequences keep their last rows; shorter ones are padded at the
        end with zero rows (the id column keeps the id) when ``use_padding`` is
        True. Without padding, sequences of unequal length make ``torch.stack``
        fail.

        The last column is treated as the target: it is removed from
        ``data_tensor`` and its maximum over the sequence becomes the label
        (so labels are assumed to be >= 0, because padding rows contribute 0).

        Args:
            drop_id: when True the id column is removed from ``data_tensor``
                as well. The default keeps it, which is what this method has
                effectively always returned.

        Raises:
            ValueError: if the frame holds no rows.
        """
        id_col = self.id_col
        id_pos = list(self.df.columns).index(id_col)

        grouped = self.df.groupby(id_col)
        group_sizes = grouped.size()
        if len(group_sizes) == 0:
            raise ValueError("cannot build a tensor from an empty DataFrame")

        max_length = min(self.max_wanted_len, int(group_sizes.max()))
        if self.max_wanted_len > max_length:
            print(f"max_wanted_len is langer dan de de aantal timestamps in de data,namelijk: {self.max_wanted_len}. data heeft max van: {max_length}")

        tensors = []
        for _, group in grouped:
            # One float64 copy per group gives every sequence the same dtype.
            values = group.to_numpy(dtype=np.float64, copy=True)
            if len(values) > max_length:
                # Cut off the oldest records to reach the target length.
                values = values[-max_length:]
            elif self.use_padding:
                # Append zero rows; only the id column keeps the id of the group.
                padding = np.zeros((max_length - len(values), values.shape[1]), dtype=values.dtype)
                padding[:, id_pos] = values[0, id_pos]
                values = np.concatenate([values, padding], axis=0)
            tensors.append(torch.from_numpy(values))

        stacked = torch.stack(tensors)

        self.label_tensor = stacked[:, :, -1].max(dim=1).values  # target per id
        features = stacked[:, :, :-1]  # remove target
        if drop_id:
            keep = [pos for pos in range(features.shape[2]) if pos != id_pos]
            features = features[:, :, keep]
        self.data_tensor = features

    def return_(
            self,
            name="df"
            ):
        """
        Return one of the results by name.

        ``"df"`` gives the current frame, ``"3dtensor"`` the data tensor as
        float and ``"label_tensor"`` the labels as long. Any other name
        returns None.
        """
        if name == "df":
            return self.df
        if name == "3dtensor":
            return self.data_tensor.float()
        if name == "label_tensor":
            return self.label_tensor.long()
        return None

    def apply_steps(self, steps: list):
        """Call the methods named in ``steps`` in order; unknown names are reported and skipped."""
        for step in steps:
            if hasattr(self, step):
                getattr(self, step)()
            else:
                print(f"Step '{step}' not found in DataProcessor.")

    def get_data(self, steps=None):
        """
        Run the pipeline and return ``(data_tensor, label_tensor)``.

        Args:
            steps: method names to run in order; defaults to the full pipeline
                ending in ``df_to_3dtensor``.
        """
        if steps is None:
            steps = [
                "replace_global_features_with_nan",
                "zscore_transformation",
                "melt_dataframe",
                "merge_label",
                "tokenizer",
                "df_to_3dtensor"
            ]
        self.apply_steps(steps)
        return self.return_("3dtensor"), self.return_("label_tensor")



In [22]:
# Load the dataset through tsdb; `config` collects notebook-wide settings.
dataset = 'physionet_2012'
data = tsdb.load(dataset)
config = {}

print(data.keys())

# Column names shared by the cells below.
id = "RecordID"
time = "Time"
target = "In-hospital_death"

# Measurements, outcomes and the list of static (per-case) feature names of set a.
df_a = pd.DataFrame(data['set-a'])
df_outcomes_a = pd.DataFrame(data['outcomes-a']).reset_index()
df_static_features = pd.DataFrame(data['static_features'])
global_features = df_static_features.iloc[:, 0].tolist()

processor_kwargs = dict(
    df=df_a,
    df_label=df_outcomes_a,
    id_col=id,
    time_col=time,
    taget_name=target,
    global_features=global_features,
    cols_exclude_z_norm=[id, time],
    max_wanted_len=5000,
)
processor = DataProcessor(**processor_kwargs)

# Run the default pipeline; pass `steps=[...]` to get_data to run a subset.
d3tensor, label_tensor = processor.get_data()

summary = f"""
      Shape van data-tensor {d3tensor.shape} 
      Aantal cases zijn: {d3tensor.shape[0]}
      Aantal tijdsobservatie per cases zijn: {d3tensor.shape[1]} (NULL wordt gepad)
      De meet waarde over de tijd zijn: tijdsmoment, event, waarde
"""
print(summary)

2024-12-10 14:41:47 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-12-10 14:41:47 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-12-10 14:41:47 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-12-10 14:41:47 [INFO]: Loaded successfully!


dict_keys(['set-a', 'set-b', 'set-c', 'outcomes-a', 'outcomes-b', 'outcomes-c', 'static_features'])
max_wanted_len is langer dan de de aantal timestamps in de data,namelijk: 5000. data heeft max van: 631

      Shape van data-tensor torch.Size([3997, 631, 4]) 
      Aantal cases zijn: 3997
      Aantal tijdsobservatie per cases zijn: 631 (NULL wordt gepad)
      De meet waarde over de tijd zijn: tijdsmoment, event, waarde



In [19]:
# Display the measurement frame built from data['set-a'].
df_a

Parameter,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,Creatinine,DiasABP,...,SaO2,SysABP,Temp,Time,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,,,,54.0,,,,,,,...,,,35.35,0,,,480.0,,-1.0,
1,,,,54.0,,,,,,,...,,,,1,,,30.0,,,
2,,,,54.0,,,,,,,...,,,,2,,,170.0,,,
3,,,,54.0,,,,,,,...,,,37.80,3,,,60.0,,,
4,,,,54.0,,,,,,,...,,,,4,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,,,,78.0,,,,,,52.0,...,98.0,112.0,37.70,43,,,25.0,,87.3,7.34
43,,,,78.0,,,,,,49.0,...,,110.0,37.60,44,,,25.0,,87.3,
44,,,,78.0,,,,,,50.0,...,,112.0,,45,,,23.0,,87.3,7.31
45,,,,78.0,,,,,,54.0,...,,121.0,37.30,46,,,40.0,,87.3,


In [3]:
# Display the measurement frame built from data['set-a'].
df_a

Parameter,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,Creatinine,DiasABP,...,SaO2,SysABP,Temp,Time,TroponinI,TroponinT,Urine,WBC,Weight,pH
0,,,,54.0,,,,,,,...,,,35.35,0,,,480.0,,-1.0,
1,,,,54.0,,,,,,,...,,,,1,,,30.0,,,
2,,,,54.0,,,,,,,...,,,,2,,,170.0,,,
3,,,,54.0,,,,,,,...,,,37.80,3,,,60.0,,,
4,,,,54.0,,,,,,,...,,,,4,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,,,,78.0,,,,,,52.0,...,98.0,112.0,37.70,43,,,25.0,,87.3,7.34
43,,,,78.0,,,,,,49.0,...,,110.0,37.60,44,,,25.0,,87.3,
44,,,,78.0,,,,,,50.0,...,,112.0,,45,,,23.0,,87.3,7.31
45,,,,78.0,,,,,,54.0,...,,121.0,37.30,46,,,40.0,,87.3,


In [4]:
# Display the outcome frame built from data['outcomes-a'] (index reset into a column).
df_outcomes_a

Unnamed: 0,RecordID,In-hospital_death
0,132539,0
1,132540,0
2,132541,0
3,132543,0
4,132545,0
...,...,...
3995,142665,0
3996,142667,0
3997,142670,0
3998,142671,1


In [5]:
# Inner-join the outcomes onto the measurements on the id column and display the result.
df_a.merge(df_outcomes_a,on=id, how='inner')

Unnamed: 0,ALP,ALT,AST,Age,Albumin,BUN,Bilirubin,Cholesterol,Creatinine,DiasABP,...,SysABP,Temp,Time,TroponinI,TroponinT,Urine,WBC,Weight,pH,In-hospital_death
0,,,,54.0,,,,,,,...,,35.35,0,,,480.0,,-1.0,,0
1,,,,54.0,,,,,,,...,,,1,,,30.0,,,,0
2,,,,54.0,,,,,,,...,,,2,,,170.0,,,,0
3,,,,54.0,,,,,,,...,,37.80,3,,,60.0,,,,0
4,,,,54.0,,,,,,,...,,,4,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180547,,,,78.0,,,,,,52.0,...,112.0,37.70,43,,,25.0,,87.3,7.34,0
180548,,,,78.0,,,,,,49.0,...,110.0,37.60,44,,,25.0,,87.3,,0
180549,,,,78.0,,,,,,50.0,...,112.0,,45,,,23.0,,87.3,7.31,0
180550,,,,78.0,,,,,,54.0,...,121.0,37.30,46,,,40.0,,87.3,,0


In [26]:
# Spot check one case: compare the label in the tensor with the outcome table.
index = 1

# The first feature of the first timestep is read as the record id.
RecordID_int = d3tensor[index, 0, 0].int().item()
label_prep = label_tensor[index].item()

# Look up the original outcome row for that record id.
matching_rows = df_outcomes_a.query(f"{id}=={RecordID_int}").values
label_orgi_id, label_orgi_label = matching_rows[0]

print(RecordID_int, label_prep, sep="\n")
print(label_orgi_id, label_orgi_label)

132540
0
132540 0


In [29]:
# Check every case: the label in the tensor must equal the label in the outcome
# table. Record ids whose labels differ are printed; no output means all match.
for i in range(len(d3tensor)):
    # Index with the loop variable; the stale global `index` re-checked one case only.
    RecordID_int = d3tensor[i][0][0].int().item()
    label_prep = label_tensor[i].item()
    label_orgi_id, label_orgi_label = df_outcomes_a.query(f"{id}=={RecordID_int}").values[0]
    if label_orgi_label != label_prep:
        print(RecordID_int)

In [39]:
from src.datastreamers import BaseDatastreamer, BaseDataset

config["batchsize"] = 32


def _as_streamer(features, labels):
    """Wrap one split in a BaseDataset and stream it with the configured batch size."""
    return BaseDatastreamer(dataset=BaseDataset(features, labels), batchsize=config["batchsize"])


# First hold out 20% of all cases as the test split.
train_data, test_data, train_labels, test_labels = train_test_split(
    d3tensor,
    label_tensor,
    test_size=0.2,
    random_state=42,
)
test_streamer = _as_streamer(test_data, test_labels)

# Then take 30% of the remaining cases as the validation split.
train_data, val_data, train_labels, val_labels = train_test_split(
    train_data,
    train_labels,
    test_size=0.3,
    random_state=42,
)
train_streamer = _as_streamer(train_data, train_labels)
validation_streamer = _as_streamer(val_data, val_labels)

# Report the number of cases per split.
print(f"Training set size: {train_data.shape[0]}")
print(f"Validation set size: {val_data.shape[0]}")
print(f"Test set size: {test_data.shape[0]}")

Training set size: 2237
Validation set size: 960
Test set size: 800


In [45]:
# Check the streamed training batches: for every case in every batch the label
# must equal the label in the outcome table. Mismatching record ids are printed.
# Create the iterator once and keep consuming it.
# NOTE(review): assumes stream() returns an iterator of (x, y) batches - confirm
# against BaseDatastreamer.
batches = train_streamer.stream()
for s in range(len(train_streamer)):
    x, y = next(batches)
    for i in range(len(x)):
        # Index with the loop variable; the stale global `index` re-checked one case only.
        RecordID_int = x[i][0][0].int().item()
        label_prep = y[i].item()
        label_orgi_id, label_orgi_label = df_outcomes_a.query(f"{id}=={RecordID_int}").values[0]

        if label_orgi_label != label_prep:
            print(RecordID_int)
        

In [37]:
# Show len() of the training streamer; the check loop above iterates this many times.
len(train_streamer)

69