In [1]:
# Insert the notebook menu provided by jyquickhelper
# (presumably a table of contents built from the markdown headers; see the jyquickhelper docs).
from jyquickhelper import add_notebook_menu
add_notebook_menu()

# How does learningset work?

#### This notebook explains how to use the functions of the `transplant.data.learningset` module (`learningset.py`)

It is also a good sandbox to try new functions or to debug the original file.

## Libraries 

In [2]:
import pandas as pd
import numpy as np

from transplant.data.dataset import Dataset
# Shared Dataset instance: the notebook-level functions defined further down
# call dataset.get_dynamic() on this global, so this cell must be run first.
dataset=Dataset()

## Useful functions you won't call directly

These helper functions are used by the functions presented below, so they are defined here first.

In [3]:
def merge_dyn_sta(X_train_static, X_train_dynamic, X_test_static, X_test_dynamic):
    """Join static and (already flattened) dynamic tables on ``id_patient``.

    The train pair and the test pair are merged independently, using the
    default ``pd.merge`` join (inner join on the ``id_patient`` column).

    Returns
    -------
    tuple
        ``(train_merged, test_merged)``.
    """
    train_merged = pd.merge(X_train_static, X_train_dynamic, on='id_patient')
    test_merged = pd.merge(X_test_static, X_test_dynamic, on='id_patient')
    return train_merged, test_merged

In [4]:
def center_reduce_data(W_train, W_test):
    """Standardise train and test data with statistics computed on the train set.

    Both inputs are centred with the train mean and scaled with the train
    standard deviation, so the test set goes through exactly the same
    transformation as the train set.

    A constant train column has a standard deviation of 0; dividing by it
    would fill the column with NaN/inf. Such columns are scaled by 1 instead,
    so they come out as 0 in the train set.

    Parameters
    ----------
    W_train, W_test : objects supporting ``.mean()``, ``.std()`` and arithmetic
        (typically pandas DataFrames sharing the same columns).

    Returns
    -------
    tuple
        ``(W_train_standardised, W_test_standardised)``.
    """
    mean_train = W_train.mean()
    std_train = W_train.std()

    # Adding the boolean mask turns every exact 0 into 1 and leaves all other
    # values (including NaN) untouched; works for Series and scalars alike.
    safe_std = std_train + (std_train == 0)

    return (W_train - mean_train) / safe_std, (W_test - mean_train) / safe_std

In [5]:
def add_start_end_length_op_to_static(X_stat, X_dyn):
    """Append operation timing features to the static table.

    For every patient, the first and last non-null ``time`` values of
    ``X_dyn`` (in row order) are taken as the start and end of the operation.
    NOTE(review): this assumes the rows of each patient are in chronological
    order - confirm against the data loader.

    Columns added to the returned frame:
      * ``length_op``: whole minutes between start and end;
      * ``start_operation_year`` / ``_month`` / ``_day``;
      * ``ends_operation_year`` / ``_month`` / ``_day``.
    The ``*_day`` columns hold the day of the year, not the day of the month.

    Parameters
    ----------
    X_stat : DataFrame with an ``id_patient`` column.
    X_dyn : DataFrame with ``id_patient`` and ``time`` columns, where ``time``
        holds timestamps.

    Returns
    -------
    DataFrame
        ``X_stat`` rows that have dynamic data, plus the columns above. The
        intermediate start/end timestamp columns are dropped.
    """
    grouped_time = X_dyn.groupby(['id_patient'])['time']

    # One row per patient, indexed by id_patient.
    operation_times = pd.DataFrame({
        'start_operation': grouped_time.first(),
        'ends_operation': grouped_time.last(),
    })

    # pandas refuses `on=` together with `right_index=True` (MergeError), so
    # the key is given as a left column matched against the right index.
    X_return = pd.merge(X_stat, operation_times, left_on='id_patient', right_index=True)

    def _whole_minutes(delta):
        # total_seconds() includes whole days, unlike the `.seconds` attribute
        # which only holds the sub-day remainder.
        seconds = delta.total_seconds()
        # A missing duration (NaT) gives NaN: keep it as NaN instead of failing on int().
        return seconds // 60 if pd.isna(seconds) else int(seconds // 60)

    duration = X_return['ends_operation'] - X_return['start_operation']
    X_return['length_op'] = duration.apply(_whole_minutes)

    for prefix in ('start_operation', 'ends_operation'):
        X_return[prefix + '_year'] = X_return[prefix].apply(lambda ts: ts.year)
        X_return[prefix + '_month'] = X_return[prefix].apply(lambda ts: ts.month)
        X_return[prefix + '_day'] = X_return[prefix].apply(lambda ts: ts.dayofyear)

    return X_return.drop(['ends_operation', 'start_operation'], axis=1)

In [6]:
def get_timeseries_in_array(X_stat, X_dyn, dyn_to_drop=['id_patient', 'time']):
    """Attach every full time series to the static table, one array per cell.

    For each column of ``X_dyn`` that is not listed in ``dyn_to_drop``, the
    values of each patient are collected into a numpy array; the resulting
    per-patient column is then merged into the static table on ``id_patient``.

    Parameters
    ----------
    X_stat : DataFrame with an ``id_patient`` column.
    X_dyn : DataFrame with an ``id_patient`` column and one column per series.
    dyn_to_drop : list of ``X_dyn`` columns that are not time series.

    Returns
    -------
    DataFrame
        ``X_stat`` extended with one array-valued column per time series.
    """
    per_patient = X_dyn.groupby(['id_patient'])
    series_columns = X_dyn.drop(dyn_to_drop, axis=1).columns

    merged = X_stat
    for column in series_columns:
        arrays = per_patient[column].apply(np.array).to_frame()
        # Expose the patient id as a regular column and rename the index so
        # the merge key is not ambiguous between index and column.
        arrays = arrays.assign(id_patient=arrays.index).rename_axis('index')
        merged = pd.merge(merged, arrays, on='id_patient')

    return merged

## get_static_filled

In [7]:
def get_static_filled():
    """Return the static train and test sets in a form usable for ML.

    Processing steps:
      * every column is converted with ``pd.to_numeric(errors='coerce')``, so
        values that cannot be parsed as numbers become NaN;
      * columns that contain only NaN are dropped;
      * remaining NaN are replaced with the mean of the corresponding *train*
        column (in both the train and the test set);
      * only the columns present in both sets are kept, in the train order,
        so both frames end up with identical column layouts.

    Returns
    -------
    tuple
        ``(train_static_filled, test_static_filled)`` DataFrames.
    """
    from transplant.data.dataset import Dataset
    train_static_0, test_static_0 = Dataset().get_static()

    # `axis` is passed by keyword: recent pandas versions no longer accept it positionally.
    train_numeric = train_static_0.apply(
        pd.to_numeric, errors='coerce').dropna(axis=1, how="all")

    # Train means are used to fill the gaps of both sets.
    mean_train_static = train_numeric.mean()
    train_static_filled = train_numeric.fillna(mean_train_static)

    test_static_filled = (
        test_static_0.apply(pd.to_numeric, errors='coerce')
        .dropna(axis=1, how="all")
        .fillna(mean_train_static)
    )

    # Keep the shared columns only, using the train order for both frames.
    test_columns = test_static_filled.columns
    common_columns = [col for col in train_static_filled.columns if col in test_columns]

    return train_static_filled[common_columns], test_static_filled[common_columns]

#### What it does :

Input : None

Output : Static train and test set usable for ML


Returns the train and test sets of the static data, after the following steps:
        - Strings are converted to numbers (values that cannot be parsed become NaN)
        - Columns containing only NaN are dropped
        - Remaining NaN are replaced with the mean of the corresponding train column
        - Only the columns present in both the train and the test set are kept

#### How to use it 

In [8]:
from transplant.data.learningset import Learningset
learningset=Learningset()

# Call the packaged get_static_filled through a Learningset instance;
# it returns the train and test static sets as a pair.
train_static , test_static = learningset.get_static_filled()

In [9]:
# Preview the first three rows of the filled static train set.
train_static.head(3)

Unnamed: 0,Age_donor,Aspirations_donor,BMI_donor,Donneur_CPT,Insuffisance_renale,LAS,PAPS,PFO,PF_donor,Poids,...,preoperative_mechanical_ventilation,preoperative_pulmonary_hypertension,preoperative_vasopressor,retransplant,sexe,super_urgence,thoracic_surgery_history,time_on_waiting_liste,transplanted_twice_during_study_period,target
324,20,3,25.502925,5430.0,0,34.4,94.677165,0.0,490.0,76.0,...,0.0,0.0,0.0,0,1,0,1,16,0,1
323,56,3,25.502925,6750.0,0,35.8,35.0,0.0,513.0,50.0,...,0.0,0.0,0.0,0,1,0,1,33,0,1
328,22,3,25.502925,6750.0,0,33.2,32.0,0.0,301.0,70.0,...,0.0,0.0,0.0,0,1,0,1,7,0,0


## get_data_merged_dynamic_flatten_full

In [10]:
def get_data_merged_dynamic_flatten_full(target_format="cls", centered_reduced=False, full_df=False):
    """Build train/test sets where each time series is flattened to summary statistics.

    The filled static data is extended with the operation timing features,
    then merged with four aggregates of every dynamic column, computed per
    patient with ``np.mean``, ``np.std``, ``np.amax`` and ``np.amin``. The
    aggregated columns are suffixed with ``_mean``, ``_std``, ``_max`` and
    ``_min``. Missing dynamic values are replaced with 0 before aggregating.

    Parameters
    ----------
    target_format : {"cls", "One_Hot"}
        ``"cls"`` returns the raw ``target`` column; ``"One_Hot"`` maps
        0 -> [1, 0] and 1 -> [0, 1]. Ignored when ``full_df`` is True.
    centered_reduced : bool
        If True, the features (not the target) are standardised with the
        train mean and standard deviation.
    full_df : bool
        If True, return the two merged DataFrames (target included).

    Returns
    -------
    ``(train_df, test_df)`` when ``full_df`` is True, otherwise
    ``(X_train, X_test, y_train, y_test, X_col)`` where the first four are
    numpy arrays and ``X_col`` holds the feature names.

    Raises
    ------
    ValueError
        If ``full_df`` is False and ``target_format`` is not supported
        (the function used to return None silently in that case).
    """
    if not full_df and target_format not in ("cls", "One_Hot"):
        raise ValueError(
            'target_format must be "cls" or "One_Hot", got %r' % (target_format,))

    from transplant.data.learningset import Learningset
    learningset = Learningset()

    # Static data already usable for ML, and the raw dynamic data.
    train_static_0, test_static_0 = learningset.get_static_filled()
    train_dynamic_0, test_dynamic_0 = dataset.get_dynamic()

    # Missing dynamic values are replaced with 0.
    train_dynamic_0 = train_dynamic_0.fillna(0)
    test_dynamic_0 = test_dynamic_0.fillna(0)

    # Adds, among others, the length of the operation.
    train_static_1 = add_start_end_length_op_to_static(train_static_0, train_dynamic_0)
    test_static_1 = add_start_end_length_op_to_static(test_static_0, test_dynamic_0)

    # Functions applied to each patient's time series, with their column suffix.
    aggregations = [(np.mean, 'mean'), (np.std, 'std'), (np.amax, 'max'), (np.amin, 'min')]

    # The groups and the patient ids do not depend on the aggregation,
    # so they are built once instead of once per loop iteration.
    train_grouped = train_dynamic_0.drop(['time'], axis=1).groupby(['id_patient'], as_index=False)
    test_grouped = test_dynamic_0.drop(['time'], axis=1).groupby(['id_patient'], as_index=False)

    train_id = train_grouped['id_patient'].apply(np.mean)
    test_id = test_grouped['id_patient'].apply(np.mean)

    train_glob_0, test_glob_0 = train_static_1, test_static_1

    for func, suffix in aggregations:
        train_dynamic_flat = train_grouped.apply(func).rename(
            columns=lambda col: col if col == 'id_patient' else col + '_' + suffix)
        test_dynamic_flat = test_grouped.apply(func).rename(
            columns=lambda col: col if col == 'id_patient' else col + '_' + suffix)

        # Restore the real patient id (the aggregation was applied to it as well).
        train_dynamic_flat['id_patient'] = train_id
        test_dynamic_flat['id_patient'] = test_id

        train_glob_0, test_glob_0 = merge_dyn_sta(
            train_glob_0, train_dynamic_flat, test_glob_0, test_dynamic_flat)

    if full_df:
        return train_glob_0, test_glob_0

    features_train = train_glob_0.drop(['target'], axis=1)
    features_test = test_glob_0.drop(['target'], axis=1)
    feature_names = features_train.columns

    if centered_reduced:
        features_train, features_test = center_reduce_data(features_train, features_test)

    X_train = np.array(features_train)
    X_test = np.array(features_test)

    if target_format == "cls":
        y_train = np.array(train_glob_0['target'])
        y_test = np.array(test_glob_0['target'])
    else:
        # One-hot targets are only built when requested.
        dic_to_One_Hot = {0: [1, 0], 1: [0, 1]}
        y_train = np.array(list(train_glob_0['target'].map(dic_to_One_Hot)))
        y_test = np.array(list(test_glob_0['target'].map(dic_to_One_Hot)))

    return X_train, X_test, y_train, y_test, feature_names



#### What it does :

Input : 

    -target_format="cls" if you want the target as 0 or 1 , "One_Hot" if you want the target to be [1,0] or [0,1]
    
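    -centered_reduced=True if you want the features (everything except the target) to be standardised: the mean and standard deviation are computed on the train set (which ends up with mean 0 and variance 1) and the same transformation is applied to the test set. False to keep the raw values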
    
    -full_df=True if you want train and test as full dataframe (including the target)
    
Output :
  
    Returns the train and test sets where the static data is merged with the dynamic data, each time series being summarised by its mean, standard deviation, max and min.
    If full_df=True, returns 2 dataframes (train and test).
    If full_df=False, returns 5 objects: the numpy arrays X_train, X_test, y_train, y_test, and X_col (the names of the features).

#### How to use it 

In [11]:
from transplant.data.learningset import Learningset
learningset = Learningset()

# full_df=True: get the merged train and test sets back as two values (target included).
train, test = learningset.get_data_merged_dynamic_flatten_full(full_df=True)

In [12]:
# Preview the first three rows of the merged train set (static + flattened dynamic features).
train.head(3)

Unnamed: 0,Age_donor,Aspirations_donor,BMI_donor,Donneur_CPT,Insuffisance_renale,LAS,PAPS,PFO,PF_donor,Poids,...,PNIm_min,PNIs_min,Pmax_min,Pmean_min,SpO2_min,SvO2 (m)_min,Temp_min,VT_min,declampage_cote1_done_min,declampage_cote2_done_min
0,22,3,25.502925,6750.0,0,33.2,32.0,0.0,301.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,56,1,25.502925,6151.95122,0,39.757613,94.677165,0.0,388.0,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44,1,22.481329,4950.0,0,46.1,60.0,0.0,486.0,43.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from transplant.data.learningset import Learningset
learningset = Learningset()

# Default arguments: the call returns five values (features, targets and feature names).
X_train, X_test, y_train , y_test, X_col = learningset.get_data_merged_dynamic_flatten_full()

In [14]:
# Check the shapes of the returned feature and target arrays.
X_train.shape, X_test.shape, y_train.shape , y_test.shape

((228, 142), (102, 142), (228,), (102,))

## get_data_merged_dynamic

In [15]:
def get_data_merged_dynamic(target_format="cls", full_df=False):
    """Build train/test sets where static data is merged with the full time series.

    The filled static data is extended with the operation timing features,
    then every dynamic column is attached as one array per patient.
    Missing dynamic values are filled, in both sets, with a train statistic:
    the average over patients of each patient's own mean.

    Parameters
    ----------
    target_format : {"cls", "One_Hot"}
        ``"cls"`` returns the raw ``target`` column; ``"One_Hot"`` maps
        0 -> [1, 0] and 1 -> [0, 1]. Ignored when ``full_df`` is True.
    full_df : bool
        If True, return the two merged DataFrames (target included).

    Returns
    -------
    ``(train_df, test_df)`` when ``full_df`` is True, otherwise
    ``(X_train, X_test, y_train, y_test, X_col)`` where the first four are
    numpy arrays and ``X_col`` holds the feature names.

    Raises
    ------
    ValueError
        If ``full_df`` is False and ``target_format`` is not supported
        (the function used to return None silently in that case).
    """
    if not full_df and target_format not in ("cls", "One_Hot"):
        raise ValueError(
            'target_format must be "cls" or "One_Hot", got %r' % (target_format,))

    from transplant.data.learningset import Learningset
    learningset = Learningset()

    train_static_0, test_static_0 = learningset.get_static_filled()
    train_dynamic_0, test_dynamic_0 = dataset.get_dynamic()

    # Fill NaN with the train mean (mean over patients of the per-patient means).
    mean_dynamic_train = train_dynamic_0.groupby(['id_patient']).mean().mean()
    train_dynamic_0 = train_dynamic_0.fillna(mean_dynamic_train)
    test_dynamic_0 = test_dynamic_0.fillna(mean_dynamic_train)

    # Adds, among others, the length of the operation.
    train_static_1 = add_start_end_length_op_to_static(train_static_0, train_dynamic_0)
    test_static_1 = add_start_end_length_op_to_static(test_static_0, test_dynamic_0)

    # Merge static and dynamic data, keeping the full time series.
    train_glob = get_timeseries_in_array(train_static_1, train_dynamic_0)
    test_glob = get_timeseries_in_array(test_static_1, test_dynamic_0)

    if full_df:
        return train_glob, test_glob

    features_train = train_glob.drop(['target'], axis=1)
    features_test = test_glob.drop(['target'], axis=1)

    X_train = np.array(features_train)
    X_test = np.array(features_test)

    if target_format == "cls":
        y_train = np.array(train_glob['target'])
        y_test = np.array(test_glob['target'])
    else:
        # One-hot targets are only built when requested.
        dic_to_One_Hot = {0: [1, 0], 1: [0, 1]}
        y_train = np.array(list(train_glob['target'].map(dic_to_One_Hot)))
        y_test = np.array(list(test_glob['target'].map(dic_to_One_Hot)))

    return X_train, X_test, y_train, y_test, features_train.columns

#### What it does :

Input : 

    -target_format="cls" if you want the target as 0 or 1 , "One_Hot" if you want the target to be [1,0] or [0,1]
    
    -full_df=True if you want train and test as full dataframe (including the target)
    
Output :
  
    Returns the train and test sets where the static data is merged with the dynamic data (full time series, one array per patient and per series).
    
    If full_df=True, returns 2 dataframes (train and test).
    If full_df=False, returns 5 objects: the numpy arrays X_train, X_test, y_train, y_test, and X_col (the names of the features).

#### How to use it 

In [16]:
from transplant.data.learningset import Learningset
learningset = Learningset()

# NOTE(review): unlike the previous examples, this calls the notebook-local
# get_data_merged_dynamic defined above rather than a method of `learningset`,
# so the instance created in this cell is not used here - confirm this is intended.
train, test = get_data_merged_dynamic(full_df=True)

In [17]:
# Preview the first three rows: each dynamic column now holds one array per patient.
train.head(3)

Unnamed: 0,Age_donor,Aspirations_donor,BMI_donor,Donneur_CPT,Insuffisance_renale,LAS,PAPS,PFO,PF_donor,Poids,...,PNIm,PNIs,Pmax,Pmean,SpO2,SvO2 (m),Temp,VT,declampage_cote1_done,declampage_cote2_done
0,22,3,25.502925,6750.0,0,33.2,32.0,0.0,301.0,70.0,...,"[0.0, 0.0, 0.0, 0.0, 108.0, 0.0, 0.0, 0.0, 102...","[0.0, 0.0, 0.0, 0.0, 142.0, 0.0, 0.0, 0.0, 130...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 0, ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[99, 99, 97, 97, 97, 97, 97, 97, 96, 96, 97, 9...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,56,1,25.502925,6151.95122,0,39.757613,94.677165,0.0,388.0,52.0,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 2, 1, 4, 3, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, ...","[99, 100, 100, 99, 99, 71, 100, 100, 100, 100,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,44,1,22.481329,4950.0,0,46.1,60.0,0.0,486.0,43.0,...,"[4.7212439550048755, 4.7212439550048755, 4.721...","[6.3442819371062615, 6.3442819371062615, 6.344...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[89, 90, 88, 91, 93, 94, 94, 95, 94, 94, 93, 9...","[46.77534140375992, 46.77534140375992, 46.7753...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
