In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import datetime
import seaborn as sns
from datetime import date
from sklearn.preprocessing import LabelEncoder
from statsmodels.graphics.tsaplots import plot_acf
import warnings
warnings.filterwarnings('ignore') #action='once')

In [15]:
df_fildate = pd.read_table("./processed/filtered_data.csv", sep=',', index_col = 0)
cross_id_dom50 = pd.read_table("./processed/cross_id_dom.csv", sep=',', index_col = 0)
cross_id_dom50.head()

Unnamed: 0,MDN,institution,NotitieID,Typenotitie,Notitiedatum,annotated,ADM_lvl,ATT_lvl,BER_lvl,ENR_lvl,ETN_lvl,FAC_lvl,INS_lvl,MBW_lvl,STM_lvl
0,138098,161,161,160,161,161,135,1,4,23,40,24,16,18,36
1,406479,105,105,103,105,105,89,2,0,3,43,24,20,2,15
2,544716,109,109,107,109,109,82,1,0,10,36,41,5,8,24
3,762325,66,66,66,66,66,56,0,2,9,29,7,8,4,3
5,840595,91,91,91,91,91,80,0,0,17,27,40,8,8,18


## Aligning time

In [16]:
def align_time(df):
    '''Align first day of notes per patient as being day 0, and calculates all consecutive days from that.'''
    
    df['year'] = df['Notitiedatum'].str.split('-').str[0]
    df['month'] = df['Notitiedatum'].str.split('-').str[1]
    df['day'] = df['Notitiedatum'].str.split('-').str[2]

    #array with idx for first occurrence of a patient (trick from https://stackoverflow.com/questions/41255215/pandas-find-first-occurrence)
    first_occ= [df.MDN.eq(patient).idxmax() for patient in df.MDN.unique()]

    deltas = []
    counter = 0
    for idx in first_occ: 
        f_date = date(int(df['year'][idx]), int(df['month'][idx]), int(df['day'][idx])) #reference date (first visit)
        df_patient = df[df.MDN ==df.MDN[idx]]
        counter +=  len(df_patient)
        for year,month,day in zip(df_patient['year'],df_patient['month'],df_patient['day']): #smaller dataframe with only a given patient data
            l_date = date(int(year), int(month), int(day)) #put i to generalize
            deltas.append(str(l_date-f_date).split(" ")[0].split(":")[0])
            if int(str(l_date-f_date).split(" ")[0].split(":")[0]) > 460: 
                print(str(l_date-f_date).split(" ")[0].split(":")[0])
                print(year, month, day)
    df['Days_count'] = deltas
    print(len(deltas),len(df['Notitiedatum']),counter)

    #encode days_count as int64
    labelenc = LabelEncoder()
    df.iloc[:,-1] = labelenc.fit_transform(df.iloc[:,-1].values)
    
    return df 

df_fildate = align_time(df_fildate)      

22045 22045 22045


In [17]:
#updating dataframe with all filters included
m = df_fildate.MDN.isin(cross_id_dom50.MDN)
df_final = df_fildate[m]

labelenc = LabelEncoder()
df_final.iloc[:,-1] = labelenc.fit_transform(df_final.iloc[:,-1].values)

- Use only the first admission data (so not use the outliers that happens months later when the person goes back for a checkup)

In [18]:
#difference between day of note and previous date of note. for first note of patient this value is defined as zero.
df_final['Dif'] = df_final.Days_count.diff(periods=1)
df_final['Dif'] = df_final['Dif'].clip(lower=0, upper=None, axis=0)
df_final.iloc[:,-1] = labelenc.fit_transform(df_final.iloc[:,-1].values)

In [19]:
#check how many days without any note indicates the person is not anymore admitted. Initial filter is if it is 3 days in a row without notes, notes after are disregarded
#from this point on the trimmed dataframe based on final dataset will be called df for simplicity
df = df_final[df_final['Dif'] < 3]
print(df.shape, df_final.shape)

(17912, 20) (20662, 20)


- Interpolating data per domain (in principle for ADM only) and creating time series features

In [8]:
#checking missing values. Here we focus on ADM_lvl mainly
df.isnull().sum()

institution         0
MDN                 0
NotitieID           0
Typenotitie       229
Notitiedatum        0
annotated           0
ADM_lvl          4777
ATT_lvl         17746
BER_lvl         17542
ENR_lvl         15966
ETN_lvl         10443
FAC_lvl         15019
INS_lvl         16257
MBW_lvl         17285
STM_lvl         14146
year                0
month               0
day                 0
Days_count          0
Dif                 0
delta_7             0
delta_30            0
delta_60            0
dtype: int64

In [9]:
# Interpolate the dataset based on previous/next values..
def impute_interpolate(dataset, col):
    dataset[col] = dataset[col].interpolate()
    # And fill the initial data points if needed:
    dataset[col] = dataset[col].fillna(method='bfill')
    return dataset

In [10]:
def add_lag(df):
    '''Interpolates and creates new features by shifting window 1,2 and 3 instances within each patient.'''
    
    first_occ = [df.MDN.eq(patient).idxmax() for patient in df.MDN.unique()]
    #creating different lags that can affect the time series
    df['lag1_ADM'] = df.loc[:, 'lag1_ADM'] = 0
    df['lag2_ADM'] = df.loc[:, 'lag2_ADM'] = 0
    df['lag3_ADM'] = df.loc[:, 'lag3_ADM'] = 0

    counter = 0

    for idx in first_occ: 
        df_patient = df[df.MDN == df.MDN[idx]]
        counter +=  len(df_patient)
        for domain in df_patient['ADM_lvl']: #smaller dataframe with only a given patient data
            # Interpolate the column based on previous/next values for ADM
            df['ADM_lvl'].loc[df.MDN == df.MDN[idx]] = df_patient['ADM_lvl'].interpolate()
            df['ADM_lvl'].loc[df.MDN == df.MDN[idx]] = df_patient['ADM_lvl'].fillna(method='bfill')

            #by shifting one element.  
            df['lag1_ADM'].loc[df.MDN == df.MDN[idx]] = df_patient.ADM_lvl.shift(1)
            df['lag2_ADM'].loc[df.MDN == df.MDN[idx]] = df_patient.ADM_lvl.shift(2)
            df['lag3_ADM'].loc[df.MDN == df.MDN[idx]] = df_patient.ADM_lvl.shift(3)

    df.isnull().sum()

    labelenc = LabelEncoder()
    #encode days_count as int64
    df.iloc[:,-1] = labelenc.fit_transform(df.iloc[:,-1].values)
    df.iloc[:,-2] = labelenc.fit_transform(df.iloc[:,-2].values)
    df.iloc[:,-3] = labelenc.fit_transform(df.iloc[:,-3].values)
    return df

df = add_lag(df)

- Creating feature with average result of all domains per instance

In [11]:
#creating feature that is average of all domains
columns = ['ADM_lvl', 'ATT_lvl', 'BER_lvl', 'ENR_lvl', 'ETN_lvl', 'FAC_lvl', 'INS_lvl', 'MBW_lvl', 'STM_lvl']  
df['avg_domains'] = df[columns].mean(axis=1)
df

Unnamed: 0,institution,MDN,NotitieID,Typenotitie,Notitiedatum,annotated,ADM_lvl,ATT_lvl,BER_lvl,ENR_lvl,...,day,Days_count,Dif,delta_7,delta_30,delta_60,lag1_ADM,lag2_ADM,lag3_ADM,avg_domains
67031,amc,138098,457333386,Zorgplan/VPK rapportage,2020-10-14,False,3.034180,,,1.566406,...,14,0,0,1,1,1,11934,11985,8023,2.933268
77391,amc,138098,456811920,Voortgangsverslag,2020-10-14,False,3.034180,,,1.410156,...,14,0,0,1,1,1,7768,8535,9372,2.383789
66123,amc,138098,457532765,Voortgangsverslag,2020-10-15,False,2.912109,,,1.762695,...,15,1,1,1,1,1,6286,8536,9371,2.337402
77392,amc,138098,457533093,Zorgplan/VPK rapportage,2020-10-15,False,3.478516,,,,...,15,1,0,1,1,1,6185,6244,9370,3.683594
79526,amc,138098,457617575,Zorgplan/VPK rapportage,2020-10-15,False,3.478516,,,,...,15,1,0,1,1,1,7740,6143,6213,3.704102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15952,amc,9964647,412188625,Zorgplan/VPK rapportage,2020-04-28,False,2.796875,,,,...,28,32,0,0,0,1,6534,8537,6778,2.796875
11403,amc,9964647,411779834,Brief,2020-04-29,False,1.609985,,,,...,29,33,1,0,0,1,6064,6490,7102,1.609985
11405,amc,9964647,412538539,Telefonisch contact,2020-04-29,False,4.156250,,,1.518555,...,29,33,0,0,0,1,3768,6024,6458,2.583659
3057,amc,9964647,412727649,Telefonisch contact,2020-04-30,False,3.670898,,,1.774414,...,30,34,1,0,0,1,6989,3749,5994,2.722656


- Create features using different window techniques

In [12]:
def rolling_and_expanding_mean(df):
    '''Rolling window feature: the size of the window is constant while the window slides forward in time. 
    calculating some statistical values based on past values using a rolling window method.
     
    Expanding window: an advanced version of the rolling window technique. 
    This feature takes all the past values into account.'''

    first_occ = [df.MDN.eq(patient).idxmax() for patient in df.MDN.unique()]

    counter = 0

    for idx in first_occ: 
        df_patient = df[df.MDN == df.MDN[idx]]
        counter +=  len(df_patient)
        for domain in df_patient['ADM_lvl']: #smaller dataframe with only a given patient data
            
            #mean, sum, min, max value

            df['rolling_mean3'] = df['ADM_lvl'].rolling(window=3).mean()
            df['rolling_mean7'] = df['ADM_lvl'].rolling(window=7).mean()

            df['rolling_sum3'] = df['ADM_lvl'].rolling(window=3).sum()
            df['rolling_sum7'] = df['ADM_lvl'].rolling(window=7).sum()

            df['rolling_min3'] = df['ADM_lvl'].rolling(window=3).min()
            df['rolling_min7'] = df['ADM_lvl'].rolling(window=7).min()

            df['rolling_max3'] = df['ADM_lvl'].rolling(window=3).max()
            df['rolling_max7'] = df['ADM_lvl'].rolling(window=7).max()

            
            df['expanding_mean'] = df['ADM_lvl'].expanding(2).mean()

    df.isnull().sum()
    return df

df = rolling_and_expanding_mean(df)

In [13]:
df.to_csv('/Users/brunaguedes/Documents/GitHub/a-proof-time-series/processed/feature_engineering.csv')