In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sts
import seaborn as sns
from pandas.api.types import CategoricalDtype
import warnings
warnings.filterwarnings("ignore")

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def get_missing_values_df(df):
    missing_values_df = pd.DataFrame(columns=['year', 'count'] + list(df.columns))
    for year in df['JAHR'].unique():
        year_df = df[df['JAHR'] == year]
        count = len(year_df)
        missing_values = count-year_df.isnull().sum()
        missing_values_df = missing_values_df.append({'year': year, 'count': count, **missing_values}, ignore_index=True)
    return missing_values_df

from IPython.display import HTML

# Define a custom CSS style to make tables horizontally scrollable
css_style = """
<style>
    table.dataframe {
        display: block;
        overflow-x: auto;
    }
</style>
"""

HTML(css_style)


############## R E L E V A N T   C O D E

warnings.filterwarnings("ignore")
data = pd.read_csv('./data/kzp-2008-2020-timeseries.csv', encoding="ISO-8859-1")
df=data
df3=data
df5=data
df7=data
missing_values_df = pd.DataFrame(columns=['year', 'count'] + list(df.columns))

#create df with most possible non 0 columns per year
# Loop through each year in the 'JAHR' column
for year in df['JAHR'].unique():
    year_df = df[df['JAHR'] == year]
    count = len(year_df)
    #count missing values
    missing_values = year_df.isnull().sum()
    # amount of non missing values
    non_missing_values = count - missing_values
    #adding count
    missing_values_df = missing_values_df.append({'year': year, 'count': count, **non_missing_values}, ignore_index=True)

years = [2014, 2015, 2016, 2017, 2018, 2019, 2020] #years to keep
cols = []
for var in years:
    df_year = missing_values_df[missing_values_df['year'] == var] # only want specified years
    cols.extend([col for col in df_year.columns if (df_year[col] == 0).any()]) # add col names where value=0 to list
cols_to_remove = list(set(cols)) # reduce the list to only include unique values
#print(cols_to_remove)
for var in cols_to_remove:
    if var in df.columns:
        df=df.drop(var, axis=1)
        #print(var,",was dropped")
years_to_drop=[2013,2012,2011,2010,2009,2008]

df=df.drop(df[df['JAHR'].isin(years_to_drop)].index)
df2=df
#onehot encode certain variables
df["Akt"]=df['Akt'].astype('string')
df["Akt"]=df['Akt'].str.split(', ')
Akt_dummies = pd.get_dummies(df['Akt'].explode())
Akt_dummies= Akt_dummies.groupby(level=0).sum()
Akt_dummies.columns= 'Akt_' + Akt_dummies.columns
#print(Akt_dummies)


df["SL"]=df["SL"].fillna("none")
df["SL"]=df['SL'].astype('string')
df["SL"]=df['SL'].str.split(', ')
SL_dummies = pd.get_dummies(df['SL'].explode())
SL_dummies= SL_dummies.groupby(level=0).sum()
SL_dummies=SL_dummies.drop("none", axis=1)
SL_dummies.columns= 'SL_' + SL_dummies.columns
#print(SL_dummies)


df["SA"]=df["SA"].fillna("none")
df["SA"]=df['SA'].astype('string')
df["SA"]=df['SA'].str.split(', ')
SA_dummies = pd.get_dummies(df['SA'].explode())
SA_dummies= SA_dummies.groupby(level=0).sum()
SA_dummies=SA_dummies.drop("none", axis=1)
SA_dummies.columns= 'SA_' + SA_dummies.columns
#print(SA_dummies)

df["WB"]=df["WB"].fillna("none")
df["WB"]=df['WB'].astype('string')
df["WB"]=df['WB'].str.split(', ')
WB_dummies = pd.get_dummies(df['WB'].explode())
WB_dummies= WB_dummies.groupby(level=0).sum()
WB_dummies=WB_dummies.drop("none", axis=1)
WB_dummies.columns= 'WB_' + WB_dummies.columns

add_dummies=pd.get_dummies(df, columns=['Typ', 'KT', 'RForm'])

DF = pd.concat([df2, Akt_dummies, SL_dummies, SA_dummies, WB_dummies, add_dummies], axis=1)
DF=DF.drop(['SA','SL','Akt','Unnamed: 0','WB'], axis=1)
DF = DF.loc[:, ~DF.columns.duplicated()]
print(DF.columns)
#DF[['Typ', 'KT', 'Ort', 'RForm']]=df[['Typ', 'KT', 'Ort', 'RForm']].astype('category')
#DF['JAHR']=DF['JAHR'].astype(CategoricalDtype(categories=[2014, 2015, 2016, 2017, 2018, 2019, 2020],ordered=True))
######################################################################################################################################
# Loop through each year in the 'JAHR' column
missing_values_df2 = pd.DataFrame(columns=['year', 'count'] + list(df3.columns))
for year in df3['JAHR'].unique():
    year_df = df3[df3['JAHR'] == year]
    count = len(year_df)
    #count missing values
    missing_values = year_df.isnull().sum()
    # amount of non missing values
    non_missing_values = count - missing_values
    #adding count
    missing_values_df2 = missing_values_df2.append({'year': year, 'count': count, **non_missing_values}, ignore_index=True)

years = [ 2016, 2017, 2018, 2019, 2020] #years to keep
cols = []
for var in years:
    df_year = missing_values_df2[missing_values_df2['year'] == var] # only want specified years
    cols.extend([col for col in df_year.columns if (df_year[col] == 0).any()]) # add col names where value=0 to list
cols_to_remove2 = list(set(cols)) # reduce the list to only include unique values
#print(cols_to_remove)
for var in cols_to_remove2:
    if var in df3.columns:
        df3=df3.drop(var, axis=1)
        #print(var,",was dropped")
years_to_drop=[2015, 2014 ,2013,2012,2011,2010,2009,2008]

df3=df3.drop(df3[df3['JAHR'].isin(years_to_drop)].index)
df4=df3
df3["Akt"]=df3['Akt'].astype('string')
df3["Akt"]=df3['Akt'].str.split(', ')
Akt_dummies = pd.get_dummies(df3['Akt'].explode())
Akt_dummies= Akt_dummies.groupby(level=0).sum()
Akt_dummies.columns= 'Akt_' + Akt_dummies.columns
#print(Akt_dummies)


df3["SL"]=df3["SL"].fillna("none")
df3["SL"]=df3['SL'].astype('string')
df3["SL"]=df3['SL'].str.split(', ')
SL_dummies = pd.get_dummies(df3['SL'].explode())
SL_dummies= SL_dummies.groupby(level=0).sum()
SL_dummies=SL_dummies.drop("none", axis=1)
SL_dummies.columns= 'SL_' + SL_dummies.columns
#print(SL_dummies)


df3["SA"]=df3["SA"].fillna("none")
df3["SA"]=df3['SA'].astype('string')
df3["SA"]=df3['SA'].str.split(', ')
SA_dummies = pd.get_dummies(df3['SA'].explode())
SA_dummies= SA_dummies.groupby(level=0).sum()
SA_dummies=SA_dummies.drop("none", axis=1)
SA_dummies.columns= 'SA_' + SA_dummies.columns
#print(SA_dummies)

df3["WB"]=df3["WB"].fillna("none")
df3["WB"]=df3['WB'].astype('string')
df3["WB"]=df3['WB'].str.split(', ')
WB_dummies = pd.get_dummies(df3['WB'].explode())
WB_dummies= WB_dummies.groupby(level=0).sum()
WB_dummies=WB_dummies.drop("none", axis=1)
WB_dummies.columns= 'WB_' + WB_dummies.columns
#print(SA_dummies)

df3["LA"]=df3["LA"].fillna("none")
df3["LA"]=df3['LA'].astype('string')
df3["LA"]=df3['LA'].str.split(', ')
LA_dummies = pd.get_dummies(df3['LA'].explode())
LA_dummies= LA_dummies.groupby(level=0).sum()
LA_dummies.columns= 'LA_' + LA_dummies.columns

add_dummies2=pd.get_dummies(df3, columns=['Typ', 'KT', 'RForm'])

DF2 = pd.concat([df4, Akt_dummies, SL_dummies, SA_dummies, WB_dummies, LA_dummies,add_dummies2], axis=1)
DF2=DF2.drop(['SA','SL','Akt','Unnamed: 0','WB','LA'], axis=1)
print(DF2.columns)
DF2 = DF2.loc[:, ~DF2.columns.duplicated()]
####################################################################################################################
missing_values_df3 = pd.DataFrame(columns=['year', 'count'] + list(df5.columns))
# Loop through each year in the 'JAHR' column
for year in df5['JAHR'].unique():
    year_df = df5[df5['JAHR'] == year]
    count = len(year_df)
    #count missing values
    missing_values = year_df.isnull().sum()
    # amount of non missing values
    non_missing_values = count - missing_values
    #adding count
    missing_values_df3 = missing_values_df3.append({'year': year, 'count': count, **non_missing_values}, ignore_index=True)

years = [ 2013,2012,2011,2010] #years to keep
cols = []
for var in years:
    df_year = missing_values_df3[missing_values_df3['year'] == var] # only want specified years
    cols.extend([col for col in df_year.columns if (df_year[col] == 0).any()]) # add col names where value=0 to list
cols_to_remove3 = list(set(cols)) # reduce the list to only include unique values
#print(cols_to_remove)
for var in cols_to_remove3:
    if var in df5.columns:
        df5=df5.drop(var, axis=1)
        #print(var,",was dropped")
years_to_drop=[2015, 2014 ,2009,2008, 2016, 2017, 2018, 2019, 2020]

df5=df5.drop(df5[df5['JAHR'].isin(years_to_drop)].index)
df6=df5
df5["Akt"]=df5['Akt'].astype('string')
df5["Akt"]=df5['Akt'].str.split(', ')
Akt_dummies = pd.get_dummies(df5['Akt'].explode())
Akt_dummies= Akt_dummies.groupby(level=0).sum()
Akt_dummies.columns= 'Akt_' + Akt_dummies.columns
#print(Akt_dummies)


df5["SL"]=df5["SL"].fillna("none")
df5["SL"]=df5['SL'].astype('string')
df5["SL"]=df5['SL'].str.split(', ')
SL_dummies = pd.get_dummies(df5['SL'].explode())
SL_dummies= SL_dummies.groupby(level=0).sum()
SL_dummies=SL_dummies.drop("none", axis=1)
SL_dummies.columns= 'SL_' + SL_dummies.columns
#print(SL_dummies)


df5["SA"]=df5["SA"].fillna("none")
df5["SA"]=df5['SA'].astype('string')
df5["SA"]=df5['SA'].str.split(', ')
SA_dummies = pd.get_dummies(df5['SA'].explode())
SA_dummies= SA_dummies.groupby(level=0).sum()
SA_dummies=SA_dummies.drop("none", axis=1)
SA_dummies.columns= 'SA_' + SA_dummies.columns
#print(SA_dummies)

df5["WB"]=df5["WB"].fillna("none")
df5["WB"]=df5['WB'].astype('string')
df5["WB"]=df5['WB'].str.split(', ')
WB_dummies = pd.get_dummies(df5['WB'].explode())
WB_dummies= WB_dummies.groupby(level=0).sum()
WB_dummies=WB_dummies.drop("none", axis=1)
WB_dummies.columns= 'WB_' + WB_dummies.columns
#print(SA_dummies)

add_dummies3=pd.get_dummies(df5, columns=['Typ', 'KT'])

DF3 = pd.concat([df6, Akt_dummies, SL_dummies, SA_dummies, WB_dummies, add_dummies3], axis=1)
DF3=DF3.drop(['SA','SL','Akt','Unnamed: 0','WB'], axis=1)
print(DF3.columns)
DF3 = DF3.loc[:, ~DF3.columns.duplicated()]
####################################################################################################################
missing_values_df4 = pd.DataFrame(columns=['year', 'count'] + list(df7.columns))
# Loop through each year in the 'JAHR' column
for year in df7['JAHR'].unique():
    year_df = df7[df7['JAHR'] == year]
    count = len(year_df)
    #count missing values
    missing_values = year_df.isnull().sum()
    # amount of non missing values
    non_missing_values = count - missing_values
    #adding count
    missing_values_df4 = missing_values_df4.append({'year': year, 'count': count, **non_missing_values}, ignore_index=True)

years = [2010 ,2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020] #years to keep
cols = []
for var in years:
    df_year = missing_values_df4[missing_values_df4['year'] == var] # only want specified years
    cols.extend([col for col in df_year.columns if (df_year[col] == 0).any()]) # add col names where value=0 to list
cols_to_remove4 = list(set(cols)) # reduce the list to only include unique values
#print(cols_to_remove)
for var in cols_to_remove4:
    if var in df7.columns:
        df7=df7.drop(var, axis=1)
        #print(var,",was dropped")
years_to_drop=[2009,2008]

df7=df7.drop(df7[df7['JAHR'].isin(years_to_drop)].index)
df8=df7
df7["Akt"]=df7['Akt'].astype('string')
df7["Akt"]=df7['Akt'].str.split(', ')
Akt_dummies = pd.get_dummies(df7['Akt'].explode())
Akt_dummies= Akt_dummies.groupby(level=0).sum()
Akt_dummies.columns= 'Akt_' + Akt_dummies.columns
#print(Akt_dummies)


df7["SL"]=df7["SL"].fillna("none")
df7["SL"]=df7['SL'].astype('string')
df7["SL"]=df7['SL'].str.split(', ')
SL_dummies = pd.get_dummies(df7['SL'].explode())
SL_dummies= SL_dummies.groupby(level=0).sum()
SL_dummies=SL_dummies.drop("none", axis=1)
SL_dummies.columns= 'SL_' + SL_dummies.columns
#print(SL_dummies)


df7["SA"]=df7["SA"].fillna("none")
df7["SA"]=df7['SA'].astype('string')
df7["SA"]=df7['SA'].str.split(', ')
SA_dummies = pd.get_dummies(df7['SA'].explode())
SA_dummies= SA_dummies.groupby(level=0).sum()
SA_dummies=SA_dummies.drop("none", axis=1)
SA_dummies.columns= 'SA_' + SA_dummies.columns
#print(SA_dummies)

df7["WB"]=df7["WB"].fillna("none")
df7["WB"]=df7['WB'].astype('string')
df7["WB"]=df7['WB'].str.split(', ')
WB_dummies = pd.get_dummies(df7['WB'].explode())
WB_dummies= WB_dummies.groupby(level=0).sum()
WB_dummies=WB_dummies.drop("none", axis=1)
WB_dummies.columns= 'WB_' + WB_dummies.columns
#print(SA_dummies)

add_dummies4=pd.get_dummies(df7, columns=['Typ', 'KT'])

DF4 = pd.concat([df8, Akt_dummies, SL_dummies, SA_dummies, WB_dummies,add_dummies4], axis=1)
DF4=DF4.drop(['SA','SL','Akt','Unnamed: 0','WB'], axis=1)
print(DF4.columns)
DF4 = DF4.loc[:, ~DF4.columns.duplicated()]


Index(['JAHR', 'KT', 'Inst', 'Adr', 'Ort', 'Typ', 'AnzStand', 'RForm',
       'PtageStatMST', 'AustStatMST',
       ...
       'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH', 'RForm_R1',
       'RForm_R2', 'RForm_R3', 'RForm_R4'],
      dtype='object', length=161)
Index(['JAHR', 'KT', 'Inst', 'Adr', 'Ort', 'Typ', 'AnzStand', 'RForm',
       'AmbKonsT', 'PtageStatMST',
       ...
       'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH', 'RForm_R1',
       'RForm_R2', 'RForm_R3', 'RForm_R4'],
      dtype='object', length=319)
Index(['JAHR', 'KT', 'Inst', 'Adr', 'Ort', 'Typ', 'AnzStand', 'PtageStatT',
       'AustStatT', 'NeugStatT',
       ...
       'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS',
       'KT_ZG', 'KT_ZH'],
      dtype='object', length=296)
Index(['JAHR', 'KT', 'Inst', 'Adr', 'Ort', 'Typ', 'AnzStand', 'Ops', 'Gebs',
       'CMIb',
       ...
       'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS',
       'KT_ZG', 'KT_

In [3]:
huh=DF2.columns
hah=DF.columns
#difference = [item for item in huh if item not in hah]
#print(difference)
#output_adjusted_to_non_finacial=['AmbKonsT', 'pMRI_AMB', 'pMRI_STAT', 'pCT_AMB', 'pCT_STAT', 'pANGIO_AMB', 'pANGIO_STAT', 'pDIA_AMB', 'pDIA_STAT', 'AmbKonsA', 'AmbKonsP', 'AmbKonsR', 'AmbKonsB', 'LA_Amb', 'LA_Stat']
#0_addable=[ 'AmbKonsT',
#fix_per_activity=['AmbKons',
#onehot_encode=['LA']
#print(DF2.shape, DF3.shape, DF4.shape)
#DF.tail(15)
#For DF3:
ho=DF4.columns
ha=DF3.columns
difference=[item for item in ho if item not in ha]
print(difference)
#DF3.head(15)
print(huh)
print(*DF.columns)

['SA_Angio', 'SA_Dia']
Index(['JAHR', 'KT', 'Inst', 'Adr', 'Ort', 'Typ', 'AnzStand', 'RForm',
       'AmbKonsT', 'PtageStatMST',
       ...
       'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH', 'RForm_R1',
       'RForm_R2', 'RForm_R3', 'RForm_R4'],
      dtype='object', length=192)
JAHR KT Inst Adr Ort Typ AnzStand RForm PtageStatMST AustStatMST NeugStatMST Ops Gebs CMIb CMIn pPatWAU pPatWAK pPatLKP pPatHOK PersA PersP PersMT PersT StdBelA StdBelP AwBesold AwInvest AwSonst AwT EtMedL EtSonst EtSubv FiErg PtageStatMSA AustStatMSA BettenStatA DADStatMSA pBettenBelStatA KostAmbA KostOKPAmbA KostStatA KostKVGStatA AnlKVGStatA KostZvOKPStatA AnlZvOKPStatA ErlOKPAmbA ErlKVGStatA ErlKVGStatVA ErlZvOKPStatA ErlZvOKPStatVA PtageStatMSP AustStatMSP BettenStatP DADStatMSP pBettenBelStatP KostAmbP KostOKPAmbP KostStatP KostKVGStatP AnlKVGStatP KostZvOKPStatP AnlZvOKPStatP ErlOKPAmbP ErlKVGStatP ErlKVGStatVP ErlZvOKPStatP ErlZvOKPStatVP PtageStatMSR AustStatMSR BettenStatR DADStatMSR pBette

In [4]:
yearsDF4=[2010 ,2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
def fill_target(df, target, years):
    for year in years:
        mask = df['JAHR'] == year
        if df.loc[mask, target].eq(0).all():
            next_year = next((y for y in years if not df.loc[df['JAHR'] == y, target].eq(0).all()), None)
            if next_year:
                for i, row in df.loc[mask].iterrows():
                    mask2 = (df['Inst'] == row['Inst']) & (df['Adr'] == row['Adr'])
                    if df.loc[mask2 & (df['JAHR'] == next_year), target].eq(1).any():
                        df.loc[i, target] = 1
    
    return df
list_of_targets1=['SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs','WB_MSt']
list_of_targets2=['SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs','WB_MSt','LA_Amb', 'LA_Stat']
list_of_targets3=['SL_IPS', 'SL_NF', 'SA_CC', 'SA_CT', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs','WB_MSt']
list_of_targets4=['SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs','WB_MSt']
DF_S=['DF', 'DF2', 'DF3', 'DF4']

yearsDF=[2014, 2015, 2016, 2017, 2018, 2019, 2020]
yearsDF2=[2016, 2017, 2018, 2019, 2020]
yearsDF3=[2010 ,2011, 2012, 2013]
yearsDF4=[2010 ,2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

for var in list_of_targets1:
    DF=fill_target(DF, var, yearsDF)
for var in list_of_targets2:
    DF2=fill_target(DF2, var, yearsDF2)
for var in list_of_targets3:
    DF3=fill_target(DF3, var,yearsDF3)
for var in list_of_targets4:
    DF4=fill_target(DF4, var, yearsDF4)
#DF4=fill_target(DF4,'SA_Angio', yearsDF4)

In [19]:
DF.head(30)

Unnamed: 0,JAHR,KT,Inst,Adr,Ort,Typ,AnzStand,RForm,PtageStatMST,AustStatMST,NeugStatMST,Ops,Gebs,CMIb,CMIn,pPatWAU,pPatWAK,pPatLKP,pPatHOK,PersA,PersP,PersMT,PersT,StdBelA,StdBelP,AwBesold,AwInvest,AwSonst,AwT,EtMedL,EtSonst,EtSubv,FiErg,PtageStatMSA,AustStatMSA,BettenStatA,DADStatMSA,pBettenBelStatA,KostAmbA,KostOKPAmbA,KostStatA,KostKVGStatA,AnlKVGStatA,KostZvOKPStatA,AnlZvOKPStatA,ErlOKPAmbA,ErlKVGStatA,ErlKVGStatVA,ErlZvOKPStatA,ErlZvOKPStatVA,PtageStatMSP,AustStatMSP,BettenStatP,DADStatMSP,pBettenBelStatP,KostAmbP,KostOKPAmbP,KostStatP,KostKVGStatP,AnlKVGStatP,KostZvOKPStatP,AnlZvOKPStatP,ErlOKPAmbP,ErlKVGStatP,ErlKVGStatVP,ErlZvOKPStatP,ErlZvOKPStatVP,PtageStatMSR,AustStatMSR,BettenStatR,DADStatMSR,pBettenBelStatR,KostAmbR,KostOKPAmbR,KostStatR,KostKVGStatR,AnlKVGStatR,KostZvOKPStatR,AnlZvOKPStatR,ErlOKPAmbR,ErlKVGStatR,ErlKVGStatVR,ErlZvOKPStatR,ErlZvOKPStatVR,PtageStatMSB,AustStatMSB,BettenStatB,DADStatMSB,pBettenBelStatB,KostAmbB,KostOKPAmbB,KostStatB,KostKVGStatB,AnlKVGStatB,KostZvOKPStatB,AnlZvOKPStatB,ErlOKPAmbB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,Akt_A,Akt_B,Akt_P,Akt_R,SL_IPS,SL_NF,SA_Angio,SA_CC,SA_CT,SA_Dia,SA_LB,SA_Lito,SA_MRI,SA_PET,WB_Arzt,WB_BGs,WB_MSt,Typ_K111,Typ_K112,Typ_K121,Typ_K122,Typ_K123,Typ_K211,Typ_K212,Typ_K221,Typ_K231,Typ_K232,Typ_K233,Typ_K234,Typ_K235,KT_AG,KT_AI,KT_AR,KT_BE,KT_BL,KT_BS,KT_FR,KT_GE,KT_GL,KT_GR,KT_JU,KT_LU,KT_NE,KT_NW,KT_OW,KT_SG,KT_SH,KT_SO,KT_SZ,KT_TG,KT_TI,KT_UR,KT_VD,KT_VS,KT_ZG,KT_ZH,RForm_R1,RForm_R2,RForm_R3,RForm_R4
1819,2014,AG,Kantonsspital Aarau AG,Tellstrasse 15,5000 Aarau,K112,1.0,R1,159012.0,26426.0,1405.0,17.0,5.0,1.15,1.134,0.65,11.46,17.04,92.94,574.68,1350.82,343.28,3208.25,0.0,0.0,373864723.0,38427832.0,178070865.0,591171786.0,509389867.0,35835673.0,8767870.0,-30823598.0,159012.0,26426.0,507.0,6.0,82.9,211143508.0,211143508.0,372087665.0,265237018.0,29742928.0,85602605.0,7978552.0,0.0,216480401.0,56.6,82930175.0,75.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1820,2014,AG,Kantonsspital Baden AG,Im Ergel,5404 Baden,K112,4.0,R1,106526.0,18594.0,1542.0,8.0,6.0,0.928,0.922,0.61,4.47,20.76,95.52,278.23,734.76,194.12,1706.8,9695.0,0.0,184347426.0,15677737.0,88457963.0,298407910.0,284508743.0,22932042.0,4462063.0,23736533.0,106526.0,18594.0,356.224657,5.7,76.8,93247547.0,88259750.0,199956216.0,139246935.0,18481278.0,43304170.0,5778606.0,85315452.0,125571708.0,50.7,39294016.0,51.9,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1821,2014,AG,Hirslanden Klinik Aarau,Schänisweg 1,5000 Aarau,K112,1.0,R1,47599.0,9921.0,674.0,7.0,2.0,1.196,1.182,0.46,17.8,48.99,91.86,10.94,304.23,69.77,535.88,97652.0,18950.0,54866431.0,25470066.0,52896703.0,133234031.0,131602067.0,8572204.0,114081.0,7054321.0,47599.0,9921.0,143.189041,4.8,85.5,24950577.0,22775938.0,107415510.0,46008139.0,3768132.0,54510578.0,4555482.0,18770953.0,52637728.0,51.0,57058755.0,51.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,0,1,0,1,1,1,1,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1822,2014,AG,Gesundheitszentrum Fricktal,Riburgerstrasse 12,4310 Rheinfelden,K121,3.0,R1,35356.0,7666.0,382.0,6.0,3.0,0.795,0.745,7.04,7.57,22.64,90.05,74.9,229.13,59.45,535.03,0.0,0.0,58322403.0,4251881.0,22452229.0,85026513.0,79896871.0,5712222.0,1122038.0,1704618.0,35356.0,7666.0,138.167123,4.6,66.6,21367309.0,18541480.0,61845049.0,39493289.0,2943949.0,16106681.0,995948.0,14130601.0,37514810.0,51.0,18770549.0,51.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1823,2014,AG,Kreisspital für das Freiamt,Spitalstrasse 144,5630 Muri AG,K121,1.0,R2,33713.0,7767.0,516.0,4.0,2.0,0.812,0.782,0.38,6.36,16.47,90.67,71.45,241.27,63.8,492.63,264.0,0.0,54072375.0,5474959.0,26530958.0,86309923.0,83098118.0,5208087.0,1583704.0,3581159.0,33713.0,7767.0,114.0,4.3,75.7,20935228.0,18437023.0,62232527.0,45971259.0,4240190.0,10113310.0,1114794.0,15782381.0,43296898.0,51.4,12636109.0,61.5,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,1,0,0,1,1,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1824,2014,AG,Spital Zofingen AG,Mühlethalstrasse 27,4800 Zofingen,K122,1.0,R1,25482.0,5240.0,0.0,4.0,0.0,0.892,0.836,0.44,12.04,14.08,90.32,51.69,181.56,93.69,473.49,,,44822122.0,2661309.0,21380789.0,68864220.0,63050027.0,4656887.0,494435.0,-662872.0,25482.0,5240.0,84.0,4.9,83.2,19786092.0,17152704.0,51021579.0,38626950.0,3533299.0,8850512.0,732082.0,13350769.0,34115275.0,51.0,9494323.0,61.5,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1825,2014,AG,Asana Gruppe AG Spital Leuggern,Kommendeweg,5316 Leuggern,K122,1.0,R1,16232.0,3797.0,606.0,2.0,3.0,0.752,0.726,10.87,2.85,10.79,90.78,9.25,104.39,9.58,163.02,4999.0,2238.0,17915607.0,2295109.0,11819351.0,32030067.0,28796747.0,2382449.0,178578.0,337669.0,16232.0,3797.0,38.591781,4.3,96.6,3407581.0,0.0,22752339.0,17075878.0,1054047.0,3734228.0,211806.0,0.0,19415419.0,50.0,4965547.0,63.3,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1826,2014,AG,Asana Gruppe AG Spital Menziken,Spitalstrasse 1,5737 Menziken,K122,1.0,R1,15492.0,3443.0,177.0,3.0,2.0,0.76,0.729,0.29,6.8,12.0,90.28,25.42,91.57,28.63,225.14,4254.0,6651.0,20730967.0,1606813.0,10309743.0,32647523.0,28542940.0,2544316.0,162708.0,-369815.0,15492.0,3443.0,51.0,4.5,78.2,2015428.0,0.0,24897212.0,18414744.0,707532.0,4617754.0,146586.0,0.0,17269099.0,50.0,3718669.0,81.3,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1827,2014,AG,Klinik Villa im Park AG,Bernstrasse 84,4852 Rothrist,K122,1.0,R1,8888.0,2674.0,624.0,2.0,2.0,0.681,0.659,0.04,52.53,14.1,90.73,0.0,48.43,6.9,82.41,5708.0,0.0,6826094.0,2198898.0,8207219.0,17354735.0,17316918.0,739582.0,0.0,701764.0,8888.0,2674.0,32.0,3.3,54.7,525765.0,525765.0,17500073.0,13067955.0,537680.0,1976910.0,81374.0,0.0,12458195.0,51.0,1881212.0,51.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1828,2014,AG,Psychiatrische Dienste Aargau AG,Zürcherstrasse 241,5210 Windisch,K211,11.0,R1,116308.0,3431.0,0.0,0.0,0.0,,,0.44,4.15,5.66,99.7,101.43,319.4,158.86,846.74,,,89246701.0,11024162.0,27054777.0,128838758.0,116488445.0,6642461.0,7037800.0,3476796.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,116308.0,3431.0,322.0,33.9,100.2,34526350.0,34526350.0,86665943.0,68049228.0,9893181.0,5959861.0,796451.0,19681066.0,66974617.0,52.0,6612195.0,52.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [5]:
get_missing_values_df(DF2)
# display the resulting DataFrame
get_missing_values_df(DF4)

#years = [2010 ,2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
#for year in years:
#    df_year = DF4[DF4['JAHR'] == year]
#    count_0 = len(df_year[df_year['SA_Dia'] == 0])
#    count_1 = len(df_year[df_year['SA_Dia'] == 1])
#    print(f"In year {year}, there are {count_0} zeros and {count_1} ones in the SA_Dia column.")

Unnamed: 0,year,count,JAHR,KT,Inst,Adr,Ort,Typ,AnzStand,Ops,Gebs,CMIb,CMIn,pPatWAU,pPatWAK,pPatLKP,pPatHOK,PersA,PersP,PersMT,PersT,AwBesold,AwInvest,AwSonst,AwT,EtMedL,EtSonst,EtSubv,FiErg,BettenStatA,pBettenBelStatA,KostAmbA,KostOKPAmbA,KostStatA,KostKVGStatA,AnlKVGStatA,KostZvOKPStatA,AnlZvOKPStatA,ErlOKPAmbA,ErlKVGStatA,ErlKVGStatVA,ErlZvOKPStatA,ErlZvOKPStatVA,BettenStatP,pBettenBelStatP,KostAmbP,KostOKPAmbP,KostStatP,KostKVGStatP,AnlKVGStatP,KostZvOKPStatP,AnlZvOKPStatP,ErlOKPAmbP,ErlKVGStatP,ErlKVGStatVP,ErlZvOKPStatP,ErlZvOKPStatVP,BettenStatR,pBettenBelStatR,KostAmbR,KostOKPAmbR,KostStatR,KostKVGStatR,AnlKVGStatR,KostZvOKPStatR,AnlZvOKPStatR,ErlOKPAmbR,ErlKVGStatR,ErlKVGStatVR,ErlZvOKPStatR,ErlZvOKPStatVR,BettenStatB,pBettenBelStatB,KostAmbB,KostOKPAmbB,KostStatB,KostKVGStatB,AnlKVGStatB,KostZvOKPStatB,AnlZvOKPStatB,ErlOKPAmbB,ErlKVGStatB,ErlKVGStatVB,ErlZvOKPStatB,ErlZvOKPStatVB,Akt_A,Akt_B,Akt_P,Akt_R,SL_IPS,SL_NF,SA_Angio,SA_CC,SA_CT,SA_Dia,SA_LB,SA_Lito,SA_MRI,SA_PET,WB_Arzt,WB_BGs,WB_MSt,Typ_K111,Typ_K112,Typ_K121,Typ_K122,Typ_K123,Typ_K211,Typ_K212,Typ_K221,Typ_K231,Typ_K232,Typ_K233,Typ_K234,Typ_K235,KT_AG,KT_AI,KT_AR,KT_BE,KT_BL,KT_BS,KT_FR,KT_GE,KT_GL,KT_GR,KT_JU,KT_LU,KT_NE,KT_NW,KT_OW,KT_SG,KT_SH,KT_SO,KT_SZ,KT_TG,KT_TI,KT_UR,KT_VD,KT_VS,KT_ZG,KT_ZH
0,2010,300,300,300,300,279,300,300,300,297,298,158,158,286,285,286,286,293,293,299,293,298,299,299,299,296,299,299,299,299,176,299,299,299,299,299,299,299,299,299,137,299,129,299,71,299,299,299,299,299,299,299,299,299,60,299,34,299,82,299,299,299,299,299,299,299,299,299,69,299,48,299,13,299,299,299,299,299,299,299,299,299,3,299,4,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
1,2011,300,300,300,300,280,300,300,300,299,299,167,167,296,296,296,296,299,299,299,299,299,299,299,299,296,299,299,299,299,177,299,299,299,299,299,299,299,299,299,141,299,140,299,70,299,299,299,299,299,299,299,299,299,63,299,34,299,83,299,299,299,299,299,299,299,299,299,75,299,57,299,13,299,299,299,299,299,299,299,299,299,5,299,4,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300,300
2,2012,298,298,298,298,279,298,298,298,297,297,165,165,292,292,292,292,291,291,291,291,297,297,297,297,297,297,297,297,174,171,174,174,174,174,174,174,174,174,174,154,174,147,72,71,72,72,72,72,72,72,72,72,72,65,72,35,89,86,89,89,89,89,89,89,89,89,89,76,89,59,20,16,20,20,20,20,20,20,20,20,20,12,20,5,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298,298
3,2013,293,293,293,293,277,293,293,293,289,287,164,164,286,286,286,286,286,286,286,286,290,291,291,291,291,291,291,291,175,170,175,175,175,175,175,175,175,175,175,156,175,150,78,74,78,78,78,78,78,78,78,78,78,72,78,42,93,83,93,93,93,93,93,93,93,93,93,76,93,61,18,14,18,18,18,18,18,18,18,18,18,11,18,4,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293
4,2014,289,289,289,289,273,289,289,289,286,285,160,160,284,284,284,284,285,285,285,285,288,288,288,288,285,288,288,288,168,167,168,168,168,168,168,168,168,168,168,158,168,153,76,74,76,76,76,76,76,76,76,76,76,72,76,41,93,87,93,93,93,93,93,93,93,93,93,86,93,72,18,14,18,18,18,18,18,18,18,18,18,13,18,4,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289,289
5,2015,288,288,288,288,275,288,288,288,283,282,156,156,284,284,284,284,282,282,282,282,285,285,285,285,285,285,285,285,168,166,168,168,168,168,168,168,168,168,168,159,168,155,76,74,76,76,76,76,76,76,76,76,76,72,76,44,93,90,93,93,93,93,93,93,93,93,93,89,93,75,14,14,18,18,18,18,18,18,18,18,18,13,18,5,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288,288
6,2016,283,283,283,283,273,283,283,283,283,283,155,155,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,166,164,166,166,166,166,166,166,166,166,166,156,166,153,77,75,77,77,77,77,77,77,77,77,77,73,77,46,96,94,96,96,96,96,96,96,96,96,96,92,96,78,14,14,14,14,14,14,14,14,14,14,14,14,14,5,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283
7,2017,281,281,281,281,272,281,281,281,280,280,155,155,280,280,280,280,280,280,280,280,280,280,280,280,280,280,280,280,164,162,164,164,164,164,164,164,164,164,164,154,164,150,76,74,76,76,76,76,76,76,76,76,76,71,76,45,96,93,96,96,96,96,96,96,96,96,96,91,96,79,15,15,15,15,15,15,15,15,15,15,15,15,15,6,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281
8,2018,281,281,281,281,272,281,281,281,279,279,154,154,278,278,278,278,279,279,279,279,279,279,279,279,279,279,279,279,163,159,163,163,163,163,163,163,163,163,163,151,163,149,78,75,78,78,78,78,78,78,78,78,78,74,78,45,95,91,95,95,95,95,95,95,95,95,95,89,95,79,15,15,15,15,15,15,15,15,15,15,15,15,15,5,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281
9,2019,281,281,281,281,273,281,281,281,278,278,157,157,277,277,277,277,278,278,278,278,278,278,278,278,278,278,278,278,166,162,166,166,166,166,166,166,166,166,166,154,166,152,76,74,76,76,76,76,76,76,76,76,76,73,76,48,91,88,91,91,91,91,91,91,91,91,91,85,91,77,16,16,16,16,16,16,16,16,16,16,16,16,16,6,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281


In [6]:
############## R E L E V A N T   C O D E

list_of_fixable = ['PtageStatMS', 'AustStatMS', 'BettenStat', 'DADStatMS', 'KostAmb', 'KostOKPAmb', 'KostStat', 'KostKVGStat', 'AnlKVGStat', 'KostZvOKPStat', 'AnlZvOKPStat', 'ErlOKPAmb', 'ErlKVGStat']
list_of_bools = ['Akt_A', 'Akt_B', 'Akt_P', 'Akt_R']
list_of_fixable_A = [element + 'A' for element in list_of_fixable]
list_of_fixable_P = [element + 'P' for element in list_of_fixable]
list_of_fixable_R = [element + 'R' for element in list_of_fixable]
list_of_fixable_B = [element + 'A' for element in list_of_fixable]
matrix_with_indices = np.zeros((len(DF), len(list_of_fixable)))
for var in range(len(list_of_fixable)):
    for i in range(len(DF)):
        if DF['Akt_A'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_A[var]][i+1819]):
            matrix_with_indices[i,var] += 1
        elif DF['Akt_P'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_P[var]][i+1819]):
            matrix_with_indices[i,var] += 1
        elif DF['Akt_R'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_R[var]][i+1819]):
            matrix_with_indices[i,var] += 1
        elif DF['Akt_B'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_B[var]][i+1819]):
            matrix_with_indices[i,var] += 1
        if not ((DF['Akt_A'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_A[var]][i+1819])) or (DF['Akt_P'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_P[var]][i+1819])) or (DF['Akt_R'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_R[var]][i+1819])) or (DF['Akt_B'][i+1819] == 1 and pd.notnull(DF[list_of_fixable_B[var]][i+1819]))):
            matrix_with_indices[i,var] -= 0
np.set_printoptions(threshold=np.inf)
#print(matrix_with_indices)
############## R E L E V A N T   C O D E


for var in range(len(list_of_fixable)):
    for i in range(len(DF)):
        if matrix_with_indices[i,var] == 1:
            if pd.isnull(DF.loc[i+1819, list_of_fixable_A[var]]):
                DF.loc[i+1819, list_of_fixable_A[var]] = 0
            if pd.isnull(DF.loc[i+1819, list_of_fixable_P[var]]):
                DF.loc[i+1819, list_of_fixable_P[var]] = 0
            if pd.isnull(DF.loc[i+1819, list_of_fixable_R[var]]):
                DF.loc[i+1819, list_of_fixable_R[var]] = 0
            if pd.isnull(DF.loc[i+1819, list_of_fixable_B[var]]):
                DF.loc[i+1819, list_of_fixable_B[var]] = 0

In [7]:
#do same fixing for DF2 with AmbKons added to list
list_of_fixable2 = ['AmbKons','PtageStatMS', 'AustStatMS', 'BettenStat', 'DADStatMS', 'KostAmb', 'KostOKPAmb', 'KostStat', 'KostKVGStat', 'AnlKVGStat', 'KostZvOKPStat', 'AnlZvOKPStat', 'ErlOKPAmb', 'ErlKVGStat']
list_of_bools2 = ['Akt_A', 'Akt_B', 'Akt_P', 'Akt_R']
list_of_fixable_A2 = [element + 'A' for element in list_of_fixable2]
list_of_fixable_P2 = [element + 'P' for element in list_of_fixable2]
list_of_fixable_R2 = [element + 'R' for element in list_of_fixable2]
list_of_fixable_B2 = [element + 'B' for element in list_of_fixable2]
matrix_with_indices2 = np.zeros((len(DF2), len(list_of_fixable2)))
for var in range(len(list_of_fixable2)):
    for i in range(len(DF2)):
        if DF2['Akt_A'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_A2[var]][i+2396]):
            matrix_with_indices2[i,var] += 1
        elif DF2['Akt_P'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_P2[var]][i+2396]):
            matrix_with_indices2[i,var] += 1
        elif DF2['Akt_R'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_R2[var]][i+2396]):
            matrix_with_indices2[i,var] += 1
        elif DF2['Akt_B'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_B2[var]][i+2396]):
            matrix_with_indices2[i,var] += 1
        if not ((DF2['Akt_A'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_A2[var]][i+2396])) or (DF2['Akt_P'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_P2[var]][i+2396])) or (DF2['Akt_R'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_R2[var]][i+2396])) or (DF2['Akt_B'][i+2396] == 1 and pd.notnull(DF2[list_of_fixable_B2[var]][i+2396]))):
            matrix_with_indices2[i,var] -= 0
np.set_printoptions(threshold=np.inf)
#print(matrix_with_indices)
############## R E L E V A N T   C O D E


for var in range(len(list_of_fixable2)):
    for i in range(len(DF2)):
        if matrix_with_indices2[i,var] == 1:
            if pd.isnull(DF2.loc[i+2396, list_of_fixable_A2[var]]):
                DF2.loc[i+2396, list_of_fixable_A2[var]] = 0
            if pd.isnull(DF2.loc[i+2396, list_of_fixable_P2[var]]):
                DF2.loc[i+2396, list_of_fixable_P2[var]] = 0
            if pd.isnull(DF2.loc[i+2396, list_of_fixable_R2[var]]):
                DF2.loc[i+2396, list_of_fixable_R2[var]] = 0
            if pd.isnull(DF2.loc[i+2396, list_of_fixable_B2[var]]):
                DF2.loc[i+2396, list_of_fixable_B2[var]] = 0

In [8]:
#same action for DF3

list_of_fixable3 = ['PtageStat', 'AustStat', 'BettenStat', 'DADStat', 'KostAmb', 'KostOKPAmb', 'KostStat', 'KostKVGStat', 'AnlKVGStat', 'KostZvOKPStat', 'AnlZvOKPStat', 'KostLang', 'AuftragLF', 'ErlAmb', 'ErlOKPAmb', 'ErlStat', 'ErlKVGStat', 'ErlZvOKPStat', 'ErlLang']
list_of_bools3 = ['Akt_A', 'Akt_B', 'Akt_P', 'Akt_R']
list_of_fixable_A3 = [element + 'A' for element in list_of_fixable3]
list_of_fixable_P3 = [element + 'P' for element in list_of_fixable3]
list_of_fixable_R3 = [element + 'R' for element in list_of_fixable3]
list_of_fixable_B3 = [element + 'B' for element in list_of_fixable3]
matrix_with_indices3 = np.zeros((len(DF3), len(list_of_fixable3)))
for var in range(len(list_of_fixable3)):
    for i in range(len(DF3)):
        if DF3['Akt_A'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_A3[var]][i+628]):
            matrix_with_indices3[i,var] += 1
        elif DF3['Akt_P'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_P3[var]][i+628]):
            matrix_with_indices3[i,var] += 1
        elif DF3['Akt_R'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_R3[var]][i+628]):
            matrix_with_indices3[i,var] += 1
        elif DF3['Akt_B'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_B3[var]][i+628]):
            matrix_with_indices3[i,var] += 1
        if not ((DF3['Akt_A'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_A3[var]][i+628])) or (DF3['Akt_P'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_P3[var]][i+628])) or (DF3['Akt_R'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_R3[var]][i+628])) or (DF3['Akt_B'][i+628] == 1 and pd.notnull(DF3[list_of_fixable_B3[var]][i+628]))):
            matrix_with_indices3[i,var] -= 0
np.set_printoptions(threshold=np.inf)

for var in range(len(list_of_fixable3)):
    for i in range(len(DF3)):
        if matrix_with_indices3[i,var] == 1:
            if pd.isnull(DF3.loc[i+628, list_of_fixable_A3[var]]):
                DF3.loc[i+628, list_of_fixable_A3[var]] = 0
            if pd.isnull(DF3.loc[i+628, list_of_fixable_P3[var]]):
                DF3.loc[i+628, list_of_fixable_P3[var]] = 0
            if pd.isnull(DF3.loc[i+628, list_of_fixable_R3[var]]):
                DF3.loc[i+628, list_of_fixable_R3[var]] = 0
            if pd.isnull(DF3.loc[i+628, list_of_fixable_B3[var]]):
                DF3.loc[i+628, list_of_fixable_B3[var]] = 0


In [9]:
#same action for DF4

list_of_fixable4 = ['BettenStat', 'KostAmb', 'KostOKPAmb', 'KostStat', 'KostKVGStat', 'AnlKVGStat', 'KostZvOKPStat', 'AnlZvOKPStat', 'ErlOKPAmb', 'ErlKVGStat', 'ErlZvOKPStat']
list_of_bools4 = ['Akt_A', 'Akt_B', 'Akt_P', 'Akt_R']
list_of_fixable_A4 = [element + 'A' for element in list_of_fixable4]
list_of_fixable_P4 = [element + 'P' for element in list_of_fixable4]
list_of_fixable_R4 = [element + 'R' for element in list_of_fixable4]
list_of_fixable_B4 = [element + 'B' for element in list_of_fixable4]
matrix_with_indices4 = np.zeros((len(DF4), len(list_of_fixable4)))
for var in range(len(list_of_fixable4)):
    for i in range(len(DF4)):
        if DF4['Akt_A'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_A4[var]][i+628]):
            matrix_with_indices4[i,var] += 1
        elif DF4['Akt_P'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_P4[var]][i+628]):
            matrix_with_indices4[i,var] += 1
        elif DF4['Akt_R'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_R4[var]][i+628]):
            matrix_with_indices4[i,var] += 1
        elif DF4['Akt_B'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_B4[var]][i+628]):
            matrix_with_indices4[i,var] += 1
        if not ((DF4['Akt_A'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_A4[var]][i+628])) or (DF4['Akt_P'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_P4[var]][i+628])) or (DF4['Akt_R'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_R4[var]][i+628])) or (DF4['Akt_B'][i+628] == 1 and pd.notnull(DF4[list_of_fixable_B4[var]][i+628]))):
            matrix_with_indices4[i,var] -= 0
np.set_printoptions(threshold=np.inf)

for var in range(len(list_of_fixable4)):
    for i in range(len(DF4)):
        if matrix_with_indices4[i,var] == 1:
            if pd.isnull(DF4.loc[i+628, list_of_fixable_A4[var]]):
                DF4.loc[i+628, list_of_fixable_A4[var]] = 0
            if pd.isnull(DF4.loc[i+628, list_of_fixable_P4[var]]):
                DF4.loc[i+628, list_of_fixable_P4[var]] = 0
            if pd.isnull(DF4.loc[i+628, list_of_fixable_R4[var]]):
                DF4.loc[i+628, list_of_fixable_R4[var]] = 0
            if pd.isnull(DF4.loc[i+628, list_of_fixable_B4[var]]):
                DF4.loc[i+628, list_of_fixable_B4[var]] = 0


In [10]:
cor_matrix=DF.corr()
cor_matrix2=DF2.corr()
cor_matrix3=DF3.corr()
cor_matrix4=DF4.corr()
print(cor_matrix2[['AwT','EtMedL','EtSonst','FiErg']])

                      AwT    EtMedL       EtSonst     FiErg
JAHR             0.019293  0.013891  4.591518e-02 -0.119990
AnzStand         0.202961  0.210476  1.705161e-01 -0.012772
AmbKonsT         0.950419  0.915201  8.848583e-01  0.052949
PtageStatMST     0.927867  0.878545  8.046253e-01  0.072412
AustStatMST      0.939143  0.958192  8.645889e-01  0.073366
NeugStatMST      0.798822  0.807383  7.539598e-01  0.048816
Ops              0.896674  0.923997  8.335171e-01  0.074132
Gebs             0.696810  0.726108  6.629180e-01  0.012728
CMIb             0.419542  0.447235  3.901154e-01  0.104640
CMIn             0.437820  0.460072  4.059283e-01  0.097093
pPatWAU         -0.044424 -0.061957 -3.816102e-02 -0.008490
pPatWAK         -0.167562 -0.175941 -1.252693e-01 -0.010664
pPatLKP         -0.081483 -0.079579 -6.482458e-02  0.061766
pPatHOK          0.017597  0.026261  2.691940e-02  0.007175
PersA            0.986333  0.954345  9.017481e-01  0.071516
PersP            0.987724  0.955493  8.7

In [11]:
#for 1st DF
#I ALSO REMOVED JAHR, ADR+ ORT IN THIS STEP BY REMOVING IT FROM ALL LISTS
list_non_financial_AwT=['KT', 'Inst', 'Typ', 'AnzStand', 'RForm', 'PtageStatMST', 'AustStatMST', 'NeugStatMST', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'StdBelA', 'StdBelP', 'AwT', 'PtageStatMSA', 'AustStatMSA', 'BettenStatA', 'DADStatMSA', 'pBettenBelStatA', 'PtageStatMSP', 'AustStatMSP', 'BettenStatP', 'DADStatMSP', 'pBettenBelStatP', 'PtageStatMSR', 'AustStatMSR', 'BettenStatR', 'DADStatMSR', 'pBettenBelStatR', 'PtageStatMSB', 'AustStatMSB', 'BettenStatB', 'DADStatMSB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG','KT_ZH','RForm_R1','RForm_R2','RForm_R3','RForm_R4']
list_non_financial_EtMedL=['KT', 'Inst', 'Typ', 'AnzStand', 'RForm', 'PtageStatMST', 'AustStatMST', 'NeugStatMST', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'StdBelA', 'StdBelP', 'EtMedL', 'PtageStatMSA', 'AustStatMSA', 'BettenStatA', 'DADStatMSA', 'pBettenBelStatA', 'PtageStatMSP', 'AustStatMSP', 'BettenStatP', 'DADStatMSP', 'pBettenBelStatP', 'PtageStatMSR', 'AustStatMSR', 'BettenStatR', 'DADStatMSR', 'pBettenBelStatR', 'PtageStatMSB', 'AustStatMSB', 'BettenStatB', 'DADStatMSB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG','KT_ZH','RForm_R1','RForm_R2','RForm_R3','RForm_R4']
list_non_financial_EtSonst=['KT', 'Inst', 'Typ', 'AnzStand', 'RForm', 'PtageStatMST', 'AustStatMST', 'NeugStatMST', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'StdBelA', 'StdBelP', 'EtSonst', 'PtageStatMSA', 'AustStatMSA', 'BettenStatA', 'DADStatMSA', 'pBettenBelStatA', 'PtageStatMSP', 'AustStatMSP', 'BettenStatP', 'DADStatMSP', 'pBettenBelStatP', 'PtageStatMSR', 'AustStatMSR', 'BettenStatR', 'DADStatMSR', 'pBettenBelStatR', 'PtageStatMSB', 'AustStatMSB', 'BettenStatB', 'DADStatMSB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG','KT_ZH','RForm_R1','RForm_R2','RForm_R3','RForm_R4']
DF_non_financial_AwT=DF[list_non_financial_AwT]
DF_non_financial_EtMedL=DF[list_non_financial_EtMedL]
DF_non_financial_EtSonst=DF[list_non_financial_EtSonst]
#for DF2
list_non_financial_DF2_AwT=['KT', 'Inst', 'Typ', 'AnzStand', 'RForm', 'PtageStatMST', 'AustStatMST', 'NeugStatMST', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'StdBelA', 'StdBelP', 'AwT', 'PtageStatMSA', 'AustStatMSA', 'BettenStatA', 'DADStatMSA', 'pBettenBelStatA', 'PtageStatMSP', 'AustStatMSP', 'BettenStatP', 'DADStatMSP', 'pBettenBelStatP', 'PtageStatMSR', 'AustStatMSR', 'BettenStatR', 'DADStatMSR', 'pBettenBelStatR', 'PtageStatMSB', 'AustStatMSB', 'BettenStatB', 'DADStatMSB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','AmbKonsT', 'pMRI_AMB', 'pMRI_STAT', 'pCT_AMB', 'pCT_STAT', 'pANGIO_AMB', 'pANGIO_STAT', 'pDIA_AMB', 'pDIA_STAT', 'AmbKonsA', 'AmbKonsP', 'AmbKonsR', 'AmbKonsB', 'LA_Amb', 'LA_Stat','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH','RForm_R1','RForm_R2','RForm_R3','RForm_R4']
list_non_financial_DF2_EtMedL=['KT', 'Inst', 'Typ', 'AnzStand', 'RForm', 'PtageStatMST', 'AustStatMST', 'NeugStatMST', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'StdBelA', 'StdBelP', 'EtMedL', 'PtageStatMSA', 'AustStatMSA', 'BettenStatA', 'DADStatMSA', 'pBettenBelStatA', 'PtageStatMSP', 'AustStatMSP', 'BettenStatP', 'DADStatMSP', 'pBettenBelStatP', 'PtageStatMSR', 'AustStatMSR', 'BettenStatR', 'DADStatMSR', 'pBettenBelStatR', 'PtageStatMSB', 'AustStatMSB', 'BettenStatB', 'DADStatMSB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','AmbKonsT', 'pMRI_AMB', 'pMRI_STAT', 'pCT_AMB', 'pCT_STAT', 'pANGIO_AMB', 'pANGIO_STAT', 'pDIA_AMB', 'pDIA_STAT', 'AmbKonsA', 'AmbKonsP', 'AmbKonsR', 'AmbKonsB', 'LA_Amb', 'LA_Stat','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH','RForm_R1','RForm_R2','RForm_R3','RForm_R4']
list_non_financial_DF2_EtSonst=['KT', 'Inst', 'Typ', 'AnzStand', 'RForm', 'PtageStatMST', 'AustStatMST', 'NeugStatMST', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'StdBelA', 'StdBelP', 'EtSonst', 'PtageStatMSA', 'AustStatMSA', 'BettenStatA', 'DADStatMSA', 'pBettenBelStatA', 'PtageStatMSP', 'AustStatMSP', 'BettenStatP', 'DADStatMSP', 'pBettenBelStatP', 'PtageStatMSR', 'AustStatMSR', 'BettenStatR', 'DADStatMSR', 'pBettenBelStatR', 'PtageStatMSB', 'AustStatMSB', 'BettenStatB', 'DADStatMSB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_Angio', 'SA_CC', 'SA_CT', 'SA_Dia', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','AmbKonsT', 'pMRI_AMB', 'pMRI_STAT', 'pCT_AMB', 'pCT_STAT', 'pANGIO_AMB', 'pANGIO_STAT', 'pDIA_AMB', 'pDIA_STAT', 'AmbKonsA', 'AmbKonsP', 'AmbKonsR', 'AmbKonsB', 'LA_Amb', 'LA_Stat','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH','RForm_R1','RForm_R2','RForm_R3','RForm_R4']
DF2_non_financial_AwT=DF2[list_non_financial_DF2_AwT]
DF2_non_financial_EtMedL=DF2[list_non_financial_DF2_EtMedL]
DF2_non_financial_EtSonst=DF2[list_non_financial_DF2_EtSonst]

#for DF3:
list_non_financial_DF3_AwT=['KT', 'Inst', 'Typ', 'AnzStand', 'PtageStatT', 'AustStatT', 'NeugStatT', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'PersAFall', 'PersPFall', 'PersMTFall', 'PersTFall', 'AwT', 'PtageStatA', 'AustStatA', 'BettenStatA', 'DADStatA', 'pBettenBelStatA', 'PtageStatP', 'AustStatP', 'BettenStatP', 'DADStatP', 'pBettenBelStatP', 'PtageStatR', 'AustStatR', 'BettenStatR', 'DADStatR', 'pBettenBelStatR', 'PtageStatB', 'AustStatB', 'BettenStatB', 'DADStatB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_CC', 'SA_CT', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH']
list_non_financial_DF3_EtMedL=['KT', 'Inst', 'Typ', 'AnzStand', 'PtageStatT', 'AustStatT', 'NeugStatT', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'PersAFall', 'PersPFall', 'PersMTFall', 'PersTFall', 'EtMedL', 'PtageStatA', 'AustStatA', 'BettenStatA', 'DADStatA', 'pBettenBelStatA', 'PtageStatP', 'AustStatP', 'BettenStatP', 'DADStatP', 'pBettenBelStatP', 'PtageStatR', 'AustStatR', 'BettenStatR', 'DADStatR', 'pBettenBelStatR', 'PtageStatB', 'AustStatB', 'BettenStatB', 'DADStatB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_CC', 'SA_CT', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH']
list_non_financial_DF3_EtSonst=['KT', 'Inst', 'Typ', 'AnzStand', 'PtageStatT', 'AustStatT', 'NeugStatT', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'PersAFall', 'PersPFall', 'PersMTFall', 'PersTFall', 'EtSonst', 'PtageStatA', 'AustStatA', 'BettenStatA', 'DADStatA', 'pBettenBelStatA', 'PtageStatP', 'AustStatP', 'BettenStatP', 'DADStatP', 'pBettenBelStatP', 'PtageStatR', 'AustStatR', 'BettenStatR', 'DADStatR', 'pBettenBelStatR', 'PtageStatB', 'AustStatB', 'BettenStatB', 'DADStatB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_CC', 'SA_CT', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH']
DF3_non_financial_AwT=DF3[list_non_financial_DF3_AwT]
DF3_non_financial_EtMedL=DF3[list_non_financial_DF3_EtMedL]
DF3_non_financial_EtSonst=DF3[list_non_financial_DF3_EtSonst]
#for DF4:
list_non_financial_DF4_AwT=['KT', 'Inst', 'Typ', 'AnzStand', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'AwT', 'BettenStatA', 'pBettenBelStatA', 'BettenStatP', 'pBettenBelStatP', 'BettenStatR', 'pBettenBelStatR', 'BettenStatB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_CC', 'SA_CT', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','SA_Angio','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH']
list_non_financial_DF4_EtMedL=['KT', 'Inst', 'Typ', 'AnzStand', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'EtMedL', 'BettenStatA', 'pBettenBelStatA', 'BettenStatP', 'pBettenBelStatP', 'BettenStatR', 'pBettenBelStatR', 'BettenStatB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_CC', 'SA_CT', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','SA_Angio','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH']
list_non_financial_DF4_EtSonst=['KT', 'Inst', 'Typ', 'AnzStand', 'Ops', 'Gebs', 'CMIb', 'CMIn', 'pPatWAU', 'pPatWAK', 'pPatLKP', 'pPatHOK', 'PersA', 'PersP', 'PersMT', 'PersT', 'EtSonst', 'BettenStatA', 'pBettenBelStatA', 'BettenStatP', 'pBettenBelStatP', 'BettenStatR', 'pBettenBelStatR', 'BettenStatB', 'pBettenBelStatB', 'Akt_A', 'Akt_B', 'Akt_P', 'Akt_R', 'SL_IPS', 'SL_NF', 'SA_CC', 'SA_CT', 'SA_LB', 'SA_Lito', 'SA_MRI', 'SA_PET', 'WB_Arzt', 'WB_BGs', 'WB_MSt','SA_Angio','Typ_K111', 'Typ_K112', 'Typ_K121', 'Typ_K122', 'Typ_K123', 'Typ_K211', 'Typ_K212', 'Typ_K221', 'Typ_K231', 'Typ_K232', 'Typ_K233', 'Typ_K234', 'Typ_K235', 'KT_AG', 'KT_AI', 'KT_AR', 'KT_BE', 'KT_BL', 'KT_BS', 'KT_FR', 'KT_GE', 'KT_GL', 'KT_GR', 'KT_JU', 'KT_LU', 'KT_NE', 'KT_NW', 'KT_OW', 'KT_SG', 'KT_SH', 'KT_SO', 'KT_SZ', 'KT_TG', 'KT_TI', 'KT_UR', 'KT_VD', 'KT_VS', 'KT_ZG', 'KT_ZH']
DF4_non_financial_AwT=DF4[list_non_financial_DF4_AwT]
DF4_non_financial_EtMedL=DF4[list_non_financial_DF4_EtMedL]
DF4_non_financial_EtSonst=DF4[list_non_financial_DF4_EtSonst]

In [12]:
RowsDF=[]
def drop_most_missing(df, rows):
    complete_dfs = []
    dropped_columns=[]
    for i in range(len(df.columns)):
        col_with_most_missing = df.isnull().sum().idxmax()
        print(1-(df[col_with_most_missing].isnull().sum()/len(df)))
        df = df.drop(col_with_most_missing, axis=1)
        print(f"Dropped column: {col_with_most_missing}")
        dropped_columns.append(col_with_most_missing)
        print(f"New shape: {df.shape}")
        complete_rows = df.dropna()
        print(f"Shape with complete rows only: {complete_rows.shape}")
        if i in rows:
            complete_dfs.append(complete_rows)
        print(i)
    return complete_dfs, dropped_columns

def drop_poorly_correlated(df, target, cor_matrix):
    new_df = df.copy()
    for i in new_df.columns:
        if i in cor_matrix.index:
            correlation = cor_matrix[target][i]
            if abs(correlation) < 0.5:
                new_df.drop(i, axis=1, inplace=True)
                print(i, correlation)
    print(new_df.shape)
    return new_df


In [13]:
#for DF
DF_non_financial_EtMedL=drop_poorly_correlated(DF_non_financial_EtMedL,'EtMedL',cor_matrix)
DF_non_financial_EtSonst=drop_poorly_correlated(DF_non_financial_EtSonst,'EtSonst',cor_matrix)
DF_non_financial_AwT=drop_poorly_correlated(DF_non_financial_AwT,'AwT',cor_matrix)
#ForDF2
DF2_non_financial_EtMedL=drop_poorly_correlated(DF2_non_financial_EtMedL,'EtMedL',cor_matrix2)
DF2_non_financial_EtSonst=drop_poorly_correlated(DF2_non_financial_EtSonst,'EtSonst',cor_matrix2)
DF2_non_financial_AwT=drop_poorly_correlated(DF2_non_financial_AwT,'AwT',cor_matrix2)
#for DF3
DF3_non_financial_EtMedL=drop_poorly_correlated(DF3_non_financial_EtMedL,'EtMedL',cor_matrix3)
DF3_non_financial_EtSonst=drop_poorly_correlated(DF3_non_financial_EtSonst,'EtSonst',cor_matrix3)
DF3_non_financial_AwT=drop_poorly_correlated(DF3_non_financial_AwT,'AwT',cor_matrix3)
#for DF4:
DF4_non_financial_EtMedL=drop_poorly_correlated(DF4_non_financial_EtMedL,'EtMedL',cor_matrix4)
DF4_non_financial_EtSonst=drop_poorly_correlated(DF4_non_financial_EtSonst,'EtSonst',cor_matrix4)
DF4_non_financial_AwT=drop_poorly_correlated(DF4_non_financial_AwT,'AwT',cor_matrix4)

AnzStand 0.19626862523331745
CMIb 0.4478294385920264
CMIn 0.46094899496310543
pPatWAU -0.03363629949946462
pPatWAK -0.08221350790721467
pPatLKP -0.03975093651418386
pPatHOK -0.022252713412512663
StdBelA 0.17485034981083214
StdBelP 0.314427963610644
DADStatMSA 0.08329300217801547
pBettenBelStatA 0.2681754545104151
PtageStatMSP 0.15750701204487855
AustStatMSP 0.19486114153182696
BettenStatP 0.15070834929845361
DADStatMSP 0.054373832848474836
pBettenBelStatP -0.09620162464747536
PtageStatMSR 0.2677431028370091
AustStatMSR 0.25747673296295304
BettenStatR 0.22769652647492503
DADStatMSR 0.09059526981885417
pBettenBelStatR 0.07766722840919156
Akt_A 0.30100079361547055
Akt_B -0.12267345097753425
Akt_P 0.08895720738707398
Akt_R 0.0999896998059204
SL_NF 0.41819542103572305
SA_CT 0.4965223949958335
WB_Arzt 0.30142638260117427
WB_BGs 0.1405726061697602
WB_MSt 0.3667293220250031
Typ_K112 0.4601777464148645
Typ_K121 0.018472688224806872
Typ_K122 -0.05795473895607456
Typ_K123 -0.09999699118332775
Typ

In [14]:
RowsAwT=[4]
complete_dfs1, dropped_columns = drop_most_missing(DF_non_financial_AwT, RowsAwT)
RowsEtMedL=[4]
complete_dfs2, dropped_columns2 = drop_most_missing(DF_non_financial_EtMedL, RowsEtMedL)
RowsEtSonst=[3]
complete_dfs3, dropped_columns3 = drop_most_missing(DF_non_financial_EtSonst, RowsEtSonst)

#for DF2
RowsAwT2=[1]
complete_dfs4, dropped_columns4 = drop_most_missing(DF2_non_financial_AwT, RowsAwT2)
RowsEtMedL2=[1]
complete_dfs5, dropped_columns5 = drop_most_missing(DF2_non_financial_EtMedL, RowsEtMedL2)
RowsEtSonst2=[1]
complete_dfs6, dropped_columns6 = drop_most_missing(DF2_non_financial_EtSonst, RowsEtSonst2)

#for DF3
RowsAwT3=[0]
complete_dfs7, dropped_columns7 = drop_most_missing(DF3_non_financial_AwT, RowsAwT3)
RowsEtMedL3=[0]
complete_dfs8, dropped_columns8 = drop_most_missing(DF3_non_financial_EtMedL, RowsEtMedL3)
RowsEtSonst3=[0]
complete_dfs9, dropped_columns9 = drop_most_missing(DF3_non_financial_EtSonst, RowsEtSonst3)
complete_dfs9[0]=DF3_non_financial_EtSonst
#for DF4
RowsAwT4=[0]
complete_dfs10, dropped_columns10 = drop_most_missing(DF4_non_financial_AwT, RowsAwT4)
RowsEtMedL4=[0]
complete_dfs11, dropped_columns11 = drop_most_missing(DF4_non_financial_EtMedL, RowsEtMedL4)
RowsEtSonst4=[0]
complete_dfs12, dropped_columns12 = drop_most_missing(DF4_non_financial_EtSonst, RowsEtSonst4)
complete_dfs12[0]=DF4_non_financial_EtSonst

0.05154118241536132
Dropped column: DADStatMSB
New shape: (1979, 29)
Shape with complete rows only: (0, 29)
0
0.05204648812531587
Dropped column: PtageStatMSB
New shape: (1979, 28)
Shape with complete rows only: (0, 28)
1
0.05204648812531587
Dropped column: AustStatMSB
New shape: (1979, 27)
Shape with complete rows only: (0, 27)
2
0.05204648812531587
Dropped column: pBettenBelStatB
New shape: (1979, 26)
Shape with complete rows only: (0, 26)
3
0.05406771096513385
Dropped column: BettenStatB
New shape: (1979, 25)
Shape with complete rows only: (1848, 25)
4
0.9388580090955028
Dropped column: BettenStatA
New shape: (1979, 24)
Shape with complete rows only: (1848, 24)
5
0.9403739262253663
Dropped column: PtageStatMSA
New shape: (1979, 23)
Shape with complete rows only: (1848, 23)
6
0.9403739262253663
Dropped column: AustStatMSA
New shape: (1979, 22)
Shape with complete rows only: (1950, 22)
7
0.9903991915108641
Dropped column: PtageStatMST
New shape: (1979, 21)
Shape with complete rows onl

In [15]:
#target=['AwT']
#def drop_arbitrary(df, cor, na, target):
#    missing_per_row=[]
#    dropped_columns=[]
#    for i in(df.columns):
#        fraction=(1 - (df[i].isnull().sum() / len(df)))
#        correlation=cor_matrix[target][i]
#        print('fraction',fraction, 'correlation',correlation)
#        if fraction<na or cor<correlation:
#            df.drop(i, axis=1)
#            print(i,'was dropped')
#            print(df.shape)
#            

In [16]:
########################### relevant code

complete_dfs1[0].head(15)
#complete_dfs2[0]
#complete_dfs3[0]

#for DF2:
complete_dfs4[0].head(15)
#complete_dfs5[0]
#complete_dfs6[0]

#forDF3:
complete_dfs7[0].head(15)
#complete_dfs8[0]
#complete_dfs9[0]

#fordf4:
complete_dfs10[0].head(15)
#complete_dfs11[0]
#complete_dfs12[0]

Unnamed: 0,KT,Inst,Typ,Ops,Gebs,PersA,PersP,PersMT,PersT,AwT,BettenStatA,SL_IPS,SA_CC,SA_LB,SA_Lito,SA_PET,SA_Angio,Typ_K111
628,AG,Kantonsspital Aarau AG,K112,14.0,5.0,473.17,1109.04,283.61,2714.99,489904988.0,572.0,1,1,1,1,1,1,0
629,AG,Kantonsspital Baden AG,K112,8.0,6.0,222.12,626.73,163.47,1455.52,251412062.0,355.6,1,1,0,1,0,1,0
630,AG,Hirslanden Klinik Aarau,K122,7.0,2.0,10.9,163.37,155.46,463.99,112870924.0,142.6,1,0,1,1,0,1,0
631,AG,Gesundheitszentrum Fricktal,K122,6.0,3.0,56.04,224.16,46.23,437.39,65831922.0,132.0,0,0,0,0,0,0,0
632,AG,Kreisspital für das Freiamt,K122,4.0,2.0,51.38,182.91,38.66,368.45,64823939.0,111.0,0,0,0,0,0,0,0
633,AG,Spital Zofingen AG,K122,3.0,2.0,41.62,185.2,44.56,363.09,55077637.0,81.0,1,0,0,0,0,0,0
634,AG,Asana Gruppe AG Spital Menziken,K123,3.0,2.0,23.44,70.63,35.37,214.23,30488018.0,57.0,0,0,0,0,0,0,0
635,AG,Asana Gruppe AG Spital Leuggern,K123,2.0,2.0,12.87,66.12,9.54,127.73,24774193.0,45.0,0,0,0,0,0,0,0
636,AG,Psychiatrische Dienste Aargau AG _ Psychiatris...,K211,0.0,0.0,48.39,210.49,11.02,552.19,85701082.0,0.0,0,0,0,0,0,0,0
637,AG,Schützen Rheinfelden AG Klinik & Ambulatorium,K212,0.0,0.0,17.87,41.35,24.61,96.48,23312723.0,0.0,0,0,0,0,0,0,0


In [17]:
complete_dfs1[0].head(15)

Unnamed: 0,KT,Inst,Typ,RForm,PtageStatMST,AustStatMST,NeugStatMST,Ops,Gebs,PersA,PersP,PersMT,PersT,AwT,PtageStatMSA,AustStatMSA,BettenStatA,SL_IPS,SA_Angio,SA_CC,SA_Dia,SA_LB,SA_Lito,SA_PET,Typ_K111
1819,AG,Kantonsspital Aarau AG,K112,R1,159012.0,26426.0,1405.0,17.0,5.0,574.68,1350.82,343.28,3208.25,591171786.0,159012.0,26426.0,507.0,1,1,1,1,1,1,1,0
1820,AG,Kantonsspital Baden AG,K112,R1,106526.0,18594.0,1542.0,8.0,6.0,278.23,734.76,194.12,1706.8,298407910.0,106526.0,18594.0,356.224657,1,1,1,1,0,1,1,0
1821,AG,Hirslanden Klinik Aarau,K112,R1,47599.0,9921.0,674.0,7.0,2.0,10.94,304.23,69.77,535.88,133234031.0,47599.0,9921.0,143.189041,1,1,0,1,1,1,0,0
1822,AG,Gesundheitszentrum Fricktal,K121,R1,35356.0,7666.0,382.0,6.0,3.0,74.9,229.13,59.45,535.03,85026513.0,35356.0,7666.0,138.167123,0,0,0,0,0,0,0,0
1823,AG,Kreisspital für das Freiamt,K121,R2,33713.0,7767.0,516.0,4.0,2.0,71.45,241.27,63.8,492.63,86309923.0,33713.0,7767.0,114.0,0,0,0,1,0,0,0,0
1824,AG,Spital Zofingen AG,K122,R1,25482.0,5240.0,0.0,4.0,0.0,51.69,181.56,93.69,473.49,68864220.0,25482.0,5240.0,84.0,1,0,0,0,0,0,0,0
1825,AG,Asana Gruppe AG Spital Leuggern,K122,R1,16232.0,3797.0,606.0,2.0,3.0,9.25,104.39,9.58,163.02,32030067.0,16232.0,3797.0,38.591781,0,0,0,0,0,0,0,0
1826,AG,Asana Gruppe AG Spital Menziken,K122,R1,15492.0,3443.0,177.0,3.0,2.0,25.42,91.57,28.63,225.14,32647523.0,15492.0,3443.0,51.0,0,0,0,0,0,0,0,0
1827,AG,Klinik Villa im Park AG,K122,R1,8888.0,2674.0,624.0,2.0,2.0,0.0,48.43,6.9,82.41,17354735.0,8888.0,2674.0,32.0,0,0,0,0,0,0,0,0
1828,AG,Psychiatrische Dienste Aargau AG,K211,R1,116308.0,3431.0,0.0,0.0,0.0,101.43,319.4,158.86,846.74,128838758.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0


In [18]:
#def drop_most_missing(df, rows, missing_fraction_threshold, correlation_threshold, target):
#    complete_dfs = []
#    dropped_columns = []
#    target = df.columns[0]
#    for i in range(len(df.columns)):
#        col_with_most_missing = df.isnull().sum().idxmax()
#        missing_fraction = 1 - (df[col_with_most_missing].isnull().sum() / len(df))
#        if missing_fraction > missing_fraction_threshold and correlation < correlation_threshold:
#           df = df.drop(col_with_most_missing, axis=1)
#            print(f"Dropped column: {col_with_most_missing}")
#            dropped_columns.append(col_with_most_missing)
#           print(f"New shape: {df.shape}")
#        complete_rows = df.dropna()
#        print(f"Shape with complete rows only: {complete_rows.shape}")
#        if i in rows:
#            complete_dfs.append(complete_rows)
#    return complete_dfs, dropped_column