In [270]:
import numpy as np
import pandas as pd
import os
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVR, SVR, SVC,LinearSVC
from sklearn.datasets import make_regression
from sklearn.preprocessing import normalize 


In [487]:
df_training_original = pd.read_csv('train_features.csv')
df_training_label = pd.read_csv('train_labels.csv')
all_pids = [pid for pid in df_training_original['pid'].unique()]


In [191]:
def getPatientData(trainingData, pids, patients=0, mode='pid'):
    if mode == 'number':
        pids = all_pids[:patients]
    if len(pids) == 0:
        return trainingData
    #pids = np.array(pids).astype(np.float)
    patients = [trainingData.iloc[idx] for idx in range(0, len(trainingData)) if trainingData['pid'][idx] in pids]    
    #patientTrainingDataIndex = [trainingData.iloc[idx] for idx, col in enumerate(trainingData) if trainingData['pid'][idx] in pids]    
    return pd.DataFrame(patients)

def partitionData(trainingDataPids, trainingPartition=80):
    validationPartition = 100 - trainingPartition
    countTraining = int((trainingPartition/100)*len(trainingDataPids))
    training = trainingDataPids[:countTraining]
    validation = trainingDataPids[countTraining:]
    print('')
    print('Training size: ' + str(countTraining))
    print('Validation size: ' + str(len(validation)))
    return training, validation

def populateData(X,Y):
    Z = pd.merge(X, Y, on='pid')
    YData = Z[Y.columns].iloc[:,1:]
    XData = Z[X.columns].iloc[:,1:]
    return XData, YData

In [488]:
df_training = df_training_original.copy()

# See how many values are missing in which rows
loss = df_training_original.isnull().any()
lossRow = df_training[df_training_original == loss].sum()
print('Number of missing values in data:')
print(lossRow[lossRow > 0].sort_values(ascending=False))

# Filter out certain rows with a lot of missing data
lossColumns = list(lossRow[lossRow > 100].index)
df_training = df_training.drop(columns=lossColumns)
#df_training_label.drop(columns=lossColumns, axis=0, inplace=True)

# Set all time data to be between 1-12
X_ALL = pd.DataFrame(columns = df_training.columns)
for pids in all_pids:
    df_trainingTemp = df_training[df_training['pid'] == pids]
    df_trainingTemp['Time'] = df_trainingTemp['Time'] - (df_trainingTemp['Time'].min() -1)
    X_ALL = pd.concat([X_ALL, df_trainingTemp])
    
df_training = X_ALL.copy()
# Partition data in training and validation
trainingPIDS, validationPIDS = partitionData(all_pids)
X_pid = getPatientData(df_training, trainingPIDS)
X_pid_val = getPatientData(df_training, validationPIDS)


Number of missing values in data:
FiO2                3246.0
BaseExcess          1267.0
Creatinine           837.0
Lactate              295.0
Bilirubin_total      219.0
RRate                101.0
Magnesium             59.0
Phosphate             59.0
Bilirubin_direct      19.0
Calcium               19.0
TroponinI             11.0
BUN                    9.0
WBC                    8.0
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Training size: 15196
Validation size: 3799


In [548]:
df_data_loss = pd.DataFrame(index=df_training.count().index,columns=['Entries', 'MissingEntries', 'Percentage Missing'])
#print(df_training_original.count())
# See how many values are missing in which rows
loss = df_training.isnull().any()
lossRow = df_training[df_training == loss].sum()
#print('Number of missing values in data:')
#print(lossRow[lossRow > 0].sort_values(ascending=False))
temp = lossRow[lossRow > 0].sort_values(ascending=False)
print(temp.index)
colNames = temp.index
entries = pd.DataFrame(df_training.isnull().count(), columns=['Entries'])
missingEntries = df_training.isnull().sum()
percentage = missingEntries /df_training.isnull().count()


Index(['Magnesium', 'Phosphate', 'Bilirubin_direct', 'Calcium', 'TroponinI',
       'BUN', 'WBC'],
      dtype='object')


In [550]:

df_data_total = df_data_loss.copy()
df_data_total['Entries'] = entries
df_data_total['MissingEntries'] = missingEntries
df_data_total['Percentage Missing'] = percentage

print(df_data_total.sort_values(by='Percentage Missing', ascending=True))

                  Entries  MissingEntries  Percentage Missing
pid                227940               0            0.000000
Time               227940               0            0.000000
Age                227940               0            0.000000
Heartrate          227940           27812            0.122015
ABPm               227940           32051            0.140612
SpO2               227940           32748            0.143669
ABPs               227940           36290            0.159209
ABPd               227940           75522            0.331324
Temp               227940          146825            0.644139
Glucose            227940          180904            0.793647
Potassium          227940          199547            0.875437
Hct                227940          200643            0.880245
pH                 227940          202894            0.890120
Hgb                227940          205645            0.902189
PaCO2              227940          206897            0.907682
BUN     

In [215]:
# Imputation
#X_pid = df_training
my_imputer = IterativeImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_pid))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_pid_val))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_pid.columns
imputed_X_valid_plus.columns = X_pid_val.columns

print(imputed_X_valid_plus.head())

In [237]:
# Only take first X patients
X_pid_train = getPatientData(imputed_X_train_plus, [], patients=1000, mode='number')
Y_pid_train = getPatientData(df_training_label, [], patients=1000, mode='number')
X_val = imputed_X_valid_plus
Y_val = getPatientData(df_training_label, validationPIDS)

# Make X and Y the same size, remove pid
X_train_proc, Y_train_proc = populateData(X_pid_train, Y_pid_train)
X_val_proc, Y_val_proc = populateData(X_val, Y_val)

# Normalize Data
norm_Xtrain = pd.DataFrame(normalize(X_train_proc, norm='max',axis=0))
norm_Xtrain.columns = X_train_proc.columns
norm_Xval = pd.DataFrame(normalize(X_val_proc, norm='max',axis=0))
norm_Xval.columns = X_val_proc.columns

In [311]:
print(X_train_proc.head())

   Time   Age      EtCO2        PTT        BUN       Temp        Hgb  \
0   1.0  34.0  33.367981  38.804622  12.000000  36.000000   8.700000   
1   2.0  34.0  33.608720  39.365101  23.133962  36.000000  10.641975   
2   3.0  34.0  34.271618  39.565520  23.133622  36.000000  10.641970   
3   4.0  34.0  34.554130  39.514079  23.133601  37.000000  10.642187   
4   5.0  34.0  34.396151  39.170251  23.133716  36.854604  10.640923   

        HCO3  Fibrinogen  Phosphate  ...   Calcium  Alkalinephos   SpO2  \
0  24.000000  215.585891   3.253653  ...  7.151633     96.077986  100.0   
1  22.954709  282.503029   3.753947  ...  7.171375    103.833333  100.0   
2  23.267328  277.311829   3.644657  ...  7.167266    102.677776  100.0   
3  23.393672  287.110658   3.614850  ...  7.168602    101.700022  100.0   
4  24.163537  248.346468   3.389598  ...  7.155465    100.148955  100.0   

   Bilirubin_direct    Chloride        Hct  Heartrate  TroponinI   ABPs    pH  
0          1.449792  114.000000  24.

In [246]:
regr = SVC(kernel='sigmoid', C=1.0)
regr.fit(np.array(norm_Xtrain), np.array(Y_train_proc.iloc[:,0]))




SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='sigmoid', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [247]:
f = regr.predict(norm_Xval)

In [249]:
print(np.array(f)[0:100])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [241]:
print(mean_absolute_error(f, Y_val_proc.iloc[:,0]))

0.265


In [431]:
## Trying a different approach ##


X_pid_train = pd.DataFrame(columns=['pid','Age', 'Temp1','Temp2','Temp3','Temp4','Temp5','Temp6',
                              'HR1','HR2','HR3','HR4','HR5','HR6',
                              'Abps1','Abps2','Abps3','Abps4','Abps5','Abps6',
                              'pH1','pH2','pH3','pH4','pH5','pH6'])

Y_pid_train = pd.DataFrame(columns = df_training_label.columns)
for pid in all_pids:
    uniqueData = df_training_original[df_training_original['pid']==pid]
    yResult = df_training_label[df_training_label['pid'] == pid]
    yResult['pid'] = yResult['pid'].astype(np.float)
    Temp = uniqueData['Temp']
    if Temp.isna().sum() > 3:
        continue
    HR = uniqueData['Heartrate']
    Abps = uniqueData['ABPs']
    pH  = uniqueData['pH']
    DF = pd.DataFrame([[np.nan,np.nan, np.nan, np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,
                       np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,
                       np.nan,np.nan,np.nan,np.nan,np.nan]], columns=['pid','Age', 'Temp1','Temp2','Temp3','Temp4','Temp5','Temp6',
                              'HR1','HR2','HR3','HR4','HR5','HR6',
                              'Abps1','Abps2','Abps3','Abps4','Abps5','Abps6',
                              'pH1','pH2','pH3','pH4','pH5','pH6'])

    DF['pid'] = uniqueData['pid'].iloc[0]
    DF['Age'] = uniqueData['Age'].iloc[0]
    for i in range(0,6):
        DF['Temp' + str(i+1)] = Temp.iloc[2*i]
        DF['HR' + str(i+1)] = HR.iloc[2*i]
        DF['Abps' + str(i+1)] = Abps.iloc[2*i]
        DF['pH' + str(i+1)] = pH.iloc[2*i]


    X_pid_train = pd.concat([X_pid_train, DF])
    Y_pid_train = pd.concat([Y_pid_train, yResult])

print(X_pid_train.shape)
#print(Y_pid_train)
#regr.fit(np.array(L), np.array(Y_train_proc.iloc[:,0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


(3063, 26)


In [432]:
my_imputer2 = IterativeImputer()
imputed_X_train_plus2 = pd.DataFrame(my_imputer2.fit_transform(X_pid_train))
imputed_X_train_plus2.columns = X_pid_train.columns

print(imputed_X_train_plus2.shape)

(3063, 26)


In [454]:
imputed_X_train_plus2 = imputed_X_train_plus2.reset_index(drop=True)
Y_pid_train = Y_pid_train.reset_index(drop=True)

trainingPIDS, validationPIDS = partitionData([pid for pid in imputed_X_train_plus2['pid'].unique()])
X_pid = getPatientData(imputed_X_train_plus2, trainingPIDS)
Y_pid = getPatientData(Y_pid_train, trainingPIDS)
X_pid_val = getPatientData(imputed_X_train_plus2, validationPIDS)
Y_pid_val = getPatientData(Y_pid_train, validationPIDS)

# Drop pid
X_pid = X_pid.iloc[:,1:]
Y_pid = Y_pid.iloc[:,1:]
X_pid_val = X_pid_val.iloc[:,1:]
Y_pid_val = Y_pid_val.iloc[:,1:]

X_pid_train_norm = pd.DataFrame(normalize(X_pid,axis=0))
X_pid_train_norm.columns = X_pid.columns
X_pid_val_norm = pd.DataFrame(normalize(X_pid_val,axis=0))
X_pid_val_norm.columns = X_pid_val.columns
#print(norm_Xtrain.head())


Training size: 2450
Validation size: 613


In [469]:
regr = LinearSVC(C=1)
regr.fit(np.array(X_pid_train_norm), np.array(Y_pid.iloc[:,1]))


LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [470]:
f = regr.predict(X_pid_val_norm)
print('Prediction')
print(f)
print('')
print('GT')
print(np.array(Y_pid_val.iloc[:,1]))

Prediction
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [464]:
print(mean_absolute_error(f, np.array(Y_pid_val.iloc[:,1])))

0.09787928221859707
