# Imputation using Keras

- This notebook provides a simple method to imputing missing values, and provides code for making a submission file.
- EDA revealed F_2 columns are missing no values
- all columns uncorrelated except for F_4*
- this means the best estimator for missing values for F_1,F_3 is the mean (since competition metric is RMSE)
- we only need to impute F_4*

Here we fit a Keras model for each column of F_4*. 
- for each column we create a training set of only the records that are not missing values
- we use dropout to help learn to deal with missing values


In [1]:
from pathlib import Path
import gc

import numpy as np 
import pandas as pd 

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import layers as L
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
from tensorflow_addons.activations import mish
from tensorflow_addons.layers import WeightNormalization

from tqdm import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

RANDOM_STATE=42
INPUT_PATH = Path('./input')
P=1/55

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


https://github.com/ryancheunggit/tabular_dae

In [2]:
!nvidia-smi

Thu Jun 30 12:41:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.95       Driver Version: 512.95       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:06:00.0  On |                  N/A |
| 32%   59C    P2    40W / 120W |   5803MiB /  6144MiB |     33%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
dtypes = {'row_id' : 'int',
          'F_2_0' : 'int', 'F_2_1' : 'int', 'F_2_2' : 'int',
          'F_2_3' : 'int', 'F_2_4' : 'int', 'F_2_5' : 'int', 
          'F_2_6' : 'int', 'F_2_7' : 'int', 'F_2_8' : 'int',
          'F_2_9' : 'int', 'F_2_10' : 'int', 'F_2_11' : 'int',
          'F_2_12' : 'int', 'F_2_13' : 'int', 'F_2_14' : 'int',
          'F_2_15' : 'int', 'F_2_16' : 'int', 'F_2_17' : 'int',
          'F_2_18' : 'int', 'F_2_19' : 'int', 'F_2_20' : 'int',
          'F_2_21' : 'int', 'F_2_22' : 'int', 'F_2_23' : 'int',
          'F_2_24' : 'int'}

data = pd.read_csv(INPUT_PATH / 'data.csv', 
                   index_col='row_id',
                   dtype = dtypes)
submission = pd.read_csv(INPUT_PATH / 'sample_submission.csv', 
                         index_col='row-col')

In [4]:
#We can see that the values are acceptably small and don't have to be normalised
data.agg(['min','mean','max'])

Unnamed: 0,F_1_0,F_1_1,F_1_2,F_1_3,F_1_4,F_1_5,F_1_6,F_1_7,F_1_8,F_1_9,F_1_10,F_1_11,F_1_12,F_1_13,F_1_14,F_2_0,F_2_1,F_2_2,F_2_3,F_2_4,F_2_5,F_2_6,F_2_7,F_2_8,F_2_9,F_2_10,F_2_11,F_2_12,F_2_13,F_2_14,F_2_15,F_2_16,F_2_17,F_2_18,F_2_19,F_2_20,F_2_21,F_2_22,F_2_23,F_2_24,F_3_0,F_3_1,F_3_2,F_3_3,F_3_4,F_3_5,F_3_6,F_3_7,F_3_8,F_3_9,F_3_10,F_3_11,F_3_12,F_3_13,F_3_14,F_3_15,F_3_16,F_3_17,F_3_18,F_3_19,F_3_20,F_3_21,F_3_22,F_3_23,F_3_24,F_4_0,F_4_1,F_4_2,F_4_3,F_4_4,F_4_5,F_4_6,F_4_7,F_4_8,F_4_9,F_4_10,F_4_11,F_4_12,F_4_13,F_4_14
min,-4.664,-4.791,-4.871,-5.053,-5.363,-5.508,-5.199,-6.99,-4.567,-4.998,-4.795,-4.612,-7.063,-6.896,-4.63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.694,-4.466,-4.886,-4.677,-5.009,-4.871,-5.019,-5.053,-5.508,-4.846,-4.626,-4.598,-4.533,-4.747,-5.363,-4.448,-4.822,-4.806,-5.199,-6.069,-4.998,-7.147,-4.741,-5.251,-4.891,-12.878,-12.532,-9.663,-9.942,-12.825,-12.537,-11.132,-11.678,-10.092,-9.864,-10.354,-26.277,-11.524,-10.662,-9.984
mean,-0.001,0.002,0.001,0.001,0.002,0.001,-0.0,-0.064,-0.0,0.0,0.0,-0.001,-0.061,-0.067,-0.001,2.688,2.514,0.977,2.517,2.941,1.533,1.492,2.646,1.178,1.111,3.28,2.466,2.759,2.48,1.718,1.78,1.801,1.243,1.557,1.603,2.231,2.032,1.606,0.709,3.134,0.002,-0.001,0.001,0.001,0.001,-0.002,0.0,0.002,0.001,-0.0,0.002,0.001,0.0,-0.002,0.001,-0.002,-0.001,-0.0,0.0,-0.065,0.002,-0.059,0.0,0.0,-0.001,0.327,-0.331,-0.086,-0.195,0.333,0.336,0.004,0.334,-0.072,-0.08,0.038,0.552,0.334,0.33,0.037
max,5.039,5.043,5.13,5.462,4.857,4.961,4.958,2.528,4.886,4.789,4.914,4.818,2.301,2.543,4.816,15.0,14.0,11.0,14.0,16.0,12.0,12.0,16.0,13.0,11.0,17.0,13.0,15.0,15.0,13.0,13.0,13.0,12.0,15.0,13.0,14.0,15.0,16.0,11.0,17.0,4.587,4.851,4.763,4.988,4.722,5.039,4.525,5.462,5.107,5.101,5.13,4.685,4.943,4.71,4.82,5.248,4.839,5.058,4.961,2.666,6.032,2.392,4.967,4.809,4.981,10.657,11.674,2.909,2.582,11.927,13.54,11.525,12.536,2.607,2.815,2.548,31.229,11.342,11.901,2.584


In [5]:
def cols_by_prefix(columns, prefix):
    return [x for x in columns if x.startswith(prefix)]

cols_f1 = cols_by_prefix(data.columns, 'F_1')
cols_f2 = cols_by_prefix(data.columns, 'F_2')
cols_f3 = cols_by_prefix(data.columns, 'F_3')
cols_f4 = cols_by_prefix(data.columns, 'F_4')
cols_f134 = cols_f1 + cols_f3 + cols_f4
cols_f123 = cols_f1 + cols_f2 + cols_f3

data_f134 = data[cols_f134]
data_f1 = data[cols_f1]
data_f2 = data[cols_f2]
data_f3 = data[cols_f3]
data_f4 = data[cols_f4]

In [6]:
def make_training(df, n, p, random_state):
    # first find all rows with *no* NaN; sample n rows
    df = df[~df.isnull().any(axis=1)]
    if n > 0:
        df = df.sample(n=n, random_state=random_state)
    
    # random mask of NaN locations; only cols F_1*, F_3*, F_4*
    mask = np.random.random(df[cols_f134].shape) < p
    df_na = df[cols_f134].mask(mask)

    # put it back together with F_2*
    df_na = pd.concat([df_na[cols_f1], df[cols_f2], df_na[cols_f3], df_na[cols_f4]], axis=1)
    return df, df_na, df_na.isna().sum().sum()

def sse_cols(df1, df2):
    return ((df1 - df2).pow(2)).sum()

def rmse(df1, df2, n):
    return (sse_cols(df1, df2).sum()/n)**0.5

In [7]:
def get_model(input_size=28, hidden_size=64):
    
    # Input:
    inputF4 = L.Input(shape=input_size)

    # Network:
    x = L.Dense(units=hidden_size*4,  
                kernel_regularizer=tf.keras.regularizers.l2(40e-6),
                activation=mish)(inputF4)
#    x = L.BatchNormalization()(x)
    x = L.Dense(units=hidden_size*4, 
                kernel_regularizer=tf.keras.regularizers.l2(40e-6),
                activation=mish)(x)
#    x = L.BatchNormalization()(x)
    x = L.Dense(units=hidden_size*2,
                kernel_regularizer=tf.keras.regularizers.l2(40e-6),
                activation=mish)(x)
#    x = L.BatchNormalization()(x)
    x = L.Dense(units=hidden_size, 
                kernel_regularizer=tf.keras.regularizers.l2(40e-6),
                activation=mish)(x)
#    x = L.BatchNormalization()(x)
    x = L.Dense(units=1, activation='linear')(x)
    
    # Output:
    model = tf.keras.Model(inputF4, x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
        loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )
    return(model)

## Impute mean for F1,F3

In [8]:
# grab a training subset
train, train_na, na_count = make_training(data, -1, P, RANDOM_STATE)

mean_imputer = SimpleImputer(strategy="mean")

# include F2 and passthrough just to retain column order
mean_imputer = ColumnTransformer(
    transformers=[
        ("mean1", mean_imputer, cols_f1),
        ("mean2", mean_imputer, cols_f2),
        ("mean3", mean_imputer, cols_f3),
    ],
    remainder='passthrough'
)
train_na[:] = mean_imputer.fit_transform(train_na)

## Iteratively impute F4_* using Keras

In [9]:
for col in ['F_4_0']:
    print(col)
    x_train = data_f4[data_f4[col].notna()]
    x_test = data_f4[data_f4[col].isna()]

    x_train, x_valid = train_test_split(x_train, test_size=0.20)
    y_train = x_train.pop(col)
    y_valid = x_valid.pop(col)
    x_test.pop(col)

    imputer = SimpleImputer(strategy="constant", fill_value=-128, add_indicator=True)
    x_train = imputer.fit_transform(x_train)
    x_valid = imputer.fit_transform(x_valid)
    x_test = imputer.fit_transform(x_test)
    
    print(f'train: {x_train.shape}, valid: {x_valid.shape}, test: {x_test.shape}')
    print(f'train: {y_train.shape}, valid: {y_valid.shape}')

    model = get_model()
    history = model.fit(
        x=x_train, y=y_train, 
        validation_data=(x_valid, y_valid),
        epochs=2, 
        verbose=2,
        callbacks=[
            ReduceLROnPlateau(monitor='val_loss',mode='min',
                verbose=0,factor=0.5,patience=3),
            EarlyStopping(mode='min',restore_best_weights=True,
                verbose=0,min_delta=1e-4,patience=10),
        ]
    )
    y_pred = model.predict(x_valid)
    print(f'FFNN: col={col}, RMSE={mean_squared_error(y_valid,y_pred,squared=False)}')        

    
    

F_4_0
train: (785497, 28), valid: (196375, 28), test: (18128, 28)
train: (785497,), valid: (196375,)
24547/24547 - 210s - loss: 405.4362 - root_mean_squared_error: 20.1219 - val_loss: 5.8766 - val_root_mean_squared_error: 2.2926


NameError: name 'cnt' is not defined

In [None]:
%%time



from keras.wrappers.scikit_learn import KerasClassifier

callbacks=[ReduceLROnPlateau(monitor='val_loss',mode='min',
                             verbose=0,factor=0.5,patience=3),
           EarlyStopping(mode='min',restore_best_weights=True,
                         verbose=0,min_delta=1e-4,patience=10)
          ]

estimator = KerasClassifier(build_fn=lambda: get_model(),
                            epochs=1, 
                            validation_split=0.2,
                            callbacks=callbacks)
                            
iter_imputer = IterativeImputer(estimator=estimator,
                                max_iter=1,
                                verbose=2,
                                random_state=RANDOM_STATE)

train_na[cols_f4] = iter_imputer.fit_transform(train_na[cols_f4].to_numpy())
print(f'RMSE={rmse(train, train_na, na_count)}')

In [None]:
y_train = train_na['F_4_0']
X_train = train_na[cols_f4].drop(columns='F_4_0')

print(X_train.shape, y_train.shape)

estimator.fit(X_train, y_train)

In [None]:
display(X_train.iloc[0].to_numpy().shape)
estimator.predict(X_train)

In [None]:
def train_test_split(df, col):
    X_train = df[df[col].notna()]
    y_train = X_train[col]
    X_train = X_train.drop(columns=col)
    
    X_test = df[df[col].isna()]
    X_test = X_test.drop(columns=col)
    return X_train, y_train, X_test


In [None]:
def fit_model(df, col):

    print(f'**** imputing {col}')
    X_train, y_train, X_test = train_test_split(df[cols_f4], col)
    print(f'{X_train.shape}, {y_train.shape}, {X_test.shape}')

    X_train = X_train.fillna(-1000)
    X_test = X_test.fillna(-1000)

    model = get_model(X_train.shape[1])
    model.fit(X_train, y_train, validation_split=0.1,
              epochs=300, 
            callbacks=[
                ReduceLROnPlateau(monitor='val_loss',mode='min',
                    verbose=0,factor=0.5,patience=3),
                EarlyStopping(mode='min',restore_best_weights=True,
                    verbose=0,min_delta=1e-4,patience=10),
                ModelCheckpoint(f'model_{col}.hdf5',monitor='val_loss',mode='min',
                    verbose=0,save_best_only=True,save_weights_only=True),
            ]      
             )
    model.load_weights(f'model_{cnt}_{col}.hdf5')
    return model.predict(X_test)
    



In [None]:
all_preds = {}

for col in cols_f4:
    preds = fit_model(train_na, col)
    all_preds[col] = preds


In [None]:
def update_preds(df, all_preds):
    for col in all_preds.keys():
        print(col)
        preds = all_preds[col]
        df.update(preds)
    return df

In [None]:
train_na = update_preds(train_na, all_preds)

In [None]:
rmse(train, train_na, na_count)

In [None]:
data[:] = imputer.fit_transform(data)
all_preds = fit_model(data)
data = update_preds(data, all_preds)

In [None]:
def make_submission(data):
    new_submission = submission.copy(deep=True)
    for i in tqdm(new_submission.index):
        row = int(i.split('-')[0])
        col = i.split('-')[1]
        new_submission.loc[i, 'value'] = data.loc[row, col]

    new_submission.to_csv('submission.csv')

In [None]:
make_submission(data)

In [19]:
id = "A0100 00000 XaVu9"
for i in id:
    if i.isupper() == True:
        id = id.replace(i,"1")
    elif i == " ":
        id = id.replace(i," ")
    else:
        print(1,i,id)
        id = id.replace(i,"0")
        print(2,i,id)
print (id)

1 0 10100 00000 XaVu9
2 0 10100 00000 XaVu9
1 1 10100 00000 XaVu9
2 1 00000 00000 XaVu9
1 0 00000 00000 XaVu9
2 0 00000 00000 XaVu9
1 0 00000 00000 XaVu9
2 0 00000 00000 XaVu9
1 0 00000 00000 XaVu9
2 0 00000 00000 XaVu9
1 0 00000 00000 XaVu9
2 0 00000 00000 XaVu9
1 0 00000 00000 XaVu9
2 0 00000 00000 XaVu9
1 0 00000 00000 XaVu9
2 0 00000 00000 XaVu9
1 0 00000 00000 XaVu9
2 0 00000 00000 XaVu9
1 a 00000 00000 1aVu9
2 a 00000 00000 10Vu9
1 u 00000 00000 101u9
2 u 00000 00000 10109
1 9 00000 00000 10109
2 9 00000 00000 10100
00000 00000 10100


In [21]:
old = "A0100 00000 XaVu9"
new = []
for i in old:
    if i.isupper() == True:
        new.append("1")
    elif i == " ":
        new.append(" ")
    else:
        new.append("0")
print (''.join(new))

10000 00000 10100
