In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler



In [2]:
from tps_feb_2021.config import config
from tps_feb_2021.utils import add_noise, save_model, get_run_logdir, extract_features

In [3]:
# Root of the project's data tree, taken from the shared config. It is joined to the
# relative paths by plain string concatenation, so it must end with a path separator.
data_dir = config['data_dir']

# Read the raw train and test splits from the 'raw/' sub-directory.
train_raw, test_raw = (
    pd.read_csv(data_dir + relative_path)
    for relative_path in ('raw/train.csv', 'raw/test.csv')
)


In [4]:
# Partition the feature columns by name prefix: 'cont*' vs 'cat*'.
cont_cols = [name for name in train_raw.columns if name.startswith('cont')]
cat_cols = [name for name in train_raw.columns if name.startswith('cat')]

In [5]:
# Stack the train and test features (categoricals first, then continuous) so the
# autoencoder is trained on every available row.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the test
# rows reuse the train rows' labels, and any later label-aligned operation (such as a
# column-wise concat) would see duplicate labels.
X = pd.concat(
    [train_raw[cat_cols + cont_cols], test_raw[cat_cols + cont_cols]],
    ignore_index=True,
)
X.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,A,B,A,A,B,D,A,E,C,I,...,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903
1,B,A,A,A,B,B,A,E,A,F,...,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464
2,A,A,A,C,B,D,A,B,C,N,...,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352
3,A,A,A,C,B,D,A,E,G,K,...,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766
4,A,B,A,A,B,B,A,E,C,F,...,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743


In [6]:
# Sanity check: (rows, columns) of the stacked train + test feature frame.
X.shape

(500000, 24)

In [7]:
# Build the corrupted copy of X that is fed to the autoencoder as its input.
# add_noise is a project helper; p=0.30 is presumably the fraction of values that get
# corrupted -- TODO confirm against tps_feb_2021.utils.add_noise.
# NOTE(review): no random seed is set anywhere in this notebook, so the noise (and
# everything trained on it) may differ between runs -- confirm whether add_noise seeds itself.
noisy_X = add_noise(X, p=0.30)

In [8]:
# Standardise the continuous columns.
# The scaler is fitted once, on the clean data, and those fitted statistics are then
# applied to the noisy copy. Re-fitting on the noisy data would put the network's inputs
# and its reconstruction targets on two different scales, and would leave `scaler`
# holding the noisy-data statistics.
scaler = StandardScaler()
cont_X_scaled = pd.DataFrame(scaler.fit_transform(X[cont_cols]), columns=cont_cols)
cont_noisy_X_scaled = pd.DataFrame(scaler.transform(noisy_X[cont_cols]), columns=cont_cols)

In [9]:
# One-hot encode the categorical columns of the clean frame first; its column set
# defines the layout of the reconstruction target.
one_hot_cats = pd.get_dummies(X[cat_cols])

# Encode the noisy frame and force it onto exactly the same columns, in the same order.
# Two independent get_dummies calls only agree when both frames contain the same set of
# category levels; a level missing from the noisy copy is added as an all-zero column here.
noisy_one_hot_cats = pd.get_dummies(noisy_X[cat_cols]).reindex(
    columns=one_hot_cats.columns, fill_value=0
)

In [10]:
# Assemble the final model matrices: one-hot categoricals followed by scaled continuous
# columns. Both halves are re-indexed to 0..n-1 before the column-wise concat so rows are
# paired by position; the scaled frames already carry a fresh index, while the one-hot
# frames inherit whatever index their source frame had.
# NOTE: this cell rebinds noisy_X and X, so it is not safe to re-run on its own --
# re-run from the cell that builds X.
noisy_X = pd.concat(
    [noisy_one_hot_cats.reset_index(drop=True), cont_noisy_X_scaled.reset_index(drop=True)],
    axis=1,
)
X = pd.concat([one_hot_cats.reset_index(drop=True), cont_X_scaled.reset_index(drop=True)], axis=1)

# The autoencoder reconstructs X from noisy_X column by column, so the layouts must match.
assert list(noisy_X.columns) == list(X.columns), "noisy and clean feature columns differ"

In [11]:
# Shape of the encoded noisy input matrix; should match the clean matrix shown next.
noisy_X.shape

(500000, 70)

In [12]:
# Shape of the encoded clean target matrix.
X.shape

(500000, 70)

In [13]:
# Column layout of the noisy input: one-hot categorical columns, then continuous columns.
noisy_X.columns

Index(['cat0_A', 'cat0_B', 'cat1_A', 'cat1_B', 'cat2_A', 'cat2_B', 'cat3_A',
       'cat3_B', 'cat3_C', 'cat3_D', 'cat4_A', 'cat4_B', 'cat4_C', 'cat4_D',
       'cat5_A', 'cat5_B', 'cat5_C', 'cat5_D', 'cat6_A', 'cat6_B', 'cat6_C',
       'cat6_D', 'cat6_E', 'cat6_G', 'cat6_H', 'cat6_I', 'cat7_A', 'cat7_B',
       'cat7_C', 'cat7_D', 'cat7_E', 'cat7_F', 'cat7_G', 'cat7_I', 'cat8_A',
       'cat8_B', 'cat8_C', 'cat8_D', 'cat8_E', 'cat8_F', 'cat8_G', 'cat9_A',
       'cat9_B', 'cat9_C', 'cat9_D', 'cat9_E', 'cat9_F', 'cat9_G', 'cat9_H',
       'cat9_I', 'cat9_J', 'cat9_K', 'cat9_L', 'cat9_M', 'cat9_N', 'cat9_O',
       'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')

In [14]:
# Column layout of the clean target; compare with the noisy input's columns above.
X.columns

Index(['cat0_A', 'cat0_B', 'cat1_A', 'cat1_B', 'cat2_A', 'cat2_B', 'cat3_A',
       'cat3_B', 'cat3_C', 'cat3_D', 'cat4_A', 'cat4_B', 'cat4_C', 'cat4_D',
       'cat5_A', 'cat5_B', 'cat5_C', 'cat5_D', 'cat6_A', 'cat6_B', 'cat6_C',
       'cat6_D', 'cat6_E', 'cat6_G', 'cat6_H', 'cat6_I', 'cat7_A', 'cat7_B',
       'cat7_C', 'cat7_D', 'cat7_E', 'cat7_F', 'cat7_G', 'cat7_I', 'cat8_A',
       'cat8_B', 'cat8_C', 'cat8_D', 'cat8_E', 'cat8_F', 'cat8_G', 'cat9_A',
       'cat9_B', 'cat9_C', 'cat9_D', 'cat9_E', 'cat9_F', 'cat9_G', 'cat9_H',
       'cat9_I', 'cat9_J', 'cat9_K', 'cat9_L', 'cat9_M', 'cat9_N', 'cat9_O',
       'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')

In [15]:
# Denoising autoencoder: three 500-unit SELU hidden layers (with the matching
# lecun_normal initialiser) followed by a linear output layer.
# The output width is taken from the target frame X rather than hard-coded, so the
# network stays consistent if the number of encoded columns changes.
model = keras.models.Sequential([
    keras.layers.Input(shape=noisy_X.shape[1:]),
    keras.layers.Dense(500, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.Dense(500, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.Dense(500, activation='selu', kernel_initializer='lecun_normal'),
    keras.layers.Dense(X.shape[1])
])

# Mean-squared reconstruction error, optimised with Adam.
model.compile(loss="mean_squared_error", optimizer=keras.optimizers.Adam(learning_rate=0.001))

In [16]:
# Log directory for this run, supplied by the project helper, and the TensorBoard
# callback that writes to it.
run_logdir = get_run_logdir()
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir)

In [17]:
# Stop training once the monitored 'loss' (the training loss) has not improved for
# 10 consecutive epochs.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=10,
)

In [18]:
# Train the autoencoder to reconstruct the clean matrix from the noisy one.
# epochs=2001 is only an upper bound; the early-stopping callback can end the run sooner.
history = model.fit(
    x=noisy_X,
    y=X,
    epochs=2001,
    callbacks=[tensorboard_cb, early_stopping_cb],
)

Epoch 1/2001
Epoch 2/2001
Epoch 3/2001
Epoch 4/2001
Epoch 5/2001
Epoch 6/2001
Epoch 7/2001
Epoch 8/2001
Epoch 9/2001
Epoch 10/2001
Epoch 11/2001
Epoch 12/2001
Epoch 13/2001
Epoch 14/2001
Epoch 15/2001
Epoch 16/2001
Epoch 17/2001
Epoch 18/2001
Epoch 19/2001
Epoch 20/2001
Epoch 21/2001
Epoch 22/2001
Epoch 23/2001
Epoch 24/2001
Epoch 25/2001
Epoch 26/2001
Epoch 27/2001
Epoch 28/2001
Epoch 29/2001
Epoch 30/2001
Epoch 31/2001
Epoch 32/2001
Epoch 33/2001
Epoch 34/2001
Epoch 35/2001
Epoch 36/2001
Epoch 37/2001
Epoch 38/2001
Epoch 39/2001
Epoch 40/2001
Epoch 41/2001
Epoch 42/2001
Epoch 43/2001
Epoch 44/2001
Epoch 45/2001
Epoch 46/2001
Epoch 47/2001
Epoch 48/2001
Epoch 49/2001
Epoch 50/2001
Epoch 51/2001
Epoch 52/2001
Epoch 53/2001
Epoch 54/2001
Epoch 55/2001
Epoch 56/2001
Epoch 57/2001
Epoch 58/2001
Epoch 59/2001
Epoch 60/2001
Epoch 61/2001
Epoch 62/2001
Epoch 63/2001
Epoch 64/2001
Epoch 65/2001
Epoch 66/2001
Epoch 67/2001
Epoch 68/2001
Epoch 69/2001
Epoch 70/2001
Epoch 71/2001
Epoch 72/2001
E

In [19]:
# Persist the trained autoencoder via the project helper.
# The target directory is decided inside save_model -- see tps_feb_2021.utils.
save_model(model, '06_dae_model_selu_sn_30.h5')

In [20]:
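# Optional: reload a previously trained model instead of re-running the fit above.
# NOTE: this file name differs from the one passed to save_model in the previous cell.
#model = keras.models.load_model('../models/06_dae_model_swap_noise_30.h5')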

In [21]:
# Inspect the model's input tensor to confirm the expected input width.
model.inputs

[<KerasTensor: shape=(None, 70) dtype=float32 (created by layer 'input_1')>]

In [22]:
# Run the clean (un-noised) matrix through the trained network and collect the learned
# representation via the project helper.
# The result has 1500 columns, presumably the concatenated activations of the three
# 500-unit hidden layers -- TODO confirm in tps_feb_2021.utils.extract_features.
features_df = extract_features(model, X)



In [23]:
# Preview the extracted features (pandas truncates the display of large frames).
features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0.058319,0.000465,0.052489,0.021048,-0.794594,-0.004756,0.010086,-0.715352,0.029382,0.014650,...,0.065788,-0.086147,0.069789,0.003956,0.105133,-0.048815,0.129096,0.064421,0.007491,0.002482
1,0.040359,0.082081,0.037943,0.010939,-1.328174,-0.033000,0.038045,1.619082,0.050926,0.001353,...,0.007663,0.007641,0.068315,0.003265,-0.161186,0.164613,0.052404,0.121457,0.091269,0.002464
2,0.108792,0.053552,0.019014,-0.110000,-0.339576,-0.141808,0.031946,1.223386,0.072953,0.066531,...,0.046698,0.092065,0.092808,0.037581,0.216625,0.119879,0.064635,0.078915,0.081021,0.142035
3,0.029734,0.062232,-0.012232,-0.053900,-0.932599,-0.062278,0.019316,1.155396,0.070561,0.039300,...,0.032138,0.020613,0.038386,-0.001263,0.070591,0.048093,0.007249,-0.016039,0.040505,0.070142
4,0.019453,0.065288,0.033731,0.031940,-1.119124,0.023137,0.031918,-0.020218,0.015763,0.015059,...,0.028058,0.028873,0.093481,0.001618,-0.212316,0.085945,0.018746,0.127637,0.031417,-0.108234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0.066229,0.045676,0.003087,0.060369,-0.754635,-0.047248,0.000370,-0.597162,0.071692,0.034272,...,0.089646,-0.087097,0.030235,-0.012128,0.128132,-0.070481,0.058289,0.032597,0.035057,0.071568
499996,0.005686,0.094318,-0.011155,0.064207,1.151271,-0.038018,0.090155,0.119251,0.067192,0.040168,...,0.057818,0.057854,0.120594,-0.022143,-0.004959,0.034844,0.019324,-0.029615,0.013295,0.053215
499997,0.013255,0.066388,0.010527,0.012821,0.379246,0.016940,0.031929,0.130853,0.064136,0.026497,...,0.031966,0.051732,0.097315,0.013994,0.011130,0.048667,0.030291,0.017709,0.019820,-0.049556
499998,0.038217,0.001306,0.017454,-0.049669,0.017816,-0.099921,0.018635,1.418504,0.036142,-0.008843,...,0.024920,-0.073589,0.089761,-0.028511,0.053014,0.065833,0.144459,0.039169,0.030655,0.022318


In [24]:
# Per-feature summary statistics of the extracted representation.
features_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,...,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,0.025102,0.059247,0.007537,0.013929,-0.44003,-0.027438,0.040679,0.269557,0.04212,0.021308,...,0.055828,0.028474,0.073996,-0.009258,-0.020578,0.059913,0.040509,0.041072,0.040587,0.006793
std,0.024557,0.029455,0.035191,0.040105,0.721729,0.049846,0.028351,0.805048,0.019014,0.03698,...,0.053463,0.048431,0.09776,0.059845,0.079519,0.052592,0.046877,0.057809,0.034782,0.075069
min,-0.390449,-0.36757,-0.66565,-0.305884,-1.73465,-0.465273,-0.455463,-1.023035,-0.210573,-0.3406,...,-0.361819,-0.288986,-1.332452,-0.397591,-0.410056,-0.312431,-0.267537,-0.383548,-0.258766,-0.2887
25%,0.012979,0.040253,-0.011707,-0.007401,-1.014889,-0.063687,0.024501,-0.521208,0.030642,0.002702,...,0.024861,0.006043,0.045827,-0.046013,-0.072785,0.028232,0.014013,0.010669,0.020831,-0.043727
50%,0.026807,0.060236,0.012341,0.017576,-0.578595,-0.019833,0.042631,0.161118,0.042805,0.023326,...,0.056898,0.034714,0.081152,0.003235,-0.007854,0.058954,0.0418,0.046626,0.04411,0.011554
75%,0.04005,0.078863,0.031213,0.039777,0.078632,0.012389,0.059547,1.102319,0.05455,0.04464,...,0.089882,0.060162,0.114661,0.031445,0.034587,0.092632,0.070326,0.081217,0.064362,0.054233
max,0.412326,0.314849,0.310028,0.336443,3.080727,0.331823,0.394082,2.545892,0.346485,0.39296,...,0.327842,0.213785,0.637291,0.209276,0.285022,0.355897,0.275079,0.275906,0.209911,0.369965


In [25]:
# Keep the summary table for the feature-activity check below.
# NOTE(review): this recomputes describe(), which the previous cell already evaluated.
descrip = features_df.describe()

In [26]:
# Spot-check: mean activation of the feature labelled 1, fetched in a single .loc lookup.
descrip.loc['mean', 1]

0.05924705043435097

In [27]:
# Split the extracted features into "active" and "unused".
# A feature is unused when it takes the same value on every row (min == max), i.e. it
# carries no information. This covers the all-zero case that an exact `mean == 0.0` test
# looks for, and also catches units stuck at a constant non-zero value, which a zero-mean
# test can never flag.
is_constant = descrip.loc['min'] == descrip.loc['max']

# Keep both lists in the column order of features_df.
unused_features = [col for col in features_df.columns if is_constant[col]]
active_features = [col for col in features_df.columns if not is_constant[col]]

print('active features = ', len(active_features))
print('unused features = ', len(unused_features))

active features =  1500
unused features =  0


In [28]:
# Keep only the feature columns that were classified as active.
active_features_df = features_df.loc[:, active_features]

In [29]:
# The stacked frame holds the train rows first, so the leading len(train_raw) rows
# (selected by position) are the training features.
n_train_rows = len(train_raw)
X_train_features_df = active_features_df.iloc[:n_train_rows]
len(X_train_features_df)

300000

In [30]:
# Everything after the first len(train_raw) rows (selected by position) belongs to the
# test split.
X_test_features_df = active_features_df.iloc[len(train_raw):]
len(X_test_features_df)

200000

In [31]:
# Write the encoded training features to the processed-data folder, without the row index.
X_train_features_df.to_csv(
    path_or_buf=data_dir + 'processed/X_train_dae_encoded_selu_sn_30.csv',
    index=False,
)

In [32]:
# Write the encoded test features to the processed-data folder, without the row index.
X_test_features_df.to_csv(
    path_or_buf=data_dir + 'processed/X_test_dae_encoded_selu_sn_30.csv',
    index=False,
)

## Test linear model fit

In [33]:
from  sklearn.linear_model import Ridge

In [34]:
# Load the training target that was prepared separately under 'processed/'.
y = pd.read_csv(filepath_or_buffer=data_dir + 'processed/y_train.csv')

In [35]:
# Quick linear probe: fit a ridge regression on the encoded training features.
# fit() returns the estimator itself, so it can be bound in the same statement; the bare
# name on the last line keeps the estimator repr as the cell's output.
ridge_reg = Ridge(alpha=1, solver='cholesky').fit(X_train_features_df, y)
ridge_reg

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Ridge(alpha=1, solver='cholesky')

In [36]:
# In-sample predictions: these are the same rows the ridge model was fitted on.
y_pred = ridge_reg.predict(X_train_features_df)

In [37]:
from sklearn.metrics import mean_squared_error

# Training-set RMSE of the ridge probe. This is an in-sample figure (the predictions were
# made on the fitting rows), so it is not an estimate of generalisation error.
# The square root is taken explicitly instead of passing squared=False: that keyword is
# deprecated and later removed in newer scikit-learn releases, whereas this form works on
# every version.
# NOTE(review): assumes y holds a single target column -- for several outputs the root of
# the averaged MSE is not the same as an average of per-column RMSEs.
mean_squared_error(y, y_pred) ** 0.5

0.8548836979129444