In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler



In [2]:
from tps_feb_2021.config import config
from tps_feb_2021.utils import add_noise, save_model, get_run_logdir, extract_features

In [3]:
# The root data directory comes from the project config; both raw CSV files
# are read from its 'raw/' subfolder.
data_dir = config['data_dir']

train_raw, test_raw = (
    pd.read_csv(data_dir + relative_path)
    for relative_path in ('raw/train.csv', 'raw/test.csv')
)


In [4]:
# Split the feature columns by name prefix: 'cont…' and 'cat…'.
cont_cols = [name for name in train_raw.columns if name.startswith('cont')]
cat_cols = [name for name in train_raw.columns if name.startswith('cat')]

In [5]:
# Stack the 'cat*' and 'cont*' columns of train and test into one frame, so the
# autoencoder is trained on every available row (only these feature columns
# are selected).
# ignore_index=True gives the result a clean 0..n-1 index. Without it the test
# rows reuse the train rows' labels, and any later index-aligned operation
# (e.g. pd.concat(..., axis=1)) can misalign or fail on the duplicates.
feature_cols = cat_cols + cont_cols
X = pd.concat(
    [train_raw[feature_cols], test_raw[feature_cols]],
    ignore_index=True,
)
X.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,A,B,A,A,B,D,A,E,C,I,...,0.281421,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903
1,B,A,A,A,B,B,A,E,A,F,...,0.282354,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464
2,A,A,A,C,B,D,A,B,C,N,...,0.293756,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352
3,A,A,A,C,B,D,A,E,G,K,...,0.769785,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766
4,A,B,A,A,B,B,A,E,C,F,...,0.279105,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743


In [6]:
# Sanity check: (rows, columns) of the stacked train + test feature frame.
X.shape

(500000, 24)

In [7]:
# Build the corrupted copy of X that the autoencoder receives as input.
# NOTE(review): add_noise is a project helper (tps_feb_2021.utils); p=0.40 is
# presumably the per-cell corruption probability (output files are tagged
# "swap_noise_40") — confirm there. No random seed is passed here, so check
# whether the helper seeds itself if reproducibility matters.
noisy_X = add_noise(X, p=0.40)

In [8]:
# Standardise the continuous columns. The scaler is fitted ONCE, on the clean
# data, and that same fitted transform is applied to the noisy copy so the
# network's inputs (noisy) and targets (clean) share one scale. Re-fitting on
# the noisy frame would also leave `scaler` holding the noisy statistics
# instead of the clean ones.
# Both results get a fresh 0..n-1 index because they are rebuilt from arrays.
scaler = StandardScaler()
cont_X_scaled = pd.DataFrame(scaler.fit_transform(X[cont_cols]), columns=cont_cols)
cont_noisy_X_scaled = pd.DataFrame(scaler.transform(noisy_X[cont_cols]), columns=cont_cols)

In [9]:
# One-hot encode the categorical columns of the clean frame and of its noisy copy.
one_hot_cats = pd.get_dummies(data=X[cat_cols])
noisy_one_hot_cats = pd.get_dummies(data=noisy_X[cat_cols])

In [10]:
# Assemble the final model matrices: one-hot categoricals followed by the
# scaled continuous columns.
# Both sides drop their index before the column-wise concat so rows are joined
# purely by position. The scaled frames already carry a fresh 0..n-1 index,
# while the one-hot frames inherit whatever index their source frame had;
# resetting both makes the noisy and clean matrices consistent and avoids
# index-alignment surprises.
# Note: this rebinds `noisy_X` and `X` from the raw frames to the encoded ones.
noisy_X = pd.concat(
    [noisy_one_hot_cats.reset_index(drop=True), cont_noisy_X_scaled.reset_index(drop=True)],
    axis=1,
)
X = pd.concat(
    [one_hot_cats.reset_index(drop=True), cont_X_scaled.reset_index(drop=True)],
    axis=1,
)

In [11]:
# Shape of the assembled noisy input matrix (one-hot categoricals + scaled continuous).
noisy_X.shape

(500000, 70)

In [12]:
# Shape of the assembled clean target matrix; compare with noisy_X.shape.
X.shape

(500000, 70)

In [13]:
# Column names of the noisy input matrix, for comparison with X.columns.
noisy_X.columns

Index(['cat0_A', 'cat0_B', 'cat1_A', 'cat1_B', 'cat2_A', 'cat2_B', 'cat3_A',
       'cat3_B', 'cat3_C', 'cat3_D', 'cat4_A', 'cat4_B', 'cat4_C', 'cat4_D',
       'cat5_A', 'cat5_B', 'cat5_C', 'cat5_D', 'cat6_A', 'cat6_B', 'cat6_C',
       'cat6_D', 'cat6_E', 'cat6_G', 'cat6_H', 'cat6_I', 'cat7_A', 'cat7_B',
       'cat7_C', 'cat7_D', 'cat7_E', 'cat7_F', 'cat7_G', 'cat7_I', 'cat8_A',
       'cat8_B', 'cat8_C', 'cat8_D', 'cat8_E', 'cat8_F', 'cat8_G', 'cat9_A',
       'cat9_B', 'cat9_C', 'cat9_D', 'cat9_E', 'cat9_F', 'cat9_G', 'cat9_H',
       'cat9_I', 'cat9_J', 'cat9_K', 'cat9_L', 'cat9_M', 'cat9_N', 'cat9_O',
       'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')

In [14]:
# Column names of the clean target matrix, for comparison with noisy_X.columns.
X.columns

Index(['cat0_A', 'cat0_B', 'cat1_A', 'cat1_B', 'cat2_A', 'cat2_B', 'cat3_A',
       'cat3_B', 'cat3_C', 'cat3_D', 'cat4_A', 'cat4_B', 'cat4_C', 'cat4_D',
       'cat5_A', 'cat5_B', 'cat5_C', 'cat5_D', 'cat6_A', 'cat6_B', 'cat6_C',
       'cat6_D', 'cat6_E', 'cat6_G', 'cat6_H', 'cat6_I', 'cat7_A', 'cat7_B',
       'cat7_C', 'cat7_D', 'cat7_E', 'cat7_F', 'cat7_G', 'cat7_I', 'cat8_A',
       'cat8_B', 'cat8_C', 'cat8_D', 'cat8_E', 'cat8_F', 'cat8_G', 'cat9_A',
       'cat9_B', 'cat9_C', 'cat9_D', 'cat9_E', 'cat9_F', 'cat9_G', 'cat9_H',
       'cat9_I', 'cat9_J', 'cat9_K', 'cat9_L', 'cat9_M', 'cat9_N', 'cat9_O',
       'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')

In [15]:
# Denoising autoencoder: takes a row of noisy_X and regresses the matching
# clean row of X through three 500-unit ReLU hidden layers.
# The output width is taken from the target matrix instead of being hard-coded,
# so the model stays valid if the encoded column count changes.
model = keras.models.Sequential([
    keras.layers.Input(shape=noisy_X.shape[1:]),
    keras.layers.Dense(500, activation='relu'),
    keras.layers.Dense(500, activation='relu'),
    keras.layers.Dense(500, activation='relu'),
    # Linear output layer: one unit per target column, trained with MSE.
    keras.layers.Dense(X.shape[1])
])

model.compile(loss="mean_squared_error", optimizer=keras.optimizers.Adam(learning_rate=0.001))

In [16]:
# TensorBoard logging: get_run_logdir() (project helper) supplies the log
# directory for this training run.
run_logdir = get_run_logdir()
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir)

In [17]:
# Stop training once the training loss has not improved for 10 consecutive
# epochs (no validation split is used, so 'loss' is the monitored quantity).
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10, monitor='loss')

In [18]:
# Train the autoencoder: noisy rows in, clean rows out. The epoch budget is
# only an upper bound; the early-stopping callback normally ends the run.
history = model.fit(
    x=noisy_X,
    y=X,
    epochs=2001,
    callbacks=[tensorboard_cb, early_stopping_cb],
)

Epoch 1/2001
Epoch 2/2001
Epoch 3/2001
Epoch 4/2001
Epoch 5/2001
Epoch 6/2001
Epoch 7/2001
Epoch 8/2001
Epoch 9/2001
Epoch 10/2001
Epoch 11/2001
Epoch 12/2001
Epoch 13/2001
Epoch 14/2001
Epoch 15/2001
Epoch 16/2001
Epoch 17/2001
Epoch 18/2001
Epoch 19/2001
Epoch 20/2001
Epoch 21/2001
Epoch 22/2001
Epoch 23/2001
Epoch 24/2001
Epoch 25/2001
Epoch 26/2001
Epoch 27/2001
Epoch 28/2001
Epoch 29/2001
Epoch 30/2001
Epoch 31/2001
Epoch 32/2001
Epoch 33/2001
Epoch 34/2001
Epoch 35/2001
Epoch 36/2001
Epoch 37/2001
Epoch 38/2001
Epoch 39/2001
Epoch 40/2001
Epoch 41/2001
Epoch 42/2001
Epoch 43/2001
Epoch 44/2001
Epoch 45/2001
Epoch 46/2001
Epoch 47/2001
Epoch 48/2001
Epoch 49/2001
Epoch 50/2001
Epoch 51/2001
Epoch 52/2001
Epoch 53/2001
Epoch 54/2001
Epoch 55/2001
Epoch 56/2001
Epoch 57/2001
Epoch 58/2001
Epoch 59/2001
Epoch 60/2001
Epoch 61/2001
Epoch 62/2001
Epoch 63/2001
Epoch 64/2001
Epoch 65/2001
Epoch 66/2001
Epoch 67/2001
Epoch 68/2001
Epoch 69/2001
Epoch 70/2001
Epoch 71/2001
Epoch 72/2001
E

In [19]:
# Persist the trained autoencoder via the project helper.
# NOTE(review): the target directory is decided inside save_model
# (tps_feb_2021.utils) — confirm where the .h5 file lands.
save_model(model, '06_dae_model_swap_noise_40.h5')

In [20]:
# Optional shortcut: reload a previously saved model instead of retraining.
# NOTE(review): this path names the "swap_noise_25" model, not the
# "swap_noise_40" model saved by this notebook — update it before uncommenting.
# model = keras.models.load_model('../models/06_dae_model_swap_noise_25.h5')

In [21]:
# Inspect the model's input tensor spec (expected width = number of encoded columns).
model.inputs

[<KerasTensor: shape=(None, 70) dtype=float32 (created by layer 'input_1')>]

In [22]:
# Run the CLEAN matrix X (not noisy_X) through the trained network and collect
# learned features.
# NOTE(review): extract_features is a project helper; the result has 1500
# columns, so it presumably concatenates the activations of the three 500-unit
# hidden layers — confirm in tps_feb_2021.utils.
features_df = extract_features(model, X)



In [23]:
# Preview the extracted feature matrix (pandas truncates the display).
features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.749727,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.301094,0.0,1.383269,0.0,0.000000,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.382377,0.0,0.000000,0.0,0.866042,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.600120,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.433342,0.0,0.000000,0.0,1.074741,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0.0,0.0,0.000000,0.0,0.0,0.0,0.961982,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.710420,0.0,0.000000,0.0
499996,0.0,0.0,1.047975,0.0,0.0,0.0,0.804564,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.855798,0.0
499997,0.0,0.0,0.000000,0.0,0.0,0.0,0.092414,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0
499998,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.602676,0.0,0.000000,0.0,0.962269,0.0


In [24]:
# Per-feature summary statistics; many columns appear to be zero everywhere.
features_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,...,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,0.000475,0.00015,0.351002,0.000972,0.0,0.000136,0.661634,0.0,0.000267,0.0,...,2e-05,1.1e-05,8.892937e-07,0.0,0.427041,8.6e-05,0.210648,0.0,0.361074,0.0
std,0.023207,0.015219,0.58462,0.039583,0.0,0.014986,0.731846,0.0,0.018325,0.0,...,0.005768,0.00346,0.0006288256,0.0,0.709982,0.012143,0.465592,0.0,0.479201,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.350689,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.572796,0.0,0.0,0.0,1.390294,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.692814,0.0,0.029857,0.0,0.956616,0.0
max,3.398247,3.077224,4.041704,4.753543,0.0,3.940678,3.236087,0.0,3.111079,0.0,...,3.090553,1.829194,0.4446468,0.0,6.903209,2.788623,8.326519,0.0,2.172947,0.0


In [25]:
# Keep the summary table for the dead-feature scan below.
# Note: this recomputes the same describe() the previous cell displayed.
descrip = features_df.describe()

In [26]:
# Spot-check: mean activation of feature column 1 (single label-based lookup).
descrip.loc['mean', 1]

0.00014969886979088187

In [27]:
# Partition the extracted features by mean activation. With the non-negative
# activations seen above (min is 0.0), a mean of exactly zero means the column
# is zero for every row, i.e. it carries no information.
feature_means = descrip.loc['mean']
unused_features = [col for col in features_df.columns if feature_means[col] == 0.0]
active_features = [col for col in features_df.columns if not feature_means[col] == 0.0]
print('active features = ', len(active_features))
print('unused features = ', len(unused_features))

active features =  715
unused features =  785


In [28]:
# Keep only the feature columns that are non-zero somewhere.
active_features_df = features_df.loc[:, active_features]

In [29]:
# The train rows were stacked first, so the first len(train_raw) rows
# (by position) are the training features.
X_train_features_df = active_features_df.iloc[:len(train_raw)]
X_train_features_df.shape[0]

300000

In [30]:
# Everything after the first len(train_raw) rows (by position) is test data.
X_test_features_df = active_features_df.iloc[len(train_raw):]
X_test_features_df.shape[0]

200000

In [31]:
# Write the training-set encoded features without the index column.
X_train_features_df.to_csv(
    path_or_buf=data_dir + 'processed/X_train_dae_encoded_sn_40.csv',
    index=False,
)

In [32]:
# Write the test-set encoded features without the index column.
X_test_features_df.to_csv(
    path_or_buf=data_dir + 'processed/X_test_dae_encoded_sn_40.csv',
    index=False,
)

## Test linear model fit

In [33]:
from  sklearn.linear_model import Ridge

In [34]:
# Load the training targets prepared elsewhere in the project.
y = pd.read_csv(filepath_or_buffer=data_dir + 'processed/y_train.csv')

In [35]:
# Quick linear probe: how well does a ridge regression on the encoded
# training features fit the target?
ridge_reg = Ridge(solver='cholesky', alpha=1)
ridge_reg.fit(X=X_train_features_df, y=y)

  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


Ridge(alpha=1, solver='cholesky')

In [36]:
# Predict on the SAME rows the model was fitted on, so the score computed from
# y_pred is an in-sample (training) error, not a validation error.
y_pred = ridge_reg.predict(X_train_features_df)

In [37]:
from sklearn.metrics import mean_squared_error

# In-sample RMSE of the ridge probe.
# The `squared=False` flag is deprecated in scikit-learn 1.4 and removed in
# 1.6, so the root is taken explicitly here. 'raw_values' returns one MSE per
# output column; taking the root per column and then averaging reproduces what
# `squared=False` computed with the default uniform averaging.
(mean_squared_error(y, y_pred, multioutput='raw_values') ** 0.5).mean()

0.8563208600556839