In [37]:
import pandas as pd
import numpy as np

# Raw train/test splits, exactly as provided.
train, test = pd.read_csv('./train.csv'), pd.read_csv('./test.csv')

# Normalized counterparts of the same two splits.
train_norm, test_norm = pd.read_csv('./train_norm.csv'), pd.read_csv('./test_norm.csv')

# Submission template.
sub = pd.read_csv('./sample_submission.csv')

In [40]:
# Training features: every column except the first one (presumably a row id --
# confirm against the CSV header) and the last one (the target, see `y` below).
X, X_norm = train.iloc[:, 1:-1], train_norm.iloc[:, 1:-1]

# Test features: only the leading column is dropped. The test frames have no
# trailing target column (the normalization cell builds test features with
# `test.iloc[:,1:]` and stacks them under `train.iloc[:,1:-1]`), so slicing with
# `1:-1` here would silently discard the last feature and leave X_test one
# column narrower than X.
X_test, X_norm_test = test.iloc[:, 1:], test_norm.iloc[:, 1:]

# Target: last column of the raw training frame.
y = train.iloc[:, -1]

# Data Preprocessing
- There are no missing values, so no imputation is needed.
- The feature values range from -1 to 1, so normalization is not strictly necessary. Nevertheless, a normalization is run over all the feature columns.
- PCA could be done to decrease the dimensions of the data:
    - First train and test data should be combined.
    - Then a PCA algorithm should be run over all of the data.

In [9]:
# Number of columns containing at least one missing value, for train and test.
train.isna().any().sum(), test.isna().any().sum()

(0, 0)

In [33]:
# from sklearn.preprocessing import StandardScaler

# feature_data = pd.concat([train.iloc[:,1:-1], test.iloc[:,1:]])
# feature_data_norm = StandardScaler().fit_transform(feature_data)

# train.iloc[:,1:-1] = feature_data_norm[:train.shape[0]]
# test.iloc[:,1:] = feature_data_norm[train.shape[0]:]

# train.to_csv('train_norm.csv', index=False)
# test.to_csv('test_norm.csv', index=False)

# Modeling
The end goal is to train multiple models and aggregate their results. The models are:
- Neural Networks
- Random Forests and XGBoost
- SVMs
- Naive Bayes

All the models will be saved and loaded for ensembling.


In [46]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau

In [48]:
# Linear regression baseline: neither Dense layer is given an activation.

model = Sequential()

# Wide hidden projection with seeded truncated-normal initializers;
# L1 penalty on the kernel, L2 penalty on the bias.
model.add(
    Dense(
        units=1000,
        kernel_initializer=TruncatedNormal(mean=0, stddev=2, seed=111),
        kernel_regularizer=l1(0.001),
        bias_initializer=TruncatedNormal(mean=0, stddev=0.1, seed=121),
        bias_regularizer=l2(0.01),
    )
)
model.add(BatchNormalization())

# Single-unit output for the regression target.
model.add(Dense(units=1))

In [49]:
# Training budget and callback patience.
# Both patience values must stay well below EPOCHS: with patience == epochs the
# early-stopping callback can never trigger inside a run, and a learning-rate
# patience of 30 allows at most three reductions in 100 epochs.
EPOCHS = 100
EARLY_STOP_PATIENCE = 20   # epochs without val_loss improvement before stopping
LR_PATIENCE = 5            # epochs without val_loss improvement before lowering the LR

model.compile(optimizer=Adam(0.0015), loss='mae')

training_callbacks = [
    # Stop once val_loss has stalled and roll back to the best epoch's weights.
    EarlyStopping(monitor='val_loss',
                  min_delta=0,
                  patience=EARLY_STOP_PATIENCE,
                  verbose=1,
                  mode='min',
                  restore_best_weights=True),
    # Gently decay the learning rate (x0.9) whenever val_loss plateaus; the
    # shorter patience lets it act several times before early stopping kicks in.
    ReduceLROnPlateau(monitor='val_loss',
                      factor=0.9,
                      patience=LR_PATIENCE,
                      verbose=1,
                      min_delta=0,
                      cooldown=0,
                      min_lr=1e-12),
]

# Keep the History object so the loss curves can be inspected afterwards
# (and so the cell does not echo the object's repr).
history = model.fit(X, y,
                    validation_split=0.3, verbose=1,
                    batch_size=32, epochs=EPOCHS,
                    callbacks=training_callbacks)

Epoch 1/100


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100

KeyboardInterrupt: 