In [1]:
import pandas as pd
import numpy as np

In [218]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [219]:

# Remove constant (zero-variance) columns: they carry no information.
print('Shape:',train.shape)
# Variance is only defined for numeric columns. Ask for that explicitly instead
# of relying on pandas silently skipping the non-numeric column, and select the
# columns to drop by name rather than by a hand-aligned positional mask.
variances = train.var(numeric_only=True)
# `== 0` is False for NaN variances, so such columns are kept (as before).
zero_var_cols = variances.index[variances == 0]
train = train.drop(zero_var_cols, axis=1)
print('After dropping 0 var:',train.shape)

Shape: (4459, 4993)
After dropping 0 var: (4459, 4737)


In [220]:
# Remove features that are nearly duplicates of an earlier feature (|r| > 0.95).
print('Shape:',train.shape)
# Correlate the feature columns only (position 2 onward, the same convention the
# rest of the notebook uses). Leaving the first two columns out keeps the
# non-numeric column away from `.corr()` and prevents a feature from being
# discarded merely because it correlates strongly with the target.
feature_cols = train.columns[2:]
corr_matrix = train[feature_cols].corr().abs()
# Keep only the strict upper triangle so every pair is examined once and the
# diagonal (always 1.0) is ignored. The builtin `bool` replaces the removed
# `np.bool` alias.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
# A column is dropped when it is highly correlated with any column before it.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
train=train.drop(to_drop, axis=1)
print('After dropping corr>0.95:',train.shape)

Shape: (4459, 4737)
After dropping corr>0.95: (4459, 4475)


In [221]:
# Pearson correlation between the target and each column from position 2 onward.
corrs = {
    feature: np.corrcoef(train['target'], train[feature])[0, 1]
    for feature in train.columns[2:]
}

# Names of the columns whose absolute correlation with the target is below 0.1.
s = [name for name, r in corrs.items() if abs(r) < 0.1]

In [222]:
# Report the frame size, drop the weakly-correlated columns listed in `s`,
# then report the size again.
print('Shape:', train.shape)
train = train.drop(labels=s, axis='columns')
print('After dropping corr<0.1:', train.shape)

Shape: (4459, 4475)
After dropping corr<0.1: (4459, 392)


In [223]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Feature matrix: every column from position 2 onward; regression target: 'target'.
X=train[train.columns[2:]]
y=train['target']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=203)

# Standardise exactly once: fit on the training split only, then apply the same
# transform to the hold-out split. (A second StandardScaler used to be applied on
# top of the already-standardised data, which was a redundant copy-paste.)
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

# Keep `scal` bound for any later code that refers to it; it is the same scaler.
scal = sc


## Test Improvement (validation on a 20% hold-out split)

In [234]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping

# Fully connected regressor: two 50-unit ReLU layers, each followed by 50%
# dropout, and a single linear output unit.
model = Sequential([
    Dense(50, input_dim=x_train.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(1),
])

# Mean squared logarithmic error, optimised with Adam.
model.compile(optimizer=Adam(lr=0.1, decay=0.0001),
              loss='mean_squared_logarithmic_error')

# Callbacks: save the weights whenever the checkpoint's metric improves, shrink
# the learning rate when val_loss plateaus, and stop after 50 stagnant epochs.
checkp = ModelCheckpoint(filepath='weights.hdf5', verbose=1, save_best_only=True)
lrred = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10,
                          cooldown=2, min_lr=0.000001)
stp = EarlyStopping(monitor='val_loss', min_delta=0, patience=50)
cbs = [checkp, lrred, stp]

# Train on the scaled training split and evaluate on the hold-out split each epoch.
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=400,
          epochs=400,
          callbacks=cbs)

Train on 3567 samples, validate on 892 samples
Epoch 1/400
 400/3567 [==>...........................] - ETA: 4s - loss: 209.4451Epoch 00000: val_loss improved from inf to 33.50230, saving model to weights.hdf5
Epoch 2/400
 400/3567 [==>...........................] - ETA: 0s - loss: 35.2789Epoch 00001: val_loss improved from 33.50230 to 20.73575, saving model to weights.hdf5
Epoch 3/400
 400/3567 [==>...........................] - ETA: 0s - loss: 21.3350Epoch 00002: val_loss improved from 20.73575 to 15.96052, saving model to weights.hdf5
Epoch 4/400
 400/3567 [==>...........................] - ETA: 0s - loss: 16.9572Epoch 00003: val_loss improved from 15.96052 to 13.47465, saving model to weights.hdf5
Epoch 5/400
 400/3567 [==>...........................] - ETA: 0s - loss: 14.6641Epoch 00004: val_loss improved from 13.47465 to 11.90148, saving model to weights.hdf5
Epoch 6/400
 400/3567 [==>...........................] - ETA: 0s - loss: 12.0944Epoch 00005: val_loss improved from 11.901

<keras.callbacks.History at 0x7f2e6e7999b0>

## Actual Training (refit on the full training set)

In [240]:
# Refit on every training row (no hold-out) and prepare the matching test features.
x_train=train[train.columns[2:]]
y_train=train['target']

# Select the same feature columns, in the same order, from the test table.
x_test=test[x_train.columns]

# Fit the scaler on the training features only and reuse it for the test features.
scal = StandardScaler()
x_train = scal.fit_transform(x_train)
x_test = scal.transform(x_test)


# Same architecture as the validated model: two 50-unit ReLU layers with 50%
# dropout after each, and one linear output unit.
model = Sequential()
model.add(Dense(50, input_dim=x_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1))

# Use the same optimiser settings as the validation run (including the
# learning-rate decay) so the final model matches the configuration that was
# actually evaluated on the hold-out split.
model.compile(loss='mean_squared_logarithmic_error',
              optimizer=Adam(lr=0.1,decay=0.0001))

# No validation data is passed here, so the learning-rate schedule and early
# stopping monitor the training loss instead of val_loss.
checkp = ModelCheckpoint(filepath='weights_final.hdf5')
lrred = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=10, cooldown=2, min_lr=0.000001)
stp = EarlyStopping(monitor='loss', min_delta=0, patience=50)
cbs = [checkp,lrred,stp]
model.fit(x_train, y_train,
        epochs=400,
        batch_size=400,
        callbacks=cbs)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

<keras.callbacks.History at 0x7f2fa7e91c88>

In [249]:
# Predict on the scaled test features and write the submission file.
raw_pred = model.predict(x_test, verbose=1).flatten()
# The output layer is linear and therefore unbounded, but the model was trained
# with a logarithmic loss, under which negative targets are meaningless.
# Floor the predictions at zero before writing them out.
predictions=pd.DataFrame({'ID':test['ID'],'target':np.clip(raw_pred, 0, None)})
print(predictions.head())
predictions.to_csv('pred.csv',index=False)

0  000137c73  1266456.500
1  00021489f  1136502.500
2  0004d7953  1725045.875
3  00056a333  8244792.500
4  00056d8eb  1136502.500
