In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

def drop_vars(df):
    tmp=df.shape[1]
    df = df[df.columns[[True]+list((df.var()!=0))]]
    print('0 var:',tmp-df.shape[1])
    
    corr_matrix = df[df.columns[2:]].corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    tmp=df.shape[1]
    df=df.drop(to_drop, axis=1)
    print('Corr>0.95:',tmp-df.shape[1],'Now:',)
    
    corrs = dict()
    for i in range(df.shape[1]-2):
        corrs[df.columns[2+i]] = np.corrcoef(df['target'],df[df.columns[2+i]])[0,1]
    s = [k for k in corrs if abs(corrs[k])<0.1]
    tmp=df.shape[1]
    df=df.drop(s, axis=1)
    print('Corr Target <0.1:',tmp-df.shape[1])
    
    return df
train = drop_vars(train)

0 var: 256
Corr>0.95: 139 Now:
Corr Target <0.1: 4206


## Test Improvement

In [5]:
X=train[train.columns[2:]]
y=np.log1p(train['target'])

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=203)

sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

def root_mean_squared_error(y_true, y_pred):
        return abs(np.sqrt(np.mean((y_pred - y_true)**2))) 

clf1 = GradientBoostingRegressor()
clf1.fit(x_train, y_train)
clf2 = ElasticNet(random_state=23,alpha=5)
clf2.fit(x_train, y_train)
scal=StandardScaler()

y_true,y_pred = y_test,clf1.predict(x_test) 
print(root_mean_squared_error(y_pred, y_true))

y_true,y_pred = y_test,clf2.predict(x_test) 
print(root_mean_squared_error(y_pred, y_true))

x_train = np.c_[x_train,scal.fit_transform(clf1.predict(x_train).flatten().reshape(-1,1)).flatten(),scal.fit_transform(clf2.predict(x_train).flatten().reshape(-1,1)).flatten()]
x_test = np.c_[x_test,scal.transform(clf1.predict(x_test).flatten().reshape(-1,1)).flatten(),scal.transform(clf2.predict(x_test).flatten().reshape(-1,1)).flatten()]

1.4813580179455834
1.7418805964740947


In [14]:
X=train[train.columns[2:]]
y=np.log1p(train['target'])

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=203)

sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 

clf1 = GradientBoostingRegressor()
clf1.fit(x_train, y_train)
clf2 = ElasticNet(random_state=23,alpha=5)
clf2.fit(x_train, y_train)

scal=StandardScaler()
x_train = np.c_[x_train,scal.fit_transform(clf1.predict(x_train).flatten().reshape(-1,1)).flatten(),scal.fit_transform(clf2.predict(x_train).flatten().reshape(-1,1)).flatten()]
x_test = np.c_[x_test,scal.transform(clf1.predict(x_test).flatten().reshape(-1,1)).flatten(),scal.transform(clf2.predict(x_test).flatten().reshape(-1,1)).flatten()]

model = Sequential()
model.add(Dense(50, input_dim=x_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1))

model.compile(loss=root_mean_squared_error,
              optimizer=Adam(lr=0.1,decay=0.0001))


checkp = ModelCheckpoint(filepath='weights.hdf5')
lrred = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, cooldown=2, min_lr=0.000001)
stp = EarlyStopping(monitor='val_loss', min_delta=0, patience=50)
cbs = [checkp,lrred,stp]
model.fit(x_train, y_train,
        epochs=1000,
        batch_size=400,
        validation_data=(x_test, y_test),
        callbacks=cbs)

Train on 3567 samples, validate on 892 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1

Epoch 77/1000
Epoch 78/1000
Epoch 79/1000


<keras.callbacks.History at 0xfa15e48>

## NN+ElNet - Actual

In [None]:
x_train=train[train.columns[2:]]
y_train=np.log1p(train['target'])

x_test=test[x_train.columns]

sc=StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

def root_mean_squared_error(y_true, y_pred):
        return abs(K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)))

clf = GradientBoostingRegressor()
clf.fit(x_train, y_train)
scal=StandardScaler()
x_train = np.c_[x_train,scal.fit_transform(clf.predict(x_train).flatten().reshape(-1,1)).flatten()]
x_test = np.c_[x_test,scal.transform(clf.predict(x_test).flatten().reshape(-1,1)).flatten()]

clf = ElasticNet(random_state=23,alpha=5)
clf.fit(x_train, y_train)
scal=StandardScaler()
x_train = np.c_[x_train,scal.fit_transform(clf.predict(x_train).flatten().reshape(-1,1)).flatten()]
x_test = np.c_[x_test,scal.transform(clf.predict(x_test).flatten().reshape(-1,1)).flatten()]

model = Sequential()
model.add(Dense(50, input_dim=x_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1))

model.compile(loss=root_mean_squared_error,
              optimizer=Adam(lr=0.1,decay=0.0001))


checkp = ModelCheckpoint(filepath='weights.hdf5')
lrred = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=10, cooldown=2, min_lr=0.000001)
stp = EarlyStopping(monitor='loss', min_delta=0, patience=50)
cbs = [checkp,lrred,stp]
model.fit(x_train, y_train,
        epochs=1000,
        batch_size=400,
        callbacks=cbs)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
 400/4459 [=>............................] - ETA: 0s - loss: 1.2355

In [27]:
predictions=pd.DataFrame({'ID':test['ID'],'target':np.expm1(model.predict(x_test, verbose=1).flatten())})
print(predictions.head())
predictions.to_csv('pred_boost.csv',index=False)

          ID      target
0  000137c73  1752246.00
1  00021489f  2149595.00
2  0004d7953  2535024.25
3  00056a333  2129491.00
4  00056d8eb  2149595.00
