In [38]:
import pandas as pd
import os
from timeit import timeit
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor 
import xgboost as xg
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

In [39]:
data = pd.read_csv('train.csv')
print ('Training set', data.shape)
print (data.head(3))
print (data.columns)

('Training set', (15780, 26))
   Observation         T1    RH_1    T2       RH_2         T3       RH_3  \
0         1111  22.700000  37.200  21.0  38.000000  23.390000  37.290000   
1         1112  21.500000  41.045  20.5  39.133333  22.926667  39.526667   
2         1113  21.666667  38.000  22.6  35.700000  21.890000  36.590000   

          T4       RH_4         T5   ...         RH_8     T9       RH_9  \
0  22.832857  34.942857  20.500000   ...    45.360000  20.20  38.663333   
1  21.700000  34.126667  18.633333   ...    34.663333  19.73  37.933333   
2  22.000000  35.530000  19.000000   ...    38.545000  19.79  39.430000   

   T_out  Press_mm_hg     RH_out  Windspeed  Visibility  Tdewpoint  Energy  
0  12.80   760.050000  62.000000        3.5        28.0       5.65      70  
1   8.47   764.166667  48.166667        8.0        26.5      -1.92     210  
2  10.60   757.600000  57.000000        2.0        27.0       2.40      50  

[3 rows x 26 columns]
Index([u'Observation', u'T1', u'R

In [40]:
test_data = pd.read_csv('test.csv')
print ('Testing set', test_data.shape)
print (test_data.head(3))

('Testing set', (3945, 25))
   Observation    T1   RH_1     T2       RH_2         T3       RH_3     T4  \
0        50001  21.1  39.90  19.29  41.260000  21.500000  40.700000  18.89   
1        50002  22.6  39.03  20.79  40.463333  22.290000  38.290000  20.29   
2        50003  21.0  35.59  19.79  34.900000  21.166667  35.833333  20.39   

    RH_4         T5    ...         T8       RH_8     T9   RH_9  T_out  \
0  41.20  18.088889    ...      21.60  47.090000  18.10  44.90   2.70   
1  36.70  20.760000    ...      23.76  39.266667  19.39  37.50   7.27   
2  33.09  18.000000    ...      22.60  34.126667  18.20  39.79   6.05   

   Press_mm_hg     RH_out  Windspeed  Visibility  Tdewpoint  
0   733.633333  98.166667        2.0   34.833333       2.43  
1   756.666667  82.000000        2.0   40.000000       4.40  
2   751.250000  50.000000        6.5   34.500000      -3.70  

[3 rows x 25 columns]


In [41]:
all_data = pd.concat([data, test_data])
print (all_data.isnull().any(), all_data.isnull().sum())
print (all_data.describe())
print (all_data.dtypes)

(Energy          True
Observation    False
Press_mm_hg    False
RH_1           False
RH_2           False
RH_3           False
RH_4           False
RH_5           False
RH_6           False
RH_7           False
RH_8           False
RH_9           False
RH_out         False
T1             False
T2             False
T3             False
T4             False
T5             False
T6             False
T7             False
T8             False
T9             False
T_out          False
Tdewpoint      False
Visibility     False
Windspeed      False
dtype: bool, Energy         3945
Observation       0
Press_mm_hg       0
RH_1              0
RH_2              0
RH_3              0
RH_4              0
RH_5              0
RH_6              0
RH_7              0
RH_8              0
RH_9              0
RH_out            0
T1                0
T2                0
T3                0
T4                0
T5                0
T6                0
T7                0
T8                0
T9                0


In [42]:
drop_features = ['Energy']
big_data = all_data.drop(drop_features, axis=1)

train_labels = all_data['Energy']
print ('Total dataset', big_data.shape) 
print ('Train labels:', train_labels.shape)

('Total dataset', (19725, 25))
('Train labels:', (19725,))


In [43]:
le = LabelEncoder()
sc = StandardScaler()
for i, c in enumerate(big_data.columns):
    if big_data.dtypes[i] == 'object':
        big_data[c] = le.fit_transform(big_data[c])
    elif big_data.dtypes[i] in ['int', 'float'] and c!='Observation':
        big_data[c] = sc.fit_transform(big_data[c])

print (big_data.head(3))

train_data = big_data[:len(data)]
train_labels = train_labels[:len(data)]
test_x = big_data[len(data):]

print ('Training shape:', train_data.shape, train_labels.shape, 'Testing shape:', test_x.shape)

   Observation  Press_mm_hg      RH_1      RH_2      RH_3      RH_4      RH_5  \
0         1111     0.612152 -0.769109 -0.595156 -0.600044 -0.940848 -0.900566   
1         1112     1.168521  0.197128 -0.316672  0.087149 -1.128841 -0.697001   
2         1113     0.281033 -0.568071 -1.160315 -0.815112 -0.805611 -0.526686   

       RH_6      RH_7      RH_8    ...            T4        T5        T6  \
0 -1.443907  0.071277  0.463657    ...      0.967782  0.491925  0.758851   
1 -1.370278 -2.005361 -1.583628    ...      0.413191 -0.520025  0.374055   
2 -1.061849 -0.337507 -0.840698    ...      0.560056 -0.321249  0.401971   

         T7        T8        T9     T_out  Tdewpoint  Visibility  Windspeed  
0  0.849539  0.342790  0.354194  1.012815   0.449973   -0.875608  -0.220285  
1  0.247603  0.458671  0.120888  0.198610  -1.354501   -1.002805   1.615596  
2 -0.316416 -0.071313  0.150672  0.599131  -0.324735   -0.960406  -0.832245  

[3 rows x 25 columns]
('Training shape:', (15780, 25), (1



In [44]:
train_x, val_x, train_y, val_y = train_test_split(train_data, train_labels, test_size=0.25)
print ('training shape:', train_x.shape, train_y.shape)
print ('validation shape:', val_x.shape, val_y.shape)

('training shape:', (11835, 25), (11835,))
('validation shape:', (3945, 25), (3945,))


In [45]:
def cv_model(clf):     
    cv = KFold(n_splits=5,shuffle=True,random_state=45)
    scores = cross_val_score(clf, train_x, train_y, cv=cv, scoring='r2')
    return scores.mean()

In [33]:
%timeit
clf = RandomForestRegressor(n_estimators=300, verbose=1)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.5s finished


('Training score:', 0.46688108657756544)


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:  1.9min finished


('Validation score:', 0.50246146999076768)


[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    0.7s finished


In [46]:
%timeit
clf = ExtraTreesRegressor(n_estimators=500, verbose=1)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   43.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   41.4s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   42.7s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   45.6s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   38.6s finished
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished


('Training score:', 0.50469971750527698)


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   48.7s finished


('Validation score:', 0.58018719970351174)


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    1.2s finished


In [0]:
clf = LinearRegression()
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

('Training score:', 0.13695990185421228)
('Validation score:', 0.14583264321656797)


In [0]:
clf = Ridge(alpha=1.0)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

('Training score:', 0.13696158649486995)
('Validation score:', 0.14582774247790364)


In [0]:
clf = KernelRidge(alpha=0.1)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [58]:
clf = BaggingRegressor(n_estimators=300, verbose=1)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.6s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   49.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   45.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   47.5s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.7s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


('Training score:', 0.45880050854895255)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min finished


('Validation score:', 0.51415876922779058)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


In [None]:
clf = Lasso(alpha=1e-4)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [0]:
clf = HuberRegressor()
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [57]:
clf = AdaBoostRegressor(n_estimators=500)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

('Training score:', -0.38805062644799138)
('Validation score:', -0.33710393422641416)


In [0]:
clf = TheilSenRegressor(random_state=45)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [0]:
clf = xg.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=300)
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [0]:
# clf = svm.SVR()
# scores = cv_model(clf)
# print ('Training score:', scores.mean())
# clf.fit(train_x, train_y)
# print ('Validation score:', clf.score(val_x, val_y))

In [0]:
# clf = svm.SVR(kernel='linear')
# scores = cv_model(clf)
# print ('Training score:', scores.mean())
# clf.fit(train_x, train_y)
# print ('Validation score:', clf.score(val_x, val_y))

In [0]:
cv = KFold(n_splits=5,shuffle=True,random_state=45)

parameters = {'max_depth': [3, 5, 10],
              'learning_rate' : [0.1, 0.001],
              'n_estimators' : [150, 300],
              'gamma' : [1, 3],
              'reg_lambda': [0.01,]}

clf = xg.XGBRegressor()
grid_obj = GridSearchCV(clf, parameters, cv=cv, scoring='r2', n_jobs=4, verbose = 5)
grid_fit = grid_obj.fit(train_x, train_y)
best_clf = grid_fit.best_estimator_ 

best_clf.fit(train_x, train_y)

In [61]:
from mlxtend.regressor import StackingRegressor

clf1 = ExtraTreesRegressor(random_state = 45, n_estimators = 300)
clf2 = RandomForestRegressor(random_state = 45, n_estimators = 300)
clf3 = xg.XGBRegressor(seed = 45, learning_rate = 0.1, n_estimators = 300)
clf4 = BaggingRegressor(n_estimators=150)
# print ('ExtraTree:', cv_model(clf1))
# print ('RF:', cv_model(clf2))
# print ('XGB:', cv_model(clf3))
# print ('BaggingTree:', cv_model(clf4))
# Compute stacking features
model =  StackingRegressor(regressors=[clf1], meta_regressor=clf4, verbose=1)
print ('Stack:', cv_model(model))
model.fit(train_x, train_y)

# Final prediction score
print('Final r2 score: [%.8f]' % model.score(val_x, val_y))
pred_test_y = model.predict(test_x)

Fitting 1 regressors...
Fitting regressor1: extratreesregressor (1/1)
Fitting 1 regressors...
Fitting regressor1: extratreesregressor (1/1)
Fitting 1 regressors...
Fitting regressor1: extratreesregressor (1/1)
Fitting 1 regressors...
Fitting regressor1: extratreesregressor (1/1)
Fitting 1 regressors...
Fitting regressor1: extratreesregressor (1/1)
('Stack:', 0.50458041032950152)
Fitting 1 regressors...
Fitting regressor1: extratreesregressor (1/1)
Final r2 score: [0.58195818]


In [0]:
estimators = [('OLS', LinearRegression()),
              ('Theil-Sen', TheilSenRegressor(random_state=45)),
              ('RANSAC', RANSACRegressor(random_state=45)),
              ('HuberRegressor', HuberRegressor())]

clf = make_pipeline(RandomForestRegressor(n_estimators=150, max_depth=5), BaggingRegressor())
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))
pred_test_y = clf.predict(test_x)

In [48]:
train_X = train_x.as_matrix()
train_Y = train_y.as_matrix()
val_X = val_x.as_matrix()
val_Y = val_y.as_matrix()
test_x = test_x.as_matrix()
print (train_X.shape, train_Y.shape)
print (val_X.shape, val_Y.shape)

((11835, 25), (11835,))
((3945, 25), (3945,))


In [49]:
# nn = MLPRegressor(hidden_layer_sizes=(100, ), activation='identity', learning_rate='adaptive', batch_size=16)

# nn.fit(train_X, train_Y)
# print ('Val r2 score:', nn.score(val_X, val_Y))

In [50]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import SGD, Adam
from keras.metrics import mae
from keras.wrappers.scikit_learn import KerasRegressor

# custom R2-score metrics for keras backend
def r2_keras(y_true, y_pred):
    from keras import backend as K
    SS_res =  K.sum(K.square( y_true-y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

def base_model():
    
    model = Sequential()
    model.add(Dense(25, input_dim=25, activation='relu'))
    model.add(BatchNormalization())
#     model.add(Dense(8, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(3, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mae', optimizer='adam', metrics=[r2_keras])
    
    return model

estimator = KerasRegressor(build_fn=base_model, epochs=100, batch_size=64, verbose=True)
#kfold = KFold(n_splits=5, random_state=45)
#results = cross_val_score(estimator, train_X, train_Y, cv=kfold, scoring='r2')
#print ('\nTraining score:', results.mean())
estimator.fit(train_X, train_Y)
pred_Y = estimator.predict(val_X)
print ('Validation score:', metrics.r2_score(val_Y, pred_Y))
pred_test_y = estimator.predict(test_x)

Using TensorFlow backend.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

KeyboardInterrupt: 

In [0]:
import tflearn
import tensorflow as tf

train_Y_new = train_Y.reshape(-1,1)
val_Y_new = val_Y.reshape(-1,1)

tf.reset_default_graph()
r2 = tflearn.R2()
net = tflearn.input_data(shape=train_X.shape)
net = tflearn.fully_connected(net, 25, activation='linear')
net = tflearn.fully_connected(net, 10, activation='linear')
net = tflearn.fully_connected(net, 1, activation='linear')
sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.01, decay_step=100)
net = tflearn.regression(net, optimizer=sgd,loss='mean_absolute', metric=r2)
model = tflearn.DNN(net)

model.fit(train_X, train_Y_new, show_metric=True, validation_set=(val_X, val_Y_new), shuffle=True, n_epoch=50)

In [0]:
!ls

In [0]:
!cat sample_submission.csv | less

In [62]:
pred_test_y = clf.predict(test_x)
sub = pd.DataFrame({'Observation':test_data['Observation'], 'Energy':pred_test_y})
sub.to_csv('submit.csv', columns=['Observation', 'Energy'], index=False)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished


In [0]:
!cat submit.csv | less

Observation,Energy
50001,43.22
50002,89.26
50003,54.86
50004,70.56
50005,49.22
50006,72.4
50007,147.6
50008,44.9
50009,308.36
50010,43.48
50011,48.86
50012,74.64
50013,48.66
50014,98.34
50015,59.72
50016,109.62
50017,57.26
50018,464.22
50019,64.84
50020,90.34
50021,70.54
50022,107.3
[K50023,113.86
:[K

In [0]:
def regression_model():
  
  
  

In [11]:
print (big_data.columns)

Index([u'Observation', u'Press_mm_hg', u'RH_1', u'RH_2', u'RH_3', u'RH_4',
       u'RH_5', u'RH_6', u'RH_7', u'RH_8', u'RH_9', u'RH_out', u'T1', u'T2',
       u'T3', u'T4', u'T5', u'T6', u'T7', u'T8', u'T9', u'T_out', u'Tdewpoint',
       u'Visibility', u'Windspeed'],
      dtype='object')


In [18]:
big_data.Observation

0        1111
1        1112
2        1113
3        1114
4        1115
5        1116
6        1117
7        1118
8        1119
9        1120
10       1121
11       1122
12       1123
13       1124
14       1125
15       1126
16       1127
17       1128
18       1129
19       1130
20       1131
21       1132
22       1133
23       1134
24       1135
25       1136
26       1137
27       1138
28       1139
29       1140
        ...  
3915    53916
3916    53917
3917    53918
3918    53919
3919    53920
3920    53921
3921    53922
3922    53923
3923    53924
3924    53925
3925    53926
3926    53927
3927    53928
3928    53929
3929    53930
3930    53931
3931    53932
3932    53933
3933    53934
3934    53935
3935    53936
3936    53937
3937    53938
3938    53939
3939    53940
3940    53941
3941    53942
3942    53943
3943    53944
3944    53945
Name: Observation, Length: 19725, dtype: int64