In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, HuberRegressor, TheilSenRegressor, RANSACRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.ensemble import BaggingRegressor
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor 
import xgboost as xg
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

The minimum supported version is 2.4.6



In [2]:
# Load the train and test CSVs from the working directory.
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Report the size of each frame, then preview the training frame and its columns.
for split_name, split_frame in (('Training set', data), ('Testing set', test_data)):
    print (split_name, split_frame.shape)
print (data.head(5))
print (data.columns)

# Stack train on top of test so later preprocessing is applied to both at once.
# No ignore_index is passed, so each frame keeps its own row labels.
all_data = pd.concat([data, test_data])

# Per-column "has any missing value" flags and missing-value counts.
missing_mask = all_data.isnull()
print (missing_mask.any(), missing_mask.sum())

('Training set', (9366, 18))
('Testing set', (4801, 17))
  portfolio_id      desk_id    office_id pf_category  start_date         sold  \
0   PF00001002  DSK00001001  OFF00001002           B    20040720  110000000.0   
1   PF00001003  DSK00001002  OFF00001001           A    20040709  176671000.0   
2   PF00001005  DSK00001004  OFF00001001           A    20040723   56474000.0   
3   PF00001006  DSK00001005  OFF00001001           A    20040609  164813000.0   
4   PF00001007  DSK00001005  OFF00001002           B    20040609  140800000.0   

  country_code  euribor_rate currency  libor_rate        bought  \
0            T       0.02074      USD    2.332216  1.098097e+08   
1            N       0.02074      GBP    5.269617  1.760084e+08   
2            T       0.02074      USD    2.332216  5.637953e+07   
3            T       0.02074      USD    2.332216  1.645088e+08   
4            T       0.02074      USD    2.332216  1.405402e+08   

   creation_date indicator_code  sell_date type hedge

In [3]:
# Discard rows where 'sold' or 'bought' is missing.
# NOTE(review): this runs on the combined train+test frame, so a test row with a
# missing value would be removed as well -- confirm none are.
all_data = all_data.dropna(subset=['sold', 'bought'])

# Fill missing 'libor_rate' values with the mean of the remaining rows.
libor_mean = all_data['libor_rate'].mean()
all_data['libor_rate'] = all_data['libor_rate'].fillna(libor_mean)

# Show which columns still contain missing values.
print (all_data.isnull().any())

bought            False
country_code      False
creation_date     False
currency          False
desk_id            True
euribor_rate      False
hedge_value        True
indicator_code     True
libor_rate        False
office_id         False
pf_category       False
portfolio_id      False
return             True
sell_date         False
sold              False
start_date        False
status             True
type              False
dtype: bool


In [4]:
# Target column, taken from the combined frame before it is removed from the features.
train_labels = all_data['return']

# Columns excluded from the feature matrix (this list includes the target itself).
drop_features = ['desk_id', 'indicator_code', 'return', 'hedge_value', 'status']
big_data = all_data.drop(labels=drop_features, axis='columns')

print (big_data.shape, train_labels.shape)

((14165, 13), (14165,))


In [5]:
# Integer-encode every column of the combined feature frame. The encoder is
# refit on each column, so the resulting codes are independent per column.
# NOTE(review): this also replaces numeric columns by the rank of their value.
le = LabelEncoder()
for c in big_data.columns:
    big_data[c] = le.fit_transform(big_data[c])
print (big_data.head())

# The combined frame was built train-first, test-second, so the number of
# training rows is whatever is left after setting aside the test rows.
# Deriving it (instead of hard-coding 9364) keeps the split correct if the
# input files change; it assumes no test rows were dropped during cleaning.
n_train = len(big_data) - len(test_data)
train_data = big_data[:n_train]
train_labels = train_labels[:n_train]
test_x = big_data[n_train:]

# Preview only: printing the whole test frame floods the cell output.
print (test_x.head())
print (train_data.shape, train_labels.shape, test_x.shape)

   bought  country_code  creation_date  currency  euribor_rate  libor_rate  \
0    9126             2              0         4            97         383   
1   10999             1              1         2            97         461   
2    6411             2              1         4            97         383   
3   10712             2              1         4            97         383   
4   10285             2              1         4            97         383   

   office_id  pf_category  portfolio_id  sell_date  sold  start_date  type  
0          1            1             1          7  3868          13     1  
1          0            0             2          7  4783           8     2  
2          0            0             4         10  2466          14     0  
3          0            0             5          1  4664           4     0  
4          1            1             6          1  4395           4     1  
      bought  country_code  creation_date  currency  euribor_rate  li

In [6]:
# Hold out 25% of the labelled rows for validation. The split is seeded with the
# same value (45) used for the KFold splitter and seeded estimators elsewhere in
# this notebook, so re-running the cell reproduces the same partition.
train_x, val_x, train_y, val_y = train_test_split(
    train_data, train_labels, test_size=0.25, random_state=45)
print ('training shape:', train_x.shape, train_y.shape)
print ('validation shape:', val_x.shape, val_y.shape)

('training shape:', (7023, 13), (7023,))
('validation shape:', (2341, 13), (2341,))


In [7]:
def cv_model(clf, X=None, y=None, n_splits=5):
    """Return the mean cross-validated R^2 of an estimator.

    Parameters
    ----------
    clf : estimator
        Unfitted estimator passed to ``cross_val_score``.
    X, y : optional
        Features and targets to evaluate on. When omitted, the notebook-level
        ``train_x`` / ``train_y`` are used, which is what existing callers rely on.
    n_splits : int, optional
        Number of shuffled folds (default 5).

    Returns
    -------
    The mean of the per-fold R^2 scores.
    """
    # Fall back to the notebook globals only when no data was supplied.
    if X is None:
        X = train_x
    if y is None:
        y = train_y
    # Fixed random_state so every model is scored on identical folds.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=45)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='r2')
    return scores.mean()

In [None]:
# Random forest baseline. Seeded (45, as for the other seeded models in this
# notebook) so the reported scores are reproducible between runs.
clf = RandomForestRegressor(n_estimators=150, random_state=45)
# cv_model returns the mean R^2 over its folds; the printed label is kept as-is.
scores = cv_model(clf)
print ('Training score:', scores.mean())
# Refit on the full training split and score on the held-out validation split.
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [None]:
# Shallow extra-trees ensemble. Seeded (45) so the randomised splits, and
# therefore the reported scores, are reproducible between runs.
clf = ExtraTreesRegressor(n_estimators=150, max_depth=5, random_state=45)
# cv_model returns the mean R^2 over its folds; the printed label is kept as-is.
scores = cv_model(clf)
print ('Training score:', scores.mean())
# Refit on the full training split and score on the held-out validation split.
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [None]:
# Ordinary least-squares baseline.
clf = LinearRegression()

# Mean R^2 over the folds built inside cv_model.
scores = cv_model(clf)
print ('Training score:', scores.mean())

# fit() hands back the estimator, so the hold-out R^2 is read straight off it.
holdout_r2 = clf.fit(train_x, train_y).score(val_x, val_y)
print ('Validation score:', holdout_r2)

In [None]:
# L2-regularised linear model with regularisation strength alpha=1.0.
clf = Ridge(alpha=1.0)

# Mean R^2 over the folds built inside cv_model.
scores = cv_model(clf)
print ('Training score:', scores.mean())

# Fit on the training split, then evaluate R^2 on the validation split.
holdout_r2 = clf.fit(train_x, train_y).score(val_x, val_y)
print ('Validation score:', holdout_r2)

In [None]:
# Kernel ridge regression with alpha=0.1; the kernel is left at the library default.
clf = KernelRidge(alpha=0.1)

# Mean R^2 over the folds built inside cv_model.
scores = cv_model(clf)
print ('Training score:', scores.mean())

# Fit on the training split, then evaluate R^2 on the validation split.
holdout_r2 = clf.fit(train_x, train_y).score(val_x, val_y)
print ('Validation score:', holdout_r2)

In [None]:
# Bagging ensemble with library-default settings. Seeded (45) so the bootstrap
# samples, and therefore the reported scores, are reproducible between runs.
clf = BaggingRegressor(random_state=45)
# cv_model returns the mean R^2 over its folds; the printed label is kept as-is.
scores = cv_model(clf)
print ('Training score:', scores.mean())
# Refit on the full training split and score on the held-out validation split.
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [None]:
# L1-regularised linear model with a very small penalty (alpha=1e-4).
clf = Lasso(alpha=1e-4)

# Mean R^2 over the folds built inside cv_model.
scores = cv_model(clf)
print ('Training score:', scores.mean())

# Fit on the training split, then evaluate R^2 on the validation split.
holdout_r2 = clf.fit(train_x, train_y).score(val_x, val_y)
print ('Validation score:', holdout_r2)

In [None]:
# Huber regressor with library-default settings.
clf = HuberRegressor()

# Mean R^2 over the folds built inside cv_model.
scores = cv_model(clf)
print ('Training score:', scores.mean())

# Fit on the training split, then evaluate R^2 on the validation split.
holdout_r2 = clf.fit(train_x, train_y).score(val_x, val_y)
print ('Validation score:', holdout_r2)

In [None]:
# AdaBoost with library-default settings. Seeded (45) so the reported scores
# are reproducible between runs.
clf = AdaBoostRegressor(random_state=45)
# cv_model returns the mean R^2 over its folds; the printed label is kept as-is.
scores = cv_model(clf)
print ('Training score:', scores.mean())
# Refit on the full training split and score on the held-out validation split.
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [None]:
# Theil-Sen regressor, seeded with 45.
clf = TheilSenRegressor(random_state=45)

# Mean R^2 over the folds built inside cv_model.
scores = cv_model(clf)
print ('Training score:', scores.mean())

# Fit on the training split, then evaluate R^2 on the validation split.
holdout_r2 = clf.fit(train_x, train_y).score(val_x, val_y)
print ('Validation score:', holdout_r2)

In [None]:
# Gradient-boosted trees. Seeded with seed=45, the same keyword and value the
# stacking cell of this notebook uses for its XGBRegressor, so scores are reproducible.
clf = xg.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=150, seed=45)
# cv_model returns the mean R^2 over its folds; the printed label is kept as-is.
scores = cv_model(clf)
print ('Training score:', scores.mean())
# Refit on the full training split and score on the held-out validation split.
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

In [None]:
# clf = svm.SVR()
# scores = cv_model(clf)
# print ('Training score:', scores.mean())
# clf.fit(train_x, train_y)
# print ('Validation score:', clf.score(val_x, val_y))

In [None]:
# clf = svm.SVR(kernel='linear')
# scores = cv_model(clf)
# print ('Training score:', scores.mean())
# clf.fit(train_x, train_y)
# print ('Validation score:', clf.score(val_x, val_y))

In [None]:
# Same shuffled, seeded 5-fold splitter that cv_model uses.
cv = KFold(n_splits=5, shuffle=True, random_state=45)

# Hyper-parameter grid explored exhaustively (3 * 2 * 2 * 2 * 1 = 24 candidates).
parameters = {'max_depth': [3, 5, 10],
              'learning_rate' : [0.1, 0.001],
              'n_estimators' : [150, 300],
              'gamma' : [1, 3],
              'reg_lambda': [0.01,]}

# Seeded so the search result is reproducible between runs.
clf = xg.XGBRegressor(seed=45)
grid_obj = GridSearchCV(clf, parameters, cv=cv, scoring='r2', n_jobs=4, verbose = 5)
grid_fit = grid_obj.fit(train_x, train_y)

# GridSearchCV is left at its default refit behaviour, so best_estimator_ has
# already been retrained on all of train_x / train_y; fitting it again would
# only repeat that work.
best_clf = grid_fit.best_estimator_
print ('Best params:', grid_fit.best_params_)
print ('Best CV score:', grid_fit.best_score_)

# Display the selected estimator as the cell output.
best_clf

In [16]:
# The stack is built from regressors, so the regressor variant is required:
# StackingClassifier does not accept `regressors` / `meta_regressor` arguments.
from mlxtend.regressor import StackingRegressor

# Candidate base learners, all seeded with 45 for reproducible scores.
clf1 = ExtraTreesRegressor(random_state = 45, n_estimators = 150, max_depth= 5)
clf2 = RandomForestRegressor(random_state = 45, n_estimators = 150)
clf3 = xg.XGBRegressor(seed = 45, learning_rate = 0.1, n_estimators = 150, max_depth = 5)
clf4 = BaggingRegressor(random_state = 45)
print ('ExtraTree:', cv_model(clf1))
print ('RF:', cv_model(clf2))
print ('XGB:', cv_model(clf3))
print ('BaggingTree:', cv_model(clf4))

# Stack bagging + XGBoost as first-level regressors, with the random forest as
# the meta-regressor that combines their predictions.
model = StackingRegressor(regressors=[clf4, clf3], meta_regressor=clf2, verbose=1)
print ('Stack:', cv_model(model))
model.fit(train_x, train_y)

# Final prediction score on the held-out validation split, then test predictions.
print('Final r2 score: [%.8f]' % model.score(val_x, val_y))
pred_test_y = model.predict(test_x)

('ExtraTree:', 0.83382630289168791)
('RF:', 0.81757050850219104)
('XGB:', 0.82378593844255921)
('BaggingTree:', 0.81307632449198197)
Fitting 2 regressors...
Fitting regressor1: baggingregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: baggingregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: baggingregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: baggingregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Fitting 2 regressors...
Fitting regressor1: baggingregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
('Stack:', 0.79187538378318023)
Fitting 2 regressors...
Fitting regressor1: baggingregressor (1/2)
Fitting regressor2: xgbregressor (2/2)
Final r2 score: [0.83885254]


In [None]:
# Plain NumPy copies of the splits for the neural-network cells.
# `.values` is used instead of the deprecated `as_matrix()`, which newer pandas
# releases no longer provide.
train_X = train_x.values
train_Y = train_y.values
val_X = val_x.values
val_Y = val_y.values

# test_x is overwritten in place (later cells read this name), so only convert
# it while it is still a DataFrame; this keeps the cell safe to run twice.
if isinstance(test_x, pd.DataFrame):
    test_x = test_x.values

print (train_X.shape, train_Y.shape)
print (val_X.shape, val_Y.shape)

In [None]:
# Single hidden layer of 100 units with identity activation, trained in
# mini-batches of 16. Seeded (45, as for the other seeded models in this
# notebook) so weight initialisation and batch shuffling are reproducible.
nn = MLPRegressor(hidden_layer_sizes=(100, ), activation='identity', learning_rate='adaptive',
                  batch_size=16, random_state=45)

nn.fit(train_X, train_Y)
print ('Val r2 score:', nn.score(val_X, val_Y))

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD, Adam
from keras.metrics import mae
from keras.wrappers.scikit_learn import KerasRegressor
# R^2 written with Keras backend ops so it can be passed as a training metric.
def r2_keras(y_true, y_pred):
    """Return 1 - SS_res / (SS_tot + epsilon) computed over the given tensors."""
    from keras import backend as K
    # Sum of squared residuals between targets and predictions.
    residual_ss = K.sum(K.square(y_true - y_pred))
    # Sum of squared deviations of the targets from their mean.
    centered = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(centered))
    # K.epsilon() is added to the denominator, guarding the division when total_ss is zero.
    ratio = residual_ss / (total_ss + K.epsilon())
    return 1 - ratio

def base_model():
    """Build and compile the feed-forward network handed to KerasRegressor.

    Layout: 13 inputs -> Dense(13, relu) -> Dense(8, relu) -> Dropout(0.5)
    -> Dense(3, relu) -> Dense(1, linear). Compiled with MSE loss, Adam at a
    learning rate of 1e-4, and r2_keras as the reported metric.
    """
    layers = [
        Dense(13, input_dim=13, activation='relu'),
        Dense(8, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='relu'),
        Dense(1, activation='linear'),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer=Adam(1e-4), metrics=[r2_keras])
    return network

# Wrap the Keras network in the scikit-learn style regressor interface.
fit_settings = dict(epochs=100, batch_size=16, verbose=True)
estimator = KerasRegressor(build_fn=base_model, **fit_settings)

# Cross-validation is currently disabled; the lines are kept for reference.
#kfold = KFold(n_splits=5, random_state=45)
#results = cross_val_score(estimator, train_X, train_Y, cv=kfold, scoring='r2')
#print ('\nTraining score:', results.mean())

# Train on the NumPy training split and score R^2 on the validation split.
estimator.fit(train_X, train_Y)
pred_Y = estimator.predict(val_X)
val_r2 = r2_score(val_Y, pred_Y)
print ('\nValidation score:', val_r2)

# Predictions for the test rows, picked up by the submission cell.
pred_test_y = estimator.predict(test_x)

In [None]:
import tflearn
import tensorflow as tf

# The regression layer is fed column-vector targets, so reshape to (n, 1).
train_Y_new = train_Y.reshape(-1,1)
val_Y_new = val_Y.reshape(-1,1)

tf.reset_default_graph()
r2 = tflearn.R2()
# The input layer describes one batch: an unspecified batch size and one entry
# per feature. Passing train_X.shape would wrongly bake the number of training
# rows into the input shape.
net = tflearn.input_data(shape=[None, train_X.shape[1]])
# Three fully connected layers, all with linear activation.
net = tflearn.fully_connected(net, 14, activation='linear')
net = tflearn.fully_connected(net, 10, activation='linear')
net = tflearn.fully_connected(net, 1, activation='linear')
sgd = tflearn.SGD(learning_rate=0.1, lr_decay=0.01, decay_step=100)
net = tflearn.regression(net, optimizer=sgd,loss='mean_square', metric=r2)
# NOTE(review): this rebinds `model`, which an earlier cell uses for the stacked ensemble.
model = tflearn.DNN(net)

model.fit(train_X, train_Y_new, show_metric=True, validation_set=(val_X, val_Y_new), shuffle=True, n_epoch=50)

In [None]:
# Gradient-boosted trees used for the submission. Seeded with seed=45, matching
# the XGBRegressor in the stacking cell, so predictions are reproducible.
clf = xg.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=150, seed=45)
# cv_model returns the mean R^2 over its folds; the printed label is kept as-is.
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))

# An earlier cell may already have replaced test_x by a bare NumPy array, while
# the model was trained on a DataFrame with named columns; xgboost can reject
# that mismatch. Re-wrapping with the training column names works for either form.
test_features = pd.DataFrame(test_x, columns=train_x.columns)
pred_test_y = clf.predict(test_features)

In [None]:
# Pair each test portfolio id with its predicted return and write the
# two-column submission file (no index column).
submission_columns = ['portfolio_id', 'return']
sub = pd.DataFrame({
    'portfolio_id': test_data['portfolio_id'],
    'return': pred_test_y,
})
sub.to_csv('submit.csv', columns=submission_columns, index=False)

In [12]:
from sklearn.feature_selection import SelectFromModel

# Robust linear regressors kept for reference; nothing in this cell uses the list.
estimators = [('OLS', LinearRegression()),
              ('Theil-Sen', TheilSenRegressor(random_state=45)),
              ('RANSAC', RANSACRegressor(random_state=45)),
              ('HuberRegressor', HuberRegressor())]

# Every step of a pipeline except the last must be a transformer. A bare
# RandomForestRegressor only worked there through the estimator-level
# `.transform` that old scikit-learn releases provided and later removed.
# SelectFromModel is the explicit form of that step: it keeps the features the
# forest rates as important and passes them on to the bagging regressor.
selector = SelectFromModel(
    RandomForestRegressor(n_estimators=150, max_depth=5, random_state=45))
clf = make_pipeline(selector, BaggingRegressor(random_state=45))

# cv_model returns the mean R^2 over its folds; the printed label is kept as-is.
scores = cv_model(clf)
print ('Training score:', scores.mean())
clf.fit(train_x, train_y)
print ('Validation score:', clf.score(val_x, val_y))
pred_test_y = clf.predict(test_x)



('Training score:', 0.78423188615206296)




('Validation score:', 0.9404296210349441)


