In [1]:
import gc
import load as ld
import preprocessing as pp
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, metrics
import lightgbm as lgb
import training as tr
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Both paths are relative and end with a slash so that a file name can be
# appended by plain string concatenation.
# NOTE(review): SUBMISSIONS_DIR is not referenced in the visible cells --
# presumably the folder where submission files are written; confirm.
SUBMISSIONS_DIR = "submissions/"
# Folder holding the input data; used by the (commented-out) zip extraction cell.
DATADIR = "input/"

In [3]:
#import zipfile
#zip_ref = zipfile.ZipFile(DATADIR + "historical_transactions.csv.zip", 'r')
#zip_ref.extractall(DATADIR)
#zip_ref.close()

In [4]:
def get_tree_models():
    """Assemble the tree-based regressors to train, with their fit settings.

    Returns:
        list: ``(wrapper, fit_params)`` tuples, where ``wrapper`` is a model
        wrapper from the ``training`` module and ``fit_params`` is the dict of
        keyword arguments to pass to the fit call. Only the LightGBM pair is
        returned; the XGBoost wrapper is still constructed but not included.
    """
    # LightGBM hyper-parameters.
    # Previously tried and currently disabled: nthread=3, max_depth=10,
    # reg_alpha=0.041545473, reg_lambda=0.0735294, num_leaves=34.
    # NOTE(review): per the LightGBM docs, row subsampling is only applied when
    # a bagging frequency (subsample_freq) > 0 is also set -- confirm whether
    # `subsample` is actually taking effect here.
    lgb_params = {
        'n_estimators': 10000,
        'learning_rate': 0.005,
        'colsample_bytree': 0.75,
        'subsample': 0.8,
        'metric': 'rmse',
        'objective': 'regression',
    }

    # Fit-time options for LightGBM. 'valid_sets' starts out empty --
    # presumably filled in by the training code; verify in `training`.
    lgb_fit_params = {
        'verbose_eval': 100,
        'early_stopping_rounds': 200,
        'valid_sets': {},
        'valid_names': ["validation"],
    }

    # XGBoost hyper-parameters.
    # Previously tried and currently disabled: colsample_bytree=0.9497036,
    # subsample=0.8715623, max_depth=8, reg_alpha=0.041545473,
    # reg_lambda=0.0735294, min_child_weight=39.3259775.
    xgb_params = {
        "booster": "gbtree",
        "objective": "reg:linear",
        "learning_rate": 0.005,
        'eval_metric': 'rmse',
        'silent': 1,
    }

    # Fit-time options for XGBoost; only needed if the xgb pair is re-enabled.
    xgb_fit_params = {
        'verbose_eval': 100,
        'early_stopping_rounds': 200,
        'evals': {},
        'num_boost_round': 10000,
    }

    lgbm = tr.LightGBMRegressorWrapper(params = lgb_params, name = "lgbm")
    xgb = tr.XgbRegressorWrapper(params = xgb_params, name = "xgb")

    # To train XGBoost as well, add (xgb, xgb_fit_params) to the returned list.
    return [(lgbm, lgb_fit_params)]

In [5]:
def run_lgb(train_X, train_y, val_X, val_y, test_X, features = None, verbose = 50, early_stopping_rounds = 200):
    """Train a LightGBM regressor against a validation set and predict on test data.

    Args:
        train_X: Training features.
        train_y: Training target.
        val_X: Validation features, used for evaluation and early stopping.
        val_y: Validation target.
        test_X: Features to generate predictions for.
        features: Feature names passed to LightGBM as ``feature_name``. When
            ``None``, ``train_X.columns`` is used, so ``train_X`` must expose
            a ``columns`` attribute in that case.
        verbose: Forwarded to ``lgb.train`` as ``verbose_eval``.
        early_stopping_rounds: Forwarded to ``lgb.train``.

    Returns:
        tuple: ``(booster, predictions)`` where ``predictions`` are computed
        on ``test_X`` at the booster's best iteration.
    """
    lgb_params = {
            "objective" : "regression",
            "metric" : "rmse",
            "num_leaves" : 30,
            "min_child_weight" : 50,
            "learning_rate" : 0.01,
            "bagging_fraction" : 0.8,
            "feature_fraction" : 0.8,
            # The LightGBM key is `bagging_freq`; the previous spelling
            # `bagging_frequency` is not a recognised parameter, which left
            # bagging (and therefore `bagging_fraction`) switched off.
            "bagging_freq" : 5,
            "bagging_seed" : 2018,
            "num_iterations" : 2000
        }

    # Identity check: `== None` is evaluated element-wise for array-like
    # inputs (e.g. a pandas Index) and cannot be used as a truth value.
    if features is None:
        features = train_X.columns.tolist()

    lgb_train = lgb.Dataset(data = train_X, label = train_y, feature_name = features)
    lgb_val = lgb.Dataset(data = val_X, label = val_y, feature_name = features)

    lgb_booster = lgb.train(params = lgb_params, train_set = lgb_train, valid_sets = [lgb_val], valid_names = ["validation"],
            verbose_eval = verbose, early_stopping_rounds = early_stopping_rounds)

    # Predict with the iteration selected by early stopping rather than the last one.
    predictions = lgb_booster.predict(test_X, num_iteration = lgb_booster.best_iteration)

    return lgb_booster, predictions

In [6]:
def get_datasets(debug_size, silent, treat_duplicated = True):
    """Load the processed train/test frames and split them into model inputs.

    Args:
        debug_size: Forwarded to ``ld.get_processed_files``.
        silent: Forwarded to ``ld.get_processed_files``.
        treat_duplicated: Accepted but not used by this function.

    Returns:
        tuple: ``(train_X, train_y, test_X, ids)`` -- the training features,
        the ``target`` column of train, the same feature columns taken from
        test, and the ``card_id`` column of test.
    """
    train, test = ld.get_processed_files(debug_size, silent)

    # Columns of the train frame that must not be fed to the model.
    non_feature_columns = ('target', 'card_id', 'index', 'first_active_month')
    features = [column for column in train.columns if column not in non_feature_columns]

    target = train['target']
    train_features = train.loc[:, features]

    card_ids = test['card_id']
    test_features = test.loc[:, features]

    return train_features, target, test_features, card_ids

In [7]:
# NOTE(review): `early_stopping_rounds` and `verbose` are not consumed by the
# training cell below, which takes its fit settings from get_tree_models();
# they match run_lgb's keyword names -- confirm whether they are still needed.
early_stopping_rounds = 100
verbose = 10
# Both values are forwarded to the data loader via get_datasets().
silent = False
# The training cell only writes a submission file when debug_size is 0.
debug_size = 0

In [8]:
# Load the processed data once; `features` lists every column of the training
# feature frame and is used to select columns in the training cell.
train_X, train_y, test_X, ids = get_datasets(debug_size, silent)
features = train_X.columns.tolist()

Process train and test - Start!
Train shape: (201917, 16)
Test shape: (123623, 15)
Process train and test - Done in 2s

Process Historic Transactions - Start!
Memory usage of dataframe is 3109.54 MB
Memory usage after optimization is: 1749.11 MB
Decreased by 43.7%

Calculating Interval since first - Start!
Calculating Interval since first - Done in 25s

Concatenating - Start!
Concatenating - Done in 15s

Calculating Interval since last - Start!
Calculating Interval since last - Done in 15s

Calculating Counts - Start!
Calculating Counts - Done in 5s

Calculating NUNIQUE - Start!
Calculating NUNIQUE - Done in 104s

Calculating Engineered Features - Numerical - Start!
Calculating Engineered Features - Numerical - Done in 187s

Calculating Engineered Features - Categorical - Start!
Calculating Engineered Features - Categorical - Done in 25s

Historic Transactions shape: (325540, 63)
Process Historic Transactions - Done in 508s



In [40]:
if __name__ == "__main__":

    with pp.timer("Full Model Run"):

        print("Regressors will be fitted with {} out of {} features".format(len(features), train_X.shape[1]))

        # Each entry is a (model wrapper, fit keyword arguments) pair.
        for m, fp in get_tree_models():

            with pp.timer("Run " + m.name):

                # Wrap the model in a 5-fold, non-stratified out-of-fold regressor.
                model = tr.OOFRegressor(reg = m, nfolds = 5, stratified = False)

                model.fit(train_X.loc[:, features], train_y, **fp)
                pred = model.predict(test_X.loc[:, features])

                cv_score = model.rmse_score_
                # Kept after the loop so a later cell can display it; it is
                # overwritten on every iteration, so only the importances of
                # the last fitted model remain.
                feat_importance = model.importances_

                # Only write a submission file for full (non-debug) runs.
                if debug_size == 0:
                    submission = pp.submit_file(ids, pred, prefix_file_name = m.name, cv_score = cv_score)

                # Release the fitted model and predictions before the next model.
                del model, pred, cv_score
                gc.collect()

                print("*" * 80)

Full Model Run - Start!
Regrerssors will be fitted with 75 out of 75 features
Run lgbm - Start!
Training until validation scores don't improve for 200 rounds.
[100]	validation's rmse: 3.76867
[200]	validation's rmse: 3.74885
[300]	validation's rmse: 3.74075
[400]	validation's rmse: 3.73694
[500]	validation's rmse: 3.73434
[600]	validation's rmse: 3.733
[700]	validation's rmse: 3.73246
[800]	validation's rmse: 3.73148
[900]	validation's rmse: 3.73135
[1000]	validation's rmse: 3.73113
[1100]	validation's rmse: 3.73091
[1200]	validation's rmse: 3.73059
[1300]	validation's rmse: 3.73047
[1400]	validation's rmse: 3.73049
[1500]	validation's rmse: 3.73041
[1600]	validation's rmse: 3.73046
Early stopping, best iteration is:
[1472]	validation's rmse: 3.73032
Fold  1 RMSE : 3.730322
Training until validation scores don't improve for 200 rounds.
[100]	validation's rmse: 3.80235
[200]	validation's rmse: 3.78221
[300]	validation's rmse: 3.77105
[400]	validation's rmse: 3.76439
[500]	validation's r

In [41]:
# Display the feature importances (one row per feature and fold) kept from
# the last model fitted in the training cell above.
feat_importance

Unnamed: 0,FEATURE,IMPORTANCE,FOLD
0,elapsed_time,2938,1
1,year,323,1
2,month,1079,1
3,feature_1_1,37,1
4,feature_1_2,77,1
5,feature_1_3,32,1
6,feature_1_4,12,1
7,feature_1_5,68,1
8,feature_2_1,141,1
9,feature_2_2,25,1


In [1]:
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential

def baseline_nn(n_features):
    """Build and compile a small fully connected regression network.

    The network has one hidden ``Dense`` layer of 50 ReLU units followed by a
    single linear output unit, and is compiled with mean squared error loss
    and the Adam optimizer.

    Args:
        n_features: Number of input features (``input_dim`` of the first layer).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # `kernel_initializer` is the Keras 2 name of the Keras 1 `init` argument.
    model.add(Dense(50, input_dim=n_features, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

Using TensorFlow backend.


ImportError: Traceback (most recent call last):
  File "/home/fernando_carneiro/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "/home/fernando_carneiro/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "/home/fernando_carneiro/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 24, in swig_import_helper
    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
  File "/home/fernando_carneiro/anaconda3/envs/tensorflow/lib/python3.6/imp.py", line 243, in load_module
    return load_dynamic(name, filename, file)
  File "/home/fernando_carneiro/anaconda3/envs/tensorflow/lib/python3.6/imp.py", line 343, in load_dynamic
    return _load(spec)
ImportError: libcuda.so.1: cannot open shared object file: No such file or directory


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/errors

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, Ridge, SGDRegressor, LassoLars
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
import numpy as np

  return f(*args, **kwds)


In [8]:
# Baseline scikit-learn regressors.
model_lasso = Lasso(alpha = 0.000507)
model_ridge = Ridge()
model_ENet = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3, max_iter = 10000)
# Seeded (same seed as the ElasticNet above) so that the forest, and any
# cross-validation score computed from it, is reproducible across runs.
model_rforest = RandomForestRegressor(random_state=3)

In [9]:
# 3-fold cross-validated RMSE of the random forest: the scorer returns negated
# MSE per fold, so flip the sign before taking the square root.
neg_mse_per_fold = cross_val_score(model_rforest, train_X, train_y, cv=3, scoring="neg_mean_squared_error")
print(np.sqrt(-neg_mse_per_fold))

[ 3.82214121  3.86294031  3.86954698]
