In [1]:
import pandas as pd
import os
from src.utils.memory_managment import save_object
from src.utils.submission import createSubmissionFile
%matplotlib inline
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from hyperopt import hp
import hyperopt
from hyperopt import fmin, tpe, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
DATA_FOLDER = '../../data/'
CUSTOM_DATA_FOLDER = '../../data_custom/'
SUBMISSION_FOLDER = "../../submissions/"

In [3]:
train_test_df = pd.read_feather(os.path.join(os.getcwd(), CUSTOM_DATA_FOLDER, 'all_data_preprocessed.feather')).set_index("index")


In [4]:
train_test_df.head()

Unnamed: 0_level_0,shop_id,item_id,date_block_num,item_cnt_month,revenue,perc_sales_year_shop,perc_sales_year_item,category_id,item_name_vector_0,item_name_vector_1,...,city_cnt_lag1,city_cnt_lag1_diff,city_cnt_lag2,city_cnt_lag2_diff,city_cnt_lag12,city_cnt_lag12_diff,item_cnt_month_diff,item_cnt_month_all_shops_diff,category_cnt_diff,category_cnt_all_shops_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2,27,0,1.0,2499.0,0.115227,0.21875,19,0.009564,-0.022983,...,0.0,0.150595,0.0,0.150595,0.0,0.150595,,,,
1,2,33,0,1.0,499.0,0.115227,0.127479,37,0.004434,-0.029593,...,0.0,0.150595,0.0,0.150595,0.0,0.150595,,,,
2,2,317,0,1.0,299.0,0.115227,0.181818,45,-0.198342,0.466214,...,0.0,0.150595,0.0,0.150595,0.0,0.150595,,,,
3,2,438,0,1.0,299.0,0.115227,0.15,45,-0.024946,-0.025346,...,0.0,0.150595,0.0,0.150595,0.0,0.150595,,,,
4,2,471,0,2.0,798.0,0.115227,0.163522,49,0.002027,0.004761,...,0.0,0.150595,0.0,0.150595,0.0,0.150595,,,,


In [5]:
def trainXGBoost(train_x, train_y, valid_x=None, valid_y=None, n_estimators=50):
    
    model = XGBRegressor(
        max_depth=10,
        n_estimators=n_estimators,
        min_child_weight=0.5, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.1,
    #     tree_method='gpu_hist',
        seed=42)
    if valid_x is None : 
        eval_set = None
        early_stopping = None
    else: 
        eval_set = [(train_x, train_y), (valid_x, valid_y)]
        early_stopping = 10
    
    model.fit(
        train_x, 
        train_y, 
        eval_metric="rmse", 
        eval_set=eval_set, 
        verbose=True, 
        early_stopping_rounds = early_stopping)
    
    return model


def trainLR(train_x, train_y):
    lr = LinearRegression()
    lr.fit(train_x.fillna(0).values, train_y.fillna(0))
    return lr

from sklearn import svm
def trainSVM(train_x, train_y):
    regr = svm.LinearSVR()
    regr.fit(train_x.values, train_y)
    return regr

from sklearn.neural_network import MLPRegressor
def trainNN(train_x, train_y):
    regr = MLPRegressor(hidden_layer_sizes=(16, 8), learning_rate="adaptive", verbose=True, max_iter=8)
    regr.fit(train_x.values, train_y)
    return regr

from sklearn.metrics import mean_squared_error
def getRMSE(y_actual, y_predicted):
    rms = mean_squared_error(y_actual.clip(upper=20), y_predicted.clip(max=20), squared=True)
    return rms

In [6]:
#train_test_df.dropna(inplace=True)
#all_train_x = train_test_df[train_test_df.date_block_num < 34].drop(['item_cnt_month'], axis=1)
#all_train_y = train_test_df[train_test_df.date_block_num < 34]['item_cnt_month'].clip(lower=0, upper=20)

In [7]:
train_x = train_test_df[train_test_df.date_block_num < 33].drop(['item_cnt_month'], axis=1)
train_y = train_test_df[train_test_df.date_block_num < 33]['item_cnt_month'].clip(lower=0, upper=20)
valid_x = train_test_df[train_test_df.date_block_num == 33].drop(['item_cnt_month'], axis=1)
valid_y = train_test_df[train_test_df.date_block_num == 33]['item_cnt_month'].clip(lower=0, upper=20)
test_x = train_test_df[train_test_df.date_block_num == 34].drop(['item_cnt_month'], axis=1)



In [8]:
def get_validation_score(args):
    max_depth = args["max_depth"]
    min_child_weight = args["min_child_weight"]
    eta = args["eta"]
    subsample = args["subsample"]
    colsample_bytree = args["colsample_bytree"]
    
    model = XGBRegressor(
        max_depth=max_depth,
        n_estimators=100,
        min_child_weight=min_child_weight, 
        colsample_bytree=colsample_bytree, 
        subsample=subsample, 
        eta=eta,
    #     tree_method='gpu_hist',
        seed=42)
    
    eval_set = [(train_x, train_y), (valid_x, valid_y)]
    early_stopping = 15
    
    model.fit(
        train_x, 
        train_y, 
        eval_metric="rmse", 
        eval_set=eval_set, 
        verbose=False, 
        early_stopping_rounds = early_stopping)
    
    rmse = getRMSE(valid_y, model.predict(valid_x, ntree_limit=model.best_ntree_limit))
    #print("max_depth: {}, min_child_weight: {}, eta: {}, subsample: {}, colsample_bytree:{}, rmse: {}, best_tree_number: {}".format(max_depth, min_child_weight, eta, subsample, colsample_bytree, rmse, model.best_ntree_limit))
    dict_to_ret = {
        "loss": -rmse,
        "status": STATUS_OK,
        "best_tree_number": model.best_ntree_limit
    }
    return dict_to_ret

In [9]:
space = {
    "max_depth": scope.int(hp.uniformint("max_depth", 5, 40, 2)),
    "min_child_weight" : hp.uniform("min_child_weight", 0.3, 1),
    "eta": hp.choice("eta", [0.1, 0.01, 0.001]),
    "subsample":  hp.uniform("subsample", 0.6, 1),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.6, 1),
}

In [10]:
trials = Trials()
best = fmin(get_validation_score, space, algo=tpe.suggest, max_evals=10, trials=trials)

  0%|          | 0/10 [00:37<?, ?trial/s, best loss=?]


job exception: Invalid Parameter format for max_depth expect int but value='32.0'



XGBoostError: Invalid Parameter format for max_depth expect int but value='32.0'

In [None]:
print(best)
# -> {'a': 1, 'c2': 0.01420615366247227}
print(hyperopt.space_eval(space, best))

In [None]:
print(trials)

In [None]:

best_path = os.path.join(os.getcwd(), CUSTOM_DATA_FOLDER, 'best_opt.pkl')
trials_path = os.path.join(os.getcwd(), CUSTOM_DATA_FOLDER, 'trials.pkl')
space_path = os.path.join(os.getcwd(), CUSTOM_DATA_FOLDER, 'space.pkl')
save_object(best, best_path)
save_object(trials, trials_path)
save_object(space, space_path)
