In [1]:
import pandas as pd
import statistics
import numpy as np

# Read both semicolon-separated data sets and, right after reading, replace
# every missing cell with the sentinel string "NR".
train = pd.read_csv("train.csv", sep=";").fillna("NR")
test = pd.read_csv("test.csv", sep=";").fillna("NR")

# Name of the column that holds the value to be predicted.
target_name = "prime_tot_ttc"

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Shorten a postal code to its leading characters (e.g. 75001 -> 75).
def trunc_postal(x, letter_count):
    """Keep only the first ``letter_count`` characters of one postal-code cell.

    Args:
        x: A single cell value: a Python or numpy integer, a whole-number
            float, a string, or anything else.
        letter_count: Number of leading characters to keep.

    Returns:
        * ``int`` built from the leading characters when ``x`` is an integer
          (Python or numpy) or a float holding a whole number;
        * ``str`` with the leading characters when ``x`` is a string other
          than the missing-value marker ``"NR"``;
        * ``x`` unchanged in every other case ("NR", NaN, fractional floats,
          booleans, ...).
    """
    # bool is a subclass of int but is never a postal code: leave it alone.
    if isinstance(x, bool):
        return x

    # A column that contained NaN is parsed as float, so a code can arrive as
    # 75001.0; treat whole-number floats like the integer they represent.
    # NaN and infinities are not "integer" and fall through unchanged.
    if isinstance(x, (float, np.floating)):
        if not float(x).is_integer():
            return x
        x = int(x)

    # np.integer covers numpy scalars such as int64, which `type(x) == int`
    # does not match.
    if isinstance(x, (int, np.integer)):
        return int(str(x)[:letter_count])

    if isinstance(x, str) and x != "NR":
        return str(x)[:letter_count]

    return x

def transform_postal(df, letter_count):
    """Add a truncated-postal-code column to ``df`` and return ``df``.

    The new column is named ``"codepostal_trunc_<letter_count>"`` and holds
    ``trunc_postal(value, letter_count)`` for every value of
    ``df["codepostal"]``. ``df`` itself is modified; the same object is
    returned.
    """
    new_column = "codepostal_trunc_" + str(letter_count)

    def shorten(code):
        return trunc_postal(code, letter_count)

    df[new_column] = df["codepostal"].apply(shorten)
    return df


# Add the 3-character and then the 2-character postal prefix column to both
# frames (for each length: train first, then test).
for prefix_len in (3, 2):
    train = transform_postal(train, prefix_len)
    test = transform_postal(test, prefix_len)

In [3]:
# I.e. when var12 == 3211 (just like var12 == 0) the parameter can be treated
# as unknown.
def fill_var12(x):
    """Replace out-of-range ``var12`` values with the marker ``"NR"``.

    Args:
        x: A single ``var12`` cell: a number, or a string such as ``"NR"``.

    Returns:
        ``"NR"`` when ``x`` is a number above 2000 or below 0.1; otherwise
        ``x`` unchanged. Strings are returned as they are.
    """
    # Strings (e.g. cells already marked "NR") cannot be compared with
    # numbers; passing them through keeps the function safe to apply twice.
    if isinstance(x, str):
        return x
    if x > 2000 or x < 0.1:
        return "NR"
    return x
    
# Replace the out-of-range var12 values in both data sets (train, then test).
for frame in (train, test):
    frame["var12"] = frame["var12"].apply(fill_var12)

In [4]:
def xgbmape(y_pred, y_true):
    """Return the negated mean absolute percentage error as a named metric.

    Args:
        y_pred: Predicted values.
        y_true: Object whose ``get_label()`` method returns the true values
            (presumably an ``xgb.DMatrix`` - confirm against the caller).

    Returns:
        Tuple ``("mape", value)`` where ``value`` is minus the mean of
        ``|label - prediction| / |label|``, multiplied by 100.
    """
    actual = y_true.get_label()
    relative_error = np.abs((actual - y_pred) / actual)
    # The sign is flipped on purpose: needed because of the way the early
    # stopping callback overrides the `maximize` setting.
    negated_mape = -np.mean(relative_error) * 100
    return ("mape", negated_mape)

def mape(y_pred, y_true):
    """Mean absolute percentage error of ``y_pred`` against ``y_true``, in percent.

    Computed as the mean of ``|y_true - y_pred| / |y_true|`` times 100.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100

In [5]:
def categorical_mapping(df, column_name, target=None):
    """Map every category of ``column_name`` to the mean target within it.

    Args:
        df: DataFrame containing ``column_name`` and the target column.
        column_name: Column whose distinct values become the dictionary keys.
        target: Name of the target column. When omitted, the module-level
            ``target_name`` is used.

    Returns:
        dict ``{category: mean of the target over the rows of that category}``.
    """
    if target is None:
        target = target_name

    # Average only the target column. Taking the mean of the whole grouped
    # frame computes statistics that are thrown away and raises on
    # non-numeric columns in recent pandas versions.
    per_category_mean = df.groupby(column_name)[target].mean()
    return per_category_mean.to_dict()

In [6]:
# Columns that get replaced by per-category target averages.
col_names = [
    'annee_naissance', 'annee_permis', 'marque', 'puis_fiscale', 'anc_veh',
    'codepostal', 'energie_veh', 'kmage_annuel', 'crm', 'profession',
    'var1', 'var2', 'var3', 'var4', 'var5', 'var6', 'var7', 'var8',
    'var9', 'var10', 'var11', 'var12', 'var13', 'var14', 'var15', 'var16',
    'var17', 'var18', 'var19', 'var20', 'var21', 'var22',
    "codepostal_trunc_2", "codepostal_trunc_3",
]

# One {category: average target} dictionary per column, built from `train`.
# NOTE(review): the averages use every row of `train`, including the rows
# that are later split off for validation - confirm this is intended.
mappings = dict(
    (name, categorical_mapping(train, name))
    for name in col_names
)

In [7]:
def transform_dataframe(df, mappings):
    """Replace the values of each mapped column by their mapped number.

    For every ``(column, mapping)`` pair in ``mappings`` each cell of
    ``df[column]`` is replaced by:

    * ``mapping[value]`` when the value is a key of ``mapping``;
    * otherwise ``mapping["NR"]`` when ``"NR"`` is a key of ``mapping``;
    * otherwise the mean of all values in ``mapping``.

    Args:
        df: DataFrame holding every column named in ``mappings``.
        mappings: dict ``{column name: {category: number}}``.

    Returns:
        ``df`` itself; the columns are overwritten on the object passed in,
        so calling the function again on the same frame encodes the
        already-encoded numbers.
    """
    for col_name, mapping in mappings.items():
        # The replacement for unseen values does not depend on the row, so it
        # is worked out at most once per column, and only if it is needed.
        fallback = None
        encoded = []
        for value in df[col_name]:
            if value in mapping:
                encoded.append(mapping[value])
                continue
            if fallback is None:
                if "NR" in mapping:
                    fallback = mapping["NR"]
                else:
                    fallback = statistics.mean(mapping.values())
            encoded.append(fallback)

        df[col_name] = encoded
    return df

In [8]:
# Encode the mapped columns of both data sets with the same dictionaries
# (train is processed first, then test).
train, test = (
    transform_dataframe(train, mappings),
    transform_dataframe(test, mappings),
)

In [9]:
import xgboost as xgb

# Put the test identifiers aside, then keep only the feature columns.
# After this cell `test` no longer has an "id" column.
test_ids = test.loc[:, "id"]
test = test.loc[:, col_names]

# Same for train: take the target out first, then keep only the features.
target = train.loc[:, target_name]
train = train.loc[:, col_names]

In [10]:
# Convert the feature frames and the target to NumPy arrays.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same array and is available in every pandas version.
test_matrix = test.values
train_matrix = train.values
target = np.array(target)

In [11]:
def train_val_by_size(train_matrix, eval_size, targets=None):
    """Split off the last ``eval_size`` fraction of the rows for validation.

    The split is positional: the first rows stay in the training part and the
    trailing ``int(eval_size * len(train_matrix))`` rows form the validation
    part. Nothing is shuffled.

    Args:
        train_matrix: Sliceable collection of feature rows.
        eval_size: Fraction of the rows to hold out.
        targets: Target values aligned with ``train_matrix``. When omitted,
            the module-level ``target`` is used.

    Returns:
        Tuple ``(new_train, eval_matrix, train_target, eval_target)``.
    """
    if targets is None:
        targets = target

    n_rows = len(train_matrix)
    n_eval = int(eval_size * n_rows)
    split_at = n_rows - n_eval

    return (
        train_matrix[:split_at],
        train_matrix[split_at:],
        targets[:split_at],
        targets[split_at:],
    )

In [15]:
from lr_callbacks import dynamic_lr, bold_driver, mc_clain, stc

# Build four learning-rate callback objects, hold out the last 10% of the
# rows for validation and train the booster.
#
# NOTE: every assignment below rebinds the imported name (dynamic_lr,
# bold_driver, mc_clain, stc) to the object returned by calling it, so after
# this cell the names no longer refer to what was imported from lr_callbacks.
# Only `stc` is handed to xgb.train further down; the other three objects are
# created but not used in this cell.
#
# decrease_function - takes the current LR as input, returns a new one
# rounds_function - takes the number of LR decreases, returns the allowed number of iterations that don't improve the eval score before decrease_function is applied
dynamic_lr = dynamic_lr(start_lr=0.5, min_lr=0.0001, decrease_function=lambda x: x*0.5, rounds_function=lambda x: int(5*x**1.6))

bold_driver = bold_driver(start_lr=0.5, min_lr=0.005, boldness=1.005, timidness=0.5, relax=5, relax_k=2)

mc_clain = mc_clain(start_lr=0.5, target_lr=0.001)

stc = stc(start_lr=0.25, T=150)

# Positional split: the trailing 10% of train_matrix becomes the validation set.
new_train, eval_matrix, train_target, eval_target = train_val_by_size(train_matrix, 0.1)
# NOTE: eval_set is not referenced again in this cell; xgb.train below gets
# its own DMatrix built from the same eval_matrix / eval_target.
eval_set = [(eval_matrix, eval_target)]
xgmat = xgb.DMatrix(new_train, train_target)

# NOTE(review): "eta" is set here while the `stc` callback is also given a
# start_lr - presumably the callback controls the learning rate during
# training; confirm in lr_callbacks.
params = {"objective": "reg:linear",
          "eta": 0.01,
          "max_depth": 6}
# Up to 15000 boosting rounds, evaluated on the held-out rows under the name
# "val_0", with early stopping after 500 rounds and a log line every 20 rounds.
gbm = xgb.train(dtrain=xgmat, callbacks=[stc], params=params, num_boost_round=15000, early_stopping_rounds=500, verbose_eval=20, evals=[(xgb.DMatrix(eval_matrix, eval_target), "val_0")])

[0]	val_0-rmse:373.077
Will train until val_0-rmse hasn't improved in 500 rounds.
[20]	val_0-rmse:38.1002
[40]	val_0-rmse:34.0343
[60]	val_0-rmse:32.6373
[80]	val_0-rmse:31.7846
[100]	val_0-rmse:31.2868
[120]	val_0-rmse:31.0539
[140]	val_0-rmse:30.8704
[160]	val_0-rmse:30.7195
[180]	val_0-rmse:30.4965
[200]	val_0-rmse:30.3743
[220]	val_0-rmse:30.2866
[240]	val_0-rmse:30.4142
[260]	val_0-rmse:30.4501
[280]	val_0-rmse:30.3986
[300]	val_0-rmse:30.3916
[320]	val_0-rmse:30.3091
[340]	val_0-rmse:30.2826
[360]	val_0-rmse:30.3875
[380]	val_0-rmse:30.3625
[400]	val_0-rmse:30.335
[420]	val_0-rmse:30.3189
[440]	val_0-rmse:30.333
[460]	val_0-rmse:30.314
[480]	val_0-rmse:30.2471
[500]	val_0-rmse:30.2192
[520]	val_0-rmse:30.1809
[540]	val_0-rmse:30.1805
[560]	val_0-rmse:30.2361
[580]	val_0-rmse:30.3109
[600]	val_0-rmse:30.2715
[620]	val_0-rmse:30.2856
[640]	val_0-rmse:30.3263
[660]	val_0-rmse:30.3244
[680]	val_0-rmse:30.2756
[700]	val_0-rmse:30.2493
[720]	val_0-rmse:30.2211
[740]	val_0-rmse:30.2269
