# Using xgboost regression trees to estimate severity of claims

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
import xgboost as xgb


In [2]:
#import data 
train_raw = pd.read_csv('../data/train-allstate.csv')
test_raw = pd.read_csv('../data/test-allstate.csv')
train_test_raw = pd.concat((train_raw, test_raw)).reset_index(drop=True)

features = [x for x in train_test_raw.columns if x not in ['id', 'loss']]
cat_features = [x for x in train_test_raw.select_dtypes(include=['O']).columns if x not in ['id', 'loss']]
num_features = [x for x in train_test_raw.select_dtypes(exclude=['O']).columns if x not in ['id', 'loss']]

In [3]:
# save important info
targets_log = np.log(train_test_raw['loss'])
out_ids = test_raw['id']
ntrain = train_raw.shape[0]

# cut columns with info stored elsewhere
train_test = train_test_raw.drop(['id', 'loss'], axis=1)


In [4]:
# factorize or one hot code categorical columns
def handle_cat_columns(train_test=train_test, cat_handling='factorize'):
    new_dat = train_test.copy()
    if cat_handling == 'factorize':
        print "factorizing categorical features"
        for feature in cat_features:
            new_dat[feature] = pd.factorize(new_dat[feature], sort=True)[0]
    elif cat_handling == 'onehot':
        print "one hot coding categorical features"
        new_dat = pd.concat([new_dat, get_dummies(new_dat, prefix=cat_features)],axis=1)
        new_dat = new_dat.drop(cat_features, 1)
    else:
        raise Exception('cat_handling Value Error')
    return new_dat

In [5]:
def preprocess_dat(train_test_dat, ntrain=ntrain):
    # separate data
    train = train_test_dat.iloc[:ntrain,:]
    test = train_test_dat.iloc[ntrain:,:]
    
    # scale data
    scaler = StandardScaler()
    scaler.fit(train_test_dat)
    train = scaler.transform(train)
    test = scaler.transform(test)
    return train, test

In [11]:
def run_model(train, test, outputfile, params, folds=5, early_stopping=25):
    d_test = xgb.DMatrix(test)
    # variables to track while running
    mae_folds = 0 # sum of MAE accross folds
    best_model_folds = [] # best model from each fold
    pred_folds = 0 # sum of predicted loss across folds

    # start CV loop
    kf = KFold(ntrain, n_folds=folds)
    for i, (train_index, val_index) in enumerate(kf):
        print('\n Fold %d\n' % (i + 1))
        # cut data for each fold from the training data
        X_train, X_val = train[train_index], train[val_index]
        y_train, y_val = targets_log[train_index], targets_log[val_index]

        # make into dmatricies
        d_train = xgb.DMatrix(X_train, label=y_train)
        d_val = xgb.DMatrix(X_val, label=y_val)
        
        # model
        clf = xgb.train(params,
                       d_train,
                       100000,
                       [(d_train, 'train'), (d_val, 'eval')],
                       early_stopping_rounds=early_stopping)

        # Evaulate on fold's validation data
        mae_fold = mean_absolute_error(np.exp(y_val),
                                       np.exp(clf.predict(d_val, ntree_limit=clf.best_ntree_limit)))

        # Get fold's prediction of test data's losses
        pred_fold = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit))

        #keep track of sums to average later and of best estimator from fold
        mae_folds += mae_fold
        pred_folds += pred_fold
        best_model_folds.append(clf.best_iteration)

    # Average predictions and MAE to evaluate
    pred = pred_folds / folds
    mae = mae_folds / folds
    print('\n Average eval-MAE: %.6f' % mae)

    # Output predictions
    output = pd.DataFrame(pred, columns=['loss'])
    output['id'] = ids
    output = output.set_index('id')
    output.to_csv(outputfile, encoding='utf-8')

In [12]:
train, test = preprocess_dat(handle_cat_columns(cat_handling='factorize'))
params ={
    'booster': 'gbtree',
    'colsample_bytree': 0.3085,
    'eta': 0.1,
    'eval_metric': 'mae',
    'gamma': 0.529,
    'max_delta_step': 0,
    'max_depth': 7,
    'min_child_weight': 4.2922,
    'objective': 'reg:linear',
    'random_state': 1001,
    'silent': 1,
    'subsample': 0.993}

run_model(train, test, xgb.DMatrix(test), folds=5, early_stopping=25,params=params)

factorizing categorical features

 Fold 1

[0]	train-mae:6.46764	eval-mae:6.46368
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 25 rounds.
[1]	train-mae:5.82105	eval-mae:5.81686
[2]	train-mae:5.23911	eval-mae:5.23475
[3]	train-mae:4.71538	eval-mae:4.71077
[4]	train-mae:4.24402	eval-mae:4.23948
[5]	train-mae:3.81986	eval-mae:3.81529
[6]	train-mae:3.43807	eval-mae:3.43339
[7]	train-mae:3.09451	eval-mae:3.08976
[8]	train-mae:2.78538	eval-mae:2.78063
[9]	train-mae:2.50732	eval-mae:2.50264
[10]	train-mae:2.2572	eval-mae:2.25236
[11]	train-mae:2.03238	eval-mae:2.02743
[12]	train-mae:1.83052	eval-mae:1.82538
[13]	train-mae:1.64952	eval-mae:1.64422
[14]	train-mae:1.48759	eval-mae:1.48205
[15]	train-mae:1.34315	eval-mae:1.33771
[16]	train-mae:1.21495	eval-mae:1.20982
[17]	train-mae:1.10189	eval-mae:1.09728
[18]	train-mae:1.00284	eval-mae:0.998763
[19]	train-mae:0.916332	eval-mae:0.912696
[20]	train-mae:0.84152	e

NameError: global name 'ids' is not defined