Source: https://www.kaggle.com/zfturbo/talkingdata-mobile-user-demographics/xgboost-simple-starter/code

```python
__author__ = 'ZFTurbo: https://kaggle.com/zfturbo'
```

In [2]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import zipfile
import time
import shutil
from sklearn.metrics import log_loss

In [3]:
random.seed(2016)

def run_xgb(train, test, features, target, random_state=0):
    eta = 0.1
    max_depth = 3
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster" : "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 500
    early_stopping_rounds = 50
    test_size = 0.3

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score

In [4]:
def create_submission(score, test, prediction):
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\n')
    total = 0
    test_val = test['device_id'].values
    for i in range(len(test_val)):
        str1 = str(test_val[i])
        for j in range(12):
            str1 += ',' + str(prediction[i][j])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()

In [5]:
def map_column(table, f):
    labels = sorted(table[f].unique())
    mappings = dict()
    for i in range(len(labels)):
        mappings[labels[i]] = i
    table = table.replace({f: mappings})
    return table

In [6]:
def read_train_test():
    # Events
    print('Read events...')
    events = pd.read_csv("data/events.csv", dtype={'device_id': np.str})
    events['counts'] = events.groupby(['device_id'])['event_id'].transform('count')
    events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first')

    # Phone brand
    print('Read brands...')
    pbd = pd.read_csv("data/phone_brand_device_model.csv", dtype={'device_id': np.str})
    pbd.drop_duplicates('device_id', keep='first', inplace=True)
    pbd = map_column(pbd, 'phone_brand')
    pbd = map_column(pbd, 'device_model')

    # Train
    print('Read train...')
    train = pd.read_csv("data/gender_age_train.csv", dtype={'device_id': np.str})
    train = map_column(train, 'group')
    train = train.drop(['age'], axis=1)
    train = train.drop(['gender'], axis=1)
    train = pd.merge(train, pbd, how='left', on='device_id', left_index=True)
    train = pd.merge(train, events_small, how='left', on='device_id', left_index=True)
    train.fillna(-1, inplace=True)

    # Test
    print('Read test...')
    test = pd.read_csv("data/gender_age_test.csv", dtype={'device_id': np.str})
    test = pd.merge(test, pbd, how='left', on='device_id', left_index=True)
    test = pd.merge(test, events_small, how='left', on='device_id', left_index=True)
    test.fillna(-1, inplace=True)

    # Features
    features = list(test.columns.values)
    features.remove('device_id')

    return train, test, features

In [7]:
train, test, features = read_train_test()

Read events...
Read brands...
Read train...
Read test...


In [8]:
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))

Length of train:  74645
Length of test:  112071
Features [3]: ['counts', 'device_model', 'phone_brand']


In [9]:
test_prediction, score = run_xgb(train, test, features, 'group')
print("LS: {}".format(round(score, 5)))

XGBoost params. ETA: 0.1, MAX_DEPTH: 3, SUBSAMPLE: 0.7, COLSAMPLE_BY_TREE: 0.7
Length train: 52251
Length valid: 22394


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-mlogloss:2.474397	eval-mlogloss:2.474914
[1]	train-mlogloss:2.465317	eval-mlogloss:2.466109
[2]	train-mlogloss:2.457118	eval-mlogloss:2.458275
[3]	train-mlogloss:2.450113	eval-mlogloss:2.451398
[4]	train-mlogloss:2.443151	eval-mlogloss:2.445095
[5]	train-mlogloss:2.437325	eval-mlogloss:2.439628
[6]	train-mlogloss:2.432078	eval-mlogloss:2.434799
[7]	train-mlogloss:2.427336	eval-mlogloss:2.430429
[8]	train-mlogloss:2.423183	eval-mlogloss:2.426577
[9]	train-mlogloss:2.419263	eval-mlogloss:2.423142
[10]	train-mlogloss:2.415664	eval-mlogloss:2.419885
[11]	train-mlogloss:2.412357	eval-mlogloss:2.416944
[12]	train-mlogloss:2.409511	eval-mlogloss:2.414311
[13]	train-mlogloss:2.406846	eval-mlogloss:2.412080
[14]	train-mlogloss:2.404468	eval-mlogloss:2.410016
[15]	train-mlogloss:2.402271	eval-mlogloss:2.408044
[16]	train-mlogloss:2.400320	eval-mlogloss:2.406388
[17]	train-mlogloss:2.398505	eval-mlogloss:2.404837
[18]	train-mlog

Validating...


  preds = preds.reshape(nrow, preds.size / nrow)


Predict test set...
Training time: 1.11 minutes
LS: 2.38721


In [10]:
create_submission(score, test, test_prediction)

Writing submission:  submission_2.38720859136_2016-07-20-02-08.csv
