In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
import zipfile
import time
import shutil
from sklearn.metrics import log_loss

In [2]:
# Load the six raw CSV tables from the current working directory, in the
# same order as the names on the left-hand side.
(gender_train,
 gender_test,
 phone_brand,
 app_events,
 app_labels,
 labels) = [
    pd.read_csv(csv_path)
    for csv_path in (
        'gender_age_train.csv',
        'gender_age_test.csv',
        'phone_brand_device_model.csv',
        'app_events.csv',
        'app_labels.csv',
        'label_categories.csv',
    )
]

In [3]:
# Attach the human-readable category to every (app_id, label_id) pair by
# joining the app/label table with the label dictionary on `label_id`.
app_cat = app_labels.merge(labels, on='label_id')
app_cat.head()

Unnamed: 0,app_id,label_id,category
0,7324884708820027918,251,Finance
1,-4494216993218550286,251,Finance
2,8756705988821000489,251,Finance
3,1061207043315821111,251,Finance
4,-1491198667294647703,251,Finance


In [4]:
# Join every app event with the categories of its app. An app that carries
# several labels yields one output row per (event, label) combination.
app_ev = app_events.merge(app_cat, on='app_id')
app_ev.head()

Unnamed: 0,event_id,app_id,is_installed,is_active,label_id,category
0,2,5927333115845830913,1,1,549,Property Industry 1.0
1,2,5927333115845830913,1,1,548,Industry tag
2,2,5927333115845830913,1,1,710,Relatives 1
3,2,5927333115845830913,1,1,704,Property Industry 2.0
4,2,5927333115845830913,1,1,172,IM


In [5]:
# Build one row per event_id whose `category` value is the space-separated
# concatenation of the categories of all rows belonging to that event.
# NOTE(review): `app_text` is not referenced by any later visible cell.
app_text = (
    app_ev
    .groupby('event_id', as_index=False)
    .agg({'category': ' '.join})
)
app_text.head()

Unnamed: 0,event_id,category
0,2,Property Industry 1.0 Industry tag Relatives 1...
1,6,Property Industry 1.0 Industry tag Relatives 1...
2,7,Finance Custom label Insurance Insurance Mediu...
3,9,Property Industry 1.0 Industry tag Relatives 1...
4,16,Finance Debit and credit Custom label unknown ...


In [None]:
# Fuse brand and model into a single `phone+brand` string column and discard
# the two source columns.
# NOTE(review): the two strings are concatenated without a separator, so
# different (brand, model) pairs could in principle produce the same value -
# confirm this is acceptable for the data.
phone_brand = (
    phone_brand
    .assign(**{'phone+brand': phone_brand['phone_brand'] + phone_brand['device_model']})
    .drop(['phone_brand', 'device_model'], axis=1)
)

In [None]:
# One-hot encode the device table with pandas' get_dummies.
# NOTE(review): presumably only the string `phone+brand` column is expanded
# into indicator columns while `device_id` (assumed numeric) passes through
# unchanged - confirm against the loaded dtypes.
phone_brand = pd.get_dummies(phone_brand)

In [None]:
# Attach the encoded device features to the labelled and unlabelled device
# lists. No `on=` is given, so pandas joins on the columns the two frames
# have in common.
# NOTE(review): presumably that is just `device_id` - confirm; a device that
# occurs more than once in `phone_brand` would duplicate rows here.
train = gender_train.merge(phone_brand)
test = gender_test.merge(phone_brand)

In [None]:
# Remove the raw `age` and `gender` columns from the training frame: the
# modelling cell builds its feature list from the remaining train columns,
# so anything left here would be handed to the model.
train = train.drop(['age','gender'],axis=1)

In [None]:
def map_column(table, f):
    """Return a copy of ``table`` with column ``f`` encoded as integer codes.

    The distinct non-null values of ``table[f]`` are sorted and numbered
    0, 1, 2, ... in that order; every cell of the column is then replaced by
    the number of its value. Missing values are left missing.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    f : column label
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order, where column
        ``f`` holds the integer codes.

    Raises
    ------
    KeyError
        If ``f`` is not a column of ``table``.
    """
    # NaN is excluded before sorting: it cannot be ordered against strings
    # and should not receive a class code of its own.
    categories = sorted(table[f].dropna().unique())
    mappings = {value: code for code, value in enumerate(categories)}

    # Series.map is a single dictionary lookup per cell. Unlike
    # DataFrame.replace it yields a proper integer column directly and is not
    # affected when the original values overlap with the generated codes.
    encoded = table.copy()
    encoded[f] = table[f].map(mappings)
    return encoded

In [None]:
# Seed Python's built-in `random` module.
# NOTE(review): no visible cell draws numbers from `random`; the train/valid
# split and xgboost receive their seed through `random_state` in run_xgb -
# confirm whether this call is still needed.
random.seed(2016)

def run_xgb(train, test, features, target, eta=0.1, random_state=0):
    """Fit a 12-class XGBoost model with early stopping and predict ``test``.

    ``train`` is split 70/30 into a fitting part and a validation part.
    Boosting runs for at most 1000 rounds with early stopping (patience of
    100 rounds) monitored on the validation part using multi-class log loss.
    Both the validation score and the test predictions are computed with the
    trees up to and including the best round.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``features`` columns and the ``target`` column.
    test : pandas.DataFrame
        Must contain the ``features`` columns.
    features : sequence of column labels
        Columns handed to the model.
    target : column label
        Label column of ``train``.
        NOTE(review): values presumably have to be integer class codes
        0..11 to match ``num_class=12`` - confirm against the caller.
    eta : float, optional
        XGBoost learning rate.
    random_state : int, optional
        Seed used for both the train/validation split and XGBoost.

    Returns
    -------
    tuple
        ``(test_prediction, score)`` where ``test_prediction`` is the
        ``multi:softprob`` output for ``test`` converted with ``.tolist()``
        and ``score`` is the log loss on the validation part.
    """
    max_depth = 3
    subsample = 0.7
    colsample_bytree = 0.7
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster" : "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 1000
    early_stopping_rounds = 100
    test_size = 0.3

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    # The last entry of the watchlist is the one early stopping monitors.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    # `best_iteration` is a zero-based round index, whereas `ntree_limit` is a
    # count of rounds. Passing the index directly drops the best round, and
    # when the best round is 0 it passes ntree_limit=0, which xgboost treats
    # as "use every tree". Prefer the count recorded by xgboost and fall back
    # to index + 1 on versions that do not expose it.
    best_ntree_limit = getattr(gbm, 'best_ntree_limit', gbm.best_iteration + 1)

    print("Validating...")
    # Labels attached to `dvalid` are ignored by predict, so it can be reused.
    check = gbm.predict(dvalid, ntree_limit=best_ntree_limit)
    score = log_loss(y_valid.tolist(), check)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=best_ntree_limit)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score

In [None]:
def create_submission(score, test, prediction):
    """Write the predictions to a timestamped submission CSV.

    The file is created in the current working directory and named
    ``submission_<score>_<YYYY-mm-dd-HH-MM>.csv``. It starts with the fixed
    header of ``device_id`` plus the twelve group names, followed by one line
    per row of ``test``: the device id and the first twelve values of the
    matching entry of ``prediction``.

    Parameters
    ----------
    score : object
        Validation score; only used (via ``str``) in the file name.
    test : pandas.DataFrame
        Must contain a ``device_id`` column.
    prediction : sequence of sequences
        ``prediction[i][j]`` is written as column ``j`` of the line for the
        ``i``-th row of ``test``; each entry needs at least twelve values.

    Returns
    -------
    None
    """
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + now.strftime("%Y-%m-%d-%H-%M") + '.csv'
    print('Writing submission: ', sub_file)
    test_val = test['device_id'].values
    # `with` closes the handle even when formatting one of the rows raises,
    # so a failed run does not leak an open, half-written file.
    with open(sub_file, 'w') as f:
        f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\n')
        for i in range(len(test_val)):
            fields = [str(test_val[i])]
            fields.extend(str(prediction[i][j]) for j in range(12))
            f.write(','.join(fields) + '\n')

In [None]:
# Replace every missing value in both frames with 0 before modelling.
train, test = train.fillna(0), test.fillna(0)

In [None]:
# Encode the `group` target as integer codes: map_column numbers the distinct
# values in sorted order, starting at 0.
train = map_column(train, 'group')

In [None]:
# Use every train column as a feature except the target and `device_id`.
# `device_id` is the row identifier (it is what the submission file is keyed
# on), so feeding it to the trees only invites splits on an arbitrary key.
features = train.columns.difference(['group', 'device_id'])
res, score = run_xgb(train, test, features, 'group', eta=0.02, random_state=0)
# Function-call form of print works on both Python 2 and Python 3.
print(score)
# Uncomment to write the submission file for this run:
#create_submission(score, test, res)