In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, make_scorer
from scipy.sparse import csr_matrix, hstack



In [2]:
def load_data(pkl_path="", csv_path=""):
    """Load a dataset from a pickle file or from a CSV file.

    Parameters
    ----------
    pkl_path : str
        Path to a pickle file. When non-empty it takes precedence over
        ``csv_path``.
    csv_path : str
        Path to a CSV file, read with ``pandas.read_csv``.

    Returns
    -------
    object
        The unpickled object, or a ``pandas.DataFrame`` for CSV input.

    Raises
    ------
    ValueError
        If both ``pkl_path`` and ``csv_path`` are empty.
    """
    if pkl_path != "":
        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        # The context manager guarantees the handle is closed after loading.
        with open(pkl_path, 'rb') as pkl_file:
            return pickle.load(pkl_file)
    if csv_path != "":
        return pd.read_csv(csv_path)
    raise ValueError("either pkl_path or csv_path must be provided")

In [3]:
def calc_site_times_portions(train_data, test_data):
    """Compute, per session, the share of session time attributed to each site.

    For every row of each frame, the ``time_diff<c>`` values of the slots
    ``site1`` .. ``site9`` are summed per site id and divided by the row's
    ``session_timespan`` (rounded to 3 decimals). Only slots 1..9 are used
    because only ``time_diff1`` .. ``time_diff9`` are read.

    Rules applied to each row:
    * slots whose site is 0 or missing (NaN) are ignored;
    * if exactly one distinct site remains, it gets portion ``1.0``;
    * otherwise sites with a total time of 0 are left out.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the columns ``site1``..``site9``,
        ``time_diff1``..``time_diff9`` and ``session_timespan``.

    Returns
    -------
    list of dict
        ``[train_portions, test_portions]``; each maps a row index label to a
        ``{site_id: portion}`` dict.
    """
    slots = range(1, 10)
    # Select the columns by name instead of by integer position: the loop
    # below only ever reads these columns, and name-based selection does not
    # depend on the column layout or on positional fallbacks in pandas.
    needed_cols = (['site' + str(c) for c in slots] +
                   ['time_diff' + str(c) for c in slots] +
                   ['session_timespan'])

    site_times = [{}, {}]
    for part, data in enumerate([train_data, test_data]):
        for r, row in data[needed_cols].iterrows():
            # Total time per distinct site within this session.
            totals = {}
            for c in slots:
                site = row['site' + str(c)]
                if pd.isnull(site) or site == 0:
                    continue
                site = int(site)
                totals[site] = totals.get(site, 0) + row['time_diff' + str(c)]

            portions = {}
            for site, time in totals.items():
                if len(totals) == 1:
                    # A single visited site owns the whole session.
                    portions[site] = 1.0
                elif time > 0:
                    portions[site] = round(float(time) / row['session_timespan'], 3)
            site_times[part][r] = portions
    return site_times

In [4]:
def site_times_to_sparse(sitetimes):
    """Stack per-session site-time portions into one CSR matrix.

    Parameters
    ----------
    sitetimes : sequence of dict
        Each element maps a row key to a ``{site_id: portion}`` dict (the
        structure returned by ``calc_site_times_portions``). Any number of
        partitions is accepted; they are stacked in the given order.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float matrix with one row per session. Column 0 (site id 0) is
        dropped, so site id ``s`` ends up in column ``s - 1``.

    Notes
    -----
    Rows are numbered in dict iteration order, so row ``i`` corresponds to
    the ``i``-th inserted session only where dicts preserve insertion order
    (guaranteed from Python 3.7).
    """
    row = []
    col = []
    data = []
    rowcount = 0
    for sitetime in sitetimes:
        for r, sites in sitetime.items():
            for site, p in sites.items():
                col.append(site)
                row.append(rowcount)
                data.append(p)
            # Sessions without any portion still occupy a (blank) row.
            rowcount += 1
    # Use the accumulated row count rather than assuming exactly two
    # partitions, and keep a single placeholder column when nothing was
    # collected so that max() is never called on an empty list.
    n_cols = max(col) + 1 if col else 1
    site_times_sparse = csr_matrix((data, (row, col)), shape=(rowcount, n_cols),
                                   dtype=float)[:, 1:]
    return site_times_sparse

In [12]:
def sparsematrix1(X):
    """Build a CSR occurrence-count matrix from rows of term ids.

    Each distinct non-zero term is assigned a column in order of first
    appearance; zeros are treated as padding and skipped. Every occurrence
    contributes a stored value of 1, so repeated terms within a row add up
    when the matrix is densified or summed.

    Parameters
    ----------
    X : iterable of iterables
        Rows of term ids.

    Returns
    -------
    scipy.sparse.csr_matrix
        Integer matrix with one row per input row and one column per
        distinct non-zero term.
    """
    term_to_col = {}
    col_idx = []
    row_ptr = [0]
    for doc in X:
        for token in doc:
            if token == 0:
                continue
            if token not in term_to_col:
                term_to_col[token] = len(term_to_col)
            col_idx.append(term_to_col[token])
        # Close the current row in the CSR index pointer.
        row_ptr.append(len(col_idx))
    ones = [1] * len(col_idx)
    return csr_matrix((ones, col_idx, row_ptr), dtype=int)

In [5]:
def sparsematrix(X):
    """Build a CSR matrix of per-row site counts, indexed by site id.

    Parameters
    ----------
    X : 2-D array
        Rows of integer site ids (indexed as ``X[r]``, sized by ``X.shape``).

    Returns
    -------
    scipy.sparse.csr_matrix
        ``result[r, s - 1]`` is the number of times site id ``s`` occurs in
        row ``r``; the column for site id 0 is dropped.
    """
    row = []
    col = []
    data = []
    for r in range(X.shape[0]):
        for site, num in Counter(X[r]).items():
            row.append(r)
            col.append(site)
            data.append(num)
    # Single formatted string: prints the same text on Python 2 and 3.
    print("Sparse Matrix - rows: %s columns: %s" % (X.shape[0], len(set(col))))
    # Columns are addressed by raw site id, so the width must be max id + 1;
    # the number of distinct ids is only large enough when ids are contiguous
    # starting at 0.
    n_cols = max(col) + 1 if col else 1
    return csr_matrix((data, (row, col)), shape=(X.shape[0], n_cols))[:, 1:]

In [13]:
def sites_to_sparse(train_data, test_data, target_col, session_length, label_encoder=False):
    """Encode the site columns of train and test as sparse occurrence matrices.

    Train and test are concatenated so both share one column vocabulary, the
    ``site1`` .. ``site<session_length>`` columns are converted with
    ``sparsematrix1``, and the result is split back by row count.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the ``site<N>`` columns; ``train_data`` must also
        contain ``target_col``.
    target_col : str
        Name of the target column in ``train_data``.
    session_length : int
        Number of ``site<N>`` columns to use.
    label_encoder : bool
        Any truthy value additionally fits a ``LabelEncoder`` on the
        stringified targets and returns the encoded labels.

    Returns
    -------
    list
        ``[X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num,
        class_encoder]``. ``y_for_vw`` and ``class_encoder`` are ``None``
        unless ``label_encoder`` is truthy.
    """
    site_cols = ['site' + str(c) for c in range(1, session_length + 1)]
    train_test_df = pd.concat([train_data, test_data])

    # Missing site slots become 0, which sparsematrix1 skips.
    train_test_df_sites = train_test_df[site_cols].fillna(0).astype('int')
    # `.values` replaces DataFrame.as_matrix(), which pandas deprecated and
    # later removed; both return the same ndarray.
    X_train_test_sparse = sparsematrix1(train_test_df_sites.values)
    X_train_sparse = X_train_test_sparse[:len(train_data)]
    X_test_sparse = X_train_test_sparse[len(train_data):]
    y = train_data[target_col]

    sites_columns_num = X_train_test_sparse.shape[1]

    y_for_vw = None
    class_encoder = None
    if label_encoder:
        y_as_str = y.astype('str')
        class_encoder = LabelEncoder().fit(y_as_str)
        y_for_vw = class_encoder.transform(y_as_str)

    return [X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num, class_encoder]

In [7]:
def features_to_sparse(train_data, test_data, feature_cols):
    """Convert selected feature columns of train and test to CSR matrices.

    Every requested column is cast to float and becomes one matrix column,
    in the order given by ``feature_cols``. The ``day_of_week`` and
    ``daytime`` columns are shifted by +1 before being stored.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing all columns named in ``feature_cols``.
    feature_cols : iterable of str
        Column names to extract.

    Returns
    -------
    list of scipy.sparse.csr_matrix
        ``[train_features, test_features]``, each of shape
        ``(n_rows, n_features)`` with dtype float.
    """
    shifted = ("day_of_week", "daytime")
    matrices = []
    for frame in (train_data, test_data):
        n_rows = frame.shape[0]
        values = []
        col_idx = []
        n_cols = 0
        # Entries are laid out column by column: all rows of feature 0,
        # then all rows of feature 1, and so on.
        for n_cols, name in enumerate(feature_cols, 1):
            column = frame[[name]].values.T[0].astype('float')
            if name in shifted:
                column = column + 1
            column = list(column)
            values.extend(column)
            col_idx.extend([n_cols - 1] * len(column))
        row_idx = list(range(n_rows)) * n_cols
        matrices.append(csr_matrix((values, (row_idx, col_idx)),
                                   shape=(n_rows, n_cols), dtype=float))
    return matrices

In [8]:
def combine_sites_features_sparse(sites_train_sparse, features_train_sparse,
                                  sites_test_sparse, features_test_sparse):
    """Append the feature columns to the right of the site columns.

    Train and test are handled separately; each pair is stacked
    horizontally as float and converted to CSR.

    Returns
    -------
    list of scipy.sparse.csr_matrix
        ``[X_train_sparse, X_test_sparse]``.
    """
    pairs = ((sites_train_sparse, features_train_sparse),
             (sites_test_sparse, features_test_sparse))
    return [hstack(list(pair), dtype=float).tocsr() for pair in pairs]

In [10]:
# Read the preprocessed train and test session tables from CSV.
train_data, test_data = (
    load_data(csv_path='kaggle_data/toy_train_w5_old.csv'),
    load_data(csv_path='kaggle_data/full_test.csv'),
)

In [76]:
%%time
# Additionally, compute the share of session time spent on each site within a session
#site_times = calc_site_times_portions(train_data, test_data)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.05 µs


In [77]:
# Convert site times to sparse format
#site_times_sparse = site_times_to_sparse(site_times)
#train_site_times_sparse = site_times_sparse[:len(train_data)]
#test_site_times_sparse = site_times_sparse[len(train_data):]
#site_times_sparse

In [78]:
# Names of all columns from position 20 onward. Slicing `.columns` is
# explicitly positional, whereas indexing the frame with a range of integers
# relied on a positional fallback that newer pandas no longer provides.
list(test_data.columns[20:])

['time_diff1',
 'time_diff2',
 'time_diff3',
 'time_diff4',
 'time_diff5',
 'time_diff6',
 'time_diff7',
 'time_diff8',
 'time_diff9',
 'session_timespan',
 '#unique_sites',
 'site_longest_time',
 'start_hour',
 'day_of_week',
 'daytime',
 'fb_portion',
 'youtube_portion',
 'top30_portion',
 'bot30_portion',
 'prediction']

In [14]:
%%time
X_train_sparse, X_test_sparse, y, y_for_vw, sites_columns_num, class_encoder = \
    sites_to_sparse(train_data, test_data, "target", 10, label_encoder=LabelEncoder())
    
mycolumns = ['day_of_week', 'daytime', 'prediction', 'start_hour', 'youtube_portion', \
             'fb_portion', 'bot30_portion']
#[label for label in test_data[range(20, test_data.shape[1])]]

train_features, test_features = features_to_sparse(train_data, test_data, mycolumns)



X_train_sparse, X_test_sparse = combine_sites_features_sparse(X_train_sparse, train_features, \
                                                             X_test_sparse, \
                                                              test_features)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_sparse, y_for_vw, test_size=0.3, stratify=y_for_vw)

CPU times: user 1.37 s, sys: 144 ms, total: 1.52 s
Wall time: 1.54 s


In [17]:
# Wrap each data split in an xgboost DMatrix and persist it as a binary
# buffer so that later runs can skip the feature-building steps.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtrain.save_binary("kaggle_data/xgb/toy/train.buffer")

dvalid = xgb.DMatrix(X_valid, label=y_valid)
dvalid.save_binary("kaggle_data/xgb/toy/valid.buffer")

dtrainfull = xgb.DMatrix(X_train_sparse, label=y_for_vw)
dtrainfull.save_binary("kaggle_data/xgb/toy/trainfull.buffer")

# The test matrix carries no labels.
dtest = xgb.DMatrix(X_test_sparse)
dtest.save_binary("kaggle_data/xgb/toy/test.buffer")

# Store the validation labels next to the buffers (one label per line).
pd.DataFrame(y_valid).to_csv("kaggle_data/xgb/toy/y_valid_w5_old.csv", header=False, index=False)

In [None]:
# Reload the validation labels from the file this notebook writes next to the
# DMatrix buffers, so they always belong to the same split as valid.buffer.
# `.iloc[:, 0]` turns the single-column frame into a Series without the
# `squeeze=True` argument, which newer pandas no longer accepts.
y_valid = pd.read_csv("kaggle_data/xgb/toy/y_valid_w5_old.csv", header=None).iloc[:, 0]

In [19]:
%%time
# specify parameters via map
param = {'silent':0, 'objective':'multi:softmax', 'num_class':150, 'max_depth':5, 'max_delta_step': 3}
param['eval_metric'] = ['merror']#, 'mlogloss']
param['nthread'] = 2
num_round = 30
evallist  = [(dvalid,'eval'), (dtrain,'train')]
bst = xgb.train(param, dtrain, num_round)#, evallist, verbose_eval=True, early_stopping_rounds=10)
# make prediction
preds = bst.predict(dvalid)
print "Accuracy:", accuracy_score(preds, y_valid)

Accuracy: 0.640883159221
CPU times: user 2min 7s, sys: 248 ms, total: 2min 7s
Wall time: 1min 4s
