# Red hat customer value predction

# Importing data and preprocessing

In [37]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
import xgboost as xgb
from sklearn import cross_validation as cv
from sklearn.metrics import auc
import os
seed = 7
np.random.seed(seed)

In [2]:
#defining function
def clean_data(data_set):
    clean_feature = ['year_x','month_x','day_x','isweekend_x','year_y','month_y','day_y','isweekend_y','outcome','char_38_y']
    colname = data_set.columns
    for i in colname:
        if i not in clean_feature:
            if(data_set[i].dtype == 'object'):
                data_set[i] = data_set[i].fillna('type 0')
                data_set[i] = data_set[i].apply(lambda x : x.split(' ')[1]).astype(np.int32)
            elif(data_set[i].dtype == 'bool'):
                data_set[i] = data_set[i].astype(np.int8)
    return data_set

In [14]:
people = pd.read_csv('./input/people.csv',dtype={'people_id':np.str,'char_38':np.int32},
                    parse_dates=['date'])
act_train = pd.read_csv('./input/act_train.csv',dtype={'people_id':np.str,'activity_id':np.str,'outcome':np.int8},
                        parse_dates=['date'])
act_test = pd.read_csv('./input/act_test.csv',dtype={'people_id':np.str,'activity_id':np.str},
                        parse_dates=['date'])
#Taking care of the date 
people['year'] = people['date'].dt.year
people['month'] = people['date'].dt.month
people['day'] = people['date'].dt.day
people['isweekend'] = (people['date'].dt.weekday>=5).astype(np.int8)
people = people.drop('date',axis=1)

act_train['year'] = act_train['date'].dt.year
act_train['month'] = act_train['date'].dt.month
act_train['day'] = act_train['date'].dt.day
act_train['isweekend'] = (act_train['date'].dt.weekday>=5).astype(np.int8)
act_train = act_train.drop('date',axis=1)

act_test['year'] = act_test['date'].dt.year
act_test['month'] = act_test['date'].dt.month
act_test['day'] = act_test['date'].dt.day
act_test['isweekend'] = (act_test['date'].dt.weekday>=5).astype(np.int8)
act_test = act_test.drop('date',axis=1)

train = act_train.merge(people, on='people_id', how='left', left_index=True)
test  = act_test.merge(people, on='people_id', how='left', left_index=True)

train = train.drop(['people_id','activity_id'],axis=1)
test_activity = test['activity_id']
test = test.drop(['people_id','activity_id'],axis=1)
clean_data(train)
clean_data(test)

del act_train,act_test,people

In [5]:
colname = test.columns
category = ['activity_category', 'char_1_x', 'char_2_x', 'char_3_x', 'char_4_x',
       'char_5_x', 'char_6_x', 'char_7_x', 'char_8_x', 'char_9_x', 'char_10_x',
        'year_x', 'month_x', 'day_x', 'isweekend_x', 'char_1_y',
       'group_1', 'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y', 'char_6_y',
       'char_7_y', 'char_8_y', 'char_9_y', 'year_y', 'month_y', 'day_y', 'isweekend_y']
X_train_total = train[category]
X_test_total = test[category]
y_train_total = train['outcome']
uncategory = []
for i in colname:
    if i not in category:
        uncategory.append(i)

ohencoder = OneHotEncoder()
X_train_total = ohencoder.fit_transform(X_train_total[category])
X_test_total = ohencoder.transform(X_test_total[category])

X_train_total = hstack((X_train_total,train[uncategory]))
X_test_total = hstack((X_test_total,test[uncategory]))

In [6]:
print(X_train_total.shape)
print(y_train_total.shape)
print(X_test_total.shape)

(2197291, 36845)
(2197291,)
(498687, 36845)


# Starting learning

## XGBoost

In [42]:
(X_train,X_val,y_train,y_val) = cv.train_test_split(X_train_total,y_train_total,test_size=0.1,
                                                   random_state=1)
xg_train = xgb.DMatrix(X_train,label = y_train)
xg_test = xgb.DMatrix(X_val,label = y_val)

In [57]:
param = {'booster':'gblinear',
         'max_depth':11,
         'eta':0.05,
         'silent':0,
         'objective':'binary:logistic',
         'nthread':2,
         'eval_metric':'auc',
         'colsample_bytree':0.92,
         'colsample_bylevel':0.9,
         'subsample':0.85,
         'min_child_weight':0
            #'lambda':5,
        #'lambda_bias':0,
        #'alpha':1
        }
num_round = 450
watchlist = [(xg_train,'train'),(xg_test,'test')]
bst = xgb.train(param,xg_train,num_round,watchlist)
ypred = bst.predict(xg_test)
val_auc = auc(y_val, ypred)
print('logloss val {}'.format(val_auc))
del X_train,X_val,y_train,y_val
os.system("printf '\a'")

[0]	train-auc:0.908117	test-auc:0.907075
[1]	train-auc:0.928237	test-auc:0.927101
[2]	train-auc:0.944613	test-auc:0.943495
[3]	train-auc:0.958577	test-auc:0.957478
[4]	train-auc:0.969559	test-auc:0.968474
[5]	train-auc:0.977484	test-auc:0.97643
[6]	train-auc:0.982844	test-auc:0.981832
[7]	train-auc:0.986366	test-auc:0.985399
[8]	train-auc:0.988692	test-auc:0.987772
[9]	train-auc:0.99029	test-auc:0.98941
[10]	train-auc:0.991453	test-auc:0.990604
[11]	train-auc:0.992341	test-auc:0.99152
[12]	train-auc:0.993046	test-auc:0.992245
[13]	train-auc:0.993618	test-auc:0.992837
[14]	train-auc:0.994093	test-auc:0.993329
[15]	train-auc:0.994493	test-auc:0.993743
[16]	train-auc:0.994834	test-auc:0.994097
[17]	train-auc:0.995129	test-auc:0.994404
[18]	train-auc:0.995386	test-auc:0.994671
[19]	train-auc:0.995611	test-auc:0.994905
[20]	train-auc:0.995809	test-auc:0.995111
[21]	train-auc:0.995983	test-auc:0.995292
[22]	train-auc:0.996138	test-auc:0.995453
[23]	train-auc:0.996275	test-auc:0.995596
[24]	t

ValueError: Reordering is not turned on, and the x array is not increasing: [0 0 1 ..., 1 0 0]

### Training with all data

In [58]:
#num_round = 25
xg_train_total = xgb.DMatrix(X_train_total,label = y_train_total)
xg_test_total = xgb.DMatrix(X_test_total)
bst = xgb.train(param,xg_train_total,num_round)
ypred = bst.predict(xg_test_total)
pred_xgb = pd.DataFrame(index = test_activity, data =  ypred ,columns=['outcome'])
pred_xgb.to_csv('xgb_result3.csv')
os.system("printf '\a'")

0

## NNet