# BaseExtractor

In [None]:
!cat competition/extractors/Base.py

In [None]:
# a simple extractor
!cat competition/extractors/Raw.py

# run existing extractors

In [None]:
from util import loadDataStore,loadAll

from competition.extractors.Raw import Raw

from config import extractedDir

import os

# Run the plain Raw extractor over everything returned by loadAll() and store
# the result in raw.db inside the extraction directory.
extractFile = 'raw.db'
extractor = Raw()
alldata = loadAll()

# os.path.join keeps the output inside extractedDir whether or not the
# configured directory ends with a path separator.
extractor.run(alldata, os.path.join(extractedDir, extractFile))

In [None]:
from util import loadDataStore,loadAll

from competition.extractors.Combine import Combine
from competition.extractors.Merge import Merge
from competition.extractors.Raw import Raw

from config import extractedDir

import os

# Chain the Raw extractor with Merge (built from the data store) and write the
# combined output to raw_merge.db inside the extraction directory.
extractFile = 'raw_merge.db'
extractor = Combine([Raw(), Merge(loadDataStore())])
alldata = loadAll()

# os.path.join keeps the output inside extractedDir whether or not the
# configured directory ends with a path separator.
extractor.run(alldata, os.path.join(extractedDir, extractFile))

# prepare the cross-validation (CV) splits

In [None]:
from util import loadDataStore
import pandas as pd
import numpy as np

store = loadDataStore()
train = store['train']
# Integer-dividing clickTime by 10000 keeps only its leading (day) digits;
# the result is kept as a one-column frame still named 'clickTime'.
dates = (train['clickTime'] // 10000).to_frame()
# Mapping: day value -> index labels of the training rows with that value.
dategrp = dates.groupby('clickTime').groups

In [None]:
# Print every day value (clickTime // 10000) together with the number of
# training rows that fall on it.
for k,v in dategrp.items():
    print k,len(v)

In [None]:
# Samples used for cross-validation
from config import cvDB
cvKey = 'cv_simple'
# Each entry is (training day values, validation day values); the values are
# keys of dategrp, i.e. clickTime // 10000.
split_grps = [
    ([17,18,19,20,21,22,23],[24]),
    ([18,19,20,21,22,23,24],[25]),
#    ([25,26,27,28],[29,30]),
]
cv = []
cvgrp = []
for tr_grp, te_grp in split_grps:
    # Collect the row index labels of every day that belongs to this split.
    train_idx = []
    for t in tr_grp:
        train_idx.extend(dategrp[t])
    test_idx = []
    for t in te_grp:
        test_idx.extend(dategrp[t])
    cv.append((train_idx,test_idx))
    cvgrp.append((tr_grp,te_grp))

dfcv = pd.DataFrame(cv)

# Store the index lists under cvKey and the day groups under 'grp_' + cvKey.
# The store is closed even if a write fails, so the HDF5 file handle is not
# left open for the rest of the session.
cvStore = pd.HDFStore(cvDB)
try:
    cvStore[cvKey] = dfcv
    cvStore['grp_'+cvKey] = pd.DataFrame(cvgrp)
    cvStore.flush(fsync=True)
finally:
    cvStore.close()
dfcv

# design and validate an extractor
- get dataset

In [None]:
from util import getTrainAndVal,loadAll
extractFile = 'raw_merge.db'
cvKey = 'cv_simple'
# Load the train/validation sets for the extracted file and the CV split key.
# Presumably one (trX, trY, valX, valY) tuple per split -- that is how
# cvSet[0] is unpacked in the next cell; confirm against util.getTrainAndVal.
cvSet = getTrainAndVal(extractFile,cvKey)
all_data = loadAll()

In [None]:
# Work with the first entry of cvSet and print the shapes of its four parts.
trX,trY,valX,valY = cvSet[0]
print trX.shape,trY.shape
print valX.shape,valY.shape

In [None]:
# trX,trY,valX,valY = cvSet[0]
# print trX.shape,trY.shape
# print valX.shape,valY.shape

- design an extractor

In [None]:
import numpy as np
import pandas as pd
def CvrStatisticsByKey(train_label,X,key):
    """Attach the mean label per value of `key` to X as column `key + 'Cvr'`.

    train_label: frame holding at least the columns `key` and 'label'.
    X:           frame holding the column `key`; it is not modified.
    key:         name of the column to group by.

    Returns the left merge of X with the per-key statistics, so every row of
    X is kept and values of `key` that never occur in train_label get NaN.
    """
    # Built-in grouped mean instead of a Python lambda per group: same
    # numbers, and it also works when train_label has no rows.
    dfCvr = train_label.groupby(key)['label'].mean().reset_index()
    dfCvr.columns=[key,key+'Cvr']
    newX = pd.merge(X,dfCvr,on=key,how='left')
    return newX

def split_time(tm):
    """Split a numeric click time into (day, hour, minute).

    The last two digits of `tm` are the minute, the two before them the hour,
    and everything above is a day number, which is reduced modulo 7.
    """
    day_number = tm // 10000
    hour_and_minute = tm % 10000
    return (day_number % 7, hour_and_minute // 100, tm % 100)

def convertTime(df):
    """Add 'clickDay', 'clickHour' and 'clickMin' columns derived from 'clickTime'.

    The columns are written into `df` itself and the same object is returned.
    'clickTime' is read as a number whose last two digits are the minute, the
    two before them the hour, and the rest a day number; 'clickDay' is that
    day number modulo 7.
    """
    # Column-wise arithmetic instead of a row-by-row apply: same values,
    # far fewer Python calls, and it also works on a frame with no rows.
    clickTime = df['clickTime']
    df['clickDay'] = (clickTime // 10000) % 7
    df['clickHour'] = (clickTime % 10000) // 100
    df['clickMin'] = clickTime % 100
    return df

# Frame the conversion-rate statistics are computed from: a copy of trX with
# the derived time columns added, plus the labels in a 'label' column.
dfTrain = convertTime(trX.copy())
dfTrain['label']=trY.copy()


In [None]:
def _extract(X,y,all_data):
    """Candidate extractor: add per-key conversion-rate columns to X.

    The statistics always come from the notebook-level dfTrain frame. X gets
    the derived time columns added in place by convertTime; the returned
    frame is the merged result without 'clickTime' and 'appID'. y and
    all_data are passed through unchanged.
    """
    convertTime(X)  # adds clickDay / clickHour / clickMin to X itself
    newX = X
    for key in ('appID', 'positionID', 'connectionType',
                'camgaignID', 'count_act', 'clickDay'):
        newX = CvrStatisticsByKey(dfTrain, newX, key)
    for column in ('clickTime', 'appID'):
        del newX[column]

    return newX,y,all_data

In [None]:
# Apply the candidate extractor to the training and the validation split.
# Both calls take their statistics from dfTrain, and _extract adds the time
# columns to trX / valX in place (through convertTime).
e_trX, e_trY, _ = _extract(trX, trY, all_data)
e_valX, e_valY, _ = _extract(valX, valY, all_data)

In [None]:
# Show the first rows of the extracted training features.
e_trX.head()

- explore the new features

In [None]:
# Columns inspected in the following cells: the '<key>Cvr' statistics added by
# the extractor. Candidates currently left out of the list:
# clickDay, clickHour, clickMin, creativeIDCvr.
features = [
    'appIDCvr',
    'positionIDCvr',
    'connectionTypeCvr',
    'clickDayCvr',
    'count_actCvr',
    'camgaignIDCvr',
]

In [None]:
# Summary statistics of the selected feature columns on the training split.
e_trX[features].describe()

In [None]:
%pylab inline
# One histogram per selected feature. Missing values (keys that had no match
# in the statistics) are shown as -1 so that they appear in the plot.
for f in features:
    figure()
    col = e_trX[f].copy()
    col = col.fillna(-1)
    hist(col)
    title(f)

- validation

In [None]:
class OnlyCvr(object):
    """Trivial baseline estimator: predict the per-app conversion rate as is.

    It has the fit/predict shape of the other estimators used in this
    notebook but learns nothing.
    """
    def fit(self,X,y):
        """Do nothing and return self."""
        return self
    def predict(self,X):
        """Return the 'appIDCvr' column of X with missing values set to 0."""
        # The extractor names its columns key + 'Cvr', so the statistic for
        # key 'appID' is 'appIDCvr'; there is no 'appCvr' column.
        return X['appIDCvr'].fillna(0)
    
from xgboost.sklearn import XGBModel

# we only use XGBModel or Simple Classifier as OnlyCvr

# Fixed (untuned) XGBoost settings used for the validation run.
best_param = dict(
    objective='binary:logistic',
    n_estimators=50,
    max_depth=5,
    subsample=0.8,
    colsample_bylevel=0.8,
)

# Alternative baseline: estimator = OnlyCvr()
estimator = XGBModel(**best_param)

In [None]:
# Fit the estimator on the extracted training split.
estimator.fit(e_trX,e_trY)

In [None]:
from competition.models import official_score
# Print the negated official score on the training split, then on the
# validation split.
# NOTE(review): the sign flip suggests official_score returns a negated loss
# (scorer convention) -- confirm in competition.models.
print -official_score(estimator,e_trX,e_trY)
print -official_score(estimator,e_valX,e_valY)

In [None]:
%pylab inline
# only for xgb
# Horizontal bar chart of the fitted model's feature importances.
figure(figsize=(10,10))

fimp = estimator.feature_importances_
# assumes the importances are ordered like the columns of e_trX -- TODO confirm
fnames=e_trX.columns

idx = np.arange(len(fimp))
barh(idx, fimp)
# tick labels are placed 0.5 above each bar position
yticks(idx+0.5,fnames)
show()

# we keep record here
## record1 (no-merge)
- 0.0999130021573
- 0.108799880583

## current best setting

    newX = convertTime(X)
    newX = X
    newX = CvrStatisticsByKey(dfTrain,newX,'appID')
    newX = CvrStatisticsByKey(dfTrain,newX,'positionID')
    newX = CvrStatisticsByKey(dfTrain,newX,'connectionType')
    newX = CvrStatisticsByKey(dfTrain,newX,'camgaignID')
    newX = CvrStatisticsByKey(dfTrain,newX,'count_act')
    newX = CvrStatisticsByKey(dfTrain,newX,'clickDay')
    del newX['clickTime']
    del newX['appID']
- 0.0991358327441
- 0.106815019252    

# save and run the new extractor
- save extractor into file
- import and run

In [None]:
%%writefile competition/extractors/StatsFeatures.py
from competition.extractors.Base import BaseExtractor
import pandas as pd
import numpy as np
import os

def CvrStatisticsByKey(train_label,X,key):
    """Attach the mean label per value of `key` to X as column `key + 'Cvr'`.

    train_label: frame holding at least the columns `key` and 'label'.
    X:           frame holding the column `key`; it is not modified.
    key:         name of the column to group by.

    Returns the left merge of X with the per-key statistics, so every row of
    X is kept and values of `key` that never occur in train_label get NaN.
    """
    # Built-in grouped mean instead of a Python lambda per group: same
    # numbers, and it also works when train_label has no rows.
    dfCvr = train_label.groupby(key)['label'].mean().reset_index()
    dfCvr.columns=[key,key+'Cvr']
    newX = pd.merge(X,dfCvr,on=key,how='left')
    return newX

def split_time(tm):
    """Split a numeric click time into (day, hour, minute).

    The last two digits of `tm` are the minute, the two before them the hour,
    and everything above is a day number, which is reduced modulo 7.
    """
    day=(tm//10000)%7
    hour = (tm%10000)//100
    minute = (tm%100)
    # Order must be (day, hour, minute): convertTime unpacks the tuple into
    # clickDay, clickHour, clickMin in exactly that order.
    return (day,hour,minute)

def convertTime(df):
    """Add 'clickDay', 'clickHour' and 'clickMin' columns derived from 'clickTime'.

    The columns are written into `df` itself and the same object is returned.
    'clickTime' is read as a number whose last two digits are the minute, the
    two before them the hour, and the rest a day number; 'clickDay' is that
    day number modulo 7.
    """
    # Each column is computed directly from 'clickTime', so hour and minute
    # cannot be mixed up by the order of an intermediate tuple. Column-wise
    # arithmetic also avoids a row-by-row apply and works on an empty frame.
    clickTime = df['clickTime']
    df['clickDay'] = (clickTime // 10000) % 7
    df['clickHour'] = (clickTime % 10000) // 100
    df['clickMin'] = clickTime % 100
    return df

def stats_extract(X,y,raw_data,dfTrain):
    """Add per-key conversion-rate columns, computed from dfTrain, to X.

    X gets the derived time columns added in place by convertTime; the
    returned frame is the merged result without 'clickTime' and 'appID'.
    y and raw_data are passed through unchanged.
    """
    convertTime(X)  # adds clickDay / clickHour / clickMin to X itself
    newX = X
    for key in ('appID', 'positionID', 'connectionType',
                'camgaignID', 'count_act', 'clickDay'):
        newX = CvrStatisticsByKey(dfTrain, newX, key)
    for column in ('clickTime', 'appID'):
        del newX[column]
    return newX,y,raw_data

class StatsFeatures(BaseExtractor):
    """Extractor adding conversion-rate statistics taken from a fixed labelled set.

    The labelled set is captured once at construction time; the train and the
    test path both go through the same _extract step.
    """

    def __init__(self,X,y):
        # Work on copies so the caller's X and y are left untouched.
        labelled = convertTime(X.copy())
        labelled['label'] = y.copy()
        self.dfTrain = labelled

    def _extract(self,X,y,raw_data):
        """Run stats_extract with the statistics source stored on this instance."""
        return stats_extract(X,y,raw_data,self.dfTrain)

    def get_train(self,X,y,raw_data):
        """Return the extracted features for the training data."""
        return self._extract(X,y,raw_data)

    def get_test(self,X,y,raw_data):
        """Return the extracted features for the test data."""
        return self._extract(X,y,raw_data)

- run extractor by loading from file

In [None]:
from util import loadDataStore,loadAll

from competition.extractors.Combine import Combine
from competition.extractors.Wrapper import Wrapper
from competition.extractors.StatsFeatures import StatsFeatures

from config import extractedDir

import pandas as pd
import os
inFile = 'raw_merge.db'
ext = pd.HDFStore(os.path.join(extractedDir,inFile))
trX = ext['trX']
trY = ext['trY']
teX = ext['teX']


extractFile = 'raw_merge_stats.db'
extractor= Combine([Wrapper(trX,trY,teX),StatsFeatures(trX,trY)])
alldata = loadAll()

extractor.run(alldata,extractedDir+extractFile)
print 'done'

# train a model without hyper-parameter tuning

In [None]:
class OnlyCvr(object):
    """Trivial baseline estimator: predict the per-app conversion rate as is.

    It has the fit/predict shape of the other estimators used in this
    notebook but learns nothing.
    """
    def fit(self,X,y):
        """Do nothing and return self."""
        return self
    def predict(self,X):
        """Return the 'appIDCvr' column of X with missing values set to 0."""
        # The extractor names its columns key + 'Cvr', so the statistic for
        # key 'appID' is 'appIDCvr'; there is no 'appCvr' column.
        return X['appIDCvr'].fillna(0)
    
from xgboost.sklearn import XGBModel

# we only use XGBModel or Simple Classifier as OnlyCvr

# Fixed (untuned) XGBoost settings for the model trained on the full
# extracted training set.
best_param = dict(
    objective='binary:logistic',
    n_estimators=50,
    max_depth=5,
    subsample=0.8,
    colsample_bylevel=0.8,
)

# Alternative baseline: estimator = OnlyCvr()
estimator = XGBModel(**best_param)
# clf is a second name for the same estimator object.
clf = estimator

In [None]:
from util import loadExtracted
extractFile = 'raw_merge_stats.db'
# Load the extracted data set; the next cell reads its 'trX' and 'trY' entries.
dset = loadExtracted(extractFile)
dset

In [None]:
# Fit on the complete extracted training set (no validation hold-out here).
clf.fit(dset['trX'],dset['trY'])

In [None]:
from util import saveModel
# Save the fitted model, identified by extract file, estimator name and
# parameter-set name.
saveModel(clf,extractFile,estimator_name='XGB',para_name='_FE')

# generate predictions with the untuned model

In [None]:
from util import loadModel
extractFile = 'raw_merge_stats.db'
# Reload the model with the same identifiers that were passed to saveModel.
clf=loadModel(extractFile,estimator_name='XGB',para_name='_FE')

In [None]:
# Display the reloaded model as a sanity check.
clf

In [None]:
from util import predictResult
# Produce the prediction output for this model / extract-file combination.
# Presumably written under ./_results/, which the next cell lists -- confirm
# in util.predictResult.
predictResult(clf,extractFile,estimator_name='XGB',para_name='_FE') # feature engineered

In [None]:
!ls ./_results/raw_merge_stats.db-XGB_FE