In [1]:
%pylab inline
from __future__ import division
import pandas as pd
import datetime as dt

Populating the interactive namespace from numpy and matplotlib


In [2]:
u = pd.read_csv('tianchi_mobile_recommend_train_user_filtered.csv')
uci = u.set_index(['user_id', 'item_category', 'item_id'])

In [3]:
i = pd.read_csv('tianchi_mobile_recommend_train_item.csv', index_col=['item_id'])

# Utilities...

In [449]:
class DataSetCache(object):
    def __init__(self):
        super(Cache, self).__init__()
        self._cache = dict()
        self._generator = dict()
    def __call__(self, generator):
        name = generator.func_name
        self._cache[name] = dict()
        self._generator[name] = generator
        def gen(date):
            if not self._cache[name].has_key(date):
                self._cache[name][date] = self._generator[name](date)
            return self._cache[name][date]
        gen.func_name = name
        return gen
cache = DataSetCache()

# Generating training dataset

In [458]:
@cache
def stats_data_set(label_date):
    period_end = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(1)).strftime('%Y-%m-%d')
    period_start = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(4)).strftime('%Y-%m-%d')
    
    du = u[u.date == period_end]
    pu = u[(u.date >= period_start) & (u.date <= period_end)]
    lu = u[(u.date == label_date) & (u.behavior_type == 4)]

    uif = pd.DataFrame(dict(
            item_category=pu.groupby(['user_id', 'item_id']).item_category.first(),
            user_item_cnt=pu.groupby(['user_id', 'item_id']).behavior_type.size(),
            last_mark_hr=du[du.behavior_type==2].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_cart_hr=du[du.behavior_type==3].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_buy_hr=du[du.behavior_type==4].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_opr_hr=du.groupby(['user_id', 'item_id']).hour.max() + 1,
            first_opr_hr=du.groupby(['user_id', 'item_id']).hour.min()
    )).reset_index()
    ucf = pd.DataFrame(dict(
            user_category_cnt=pu.groupby(['user_id', 'item_category']).behavior_type.size(),
            user_category_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id', 'item_category']).behavior_type.size()
    )).reset_index()
    cf = pd.DataFrame(dict(
            category_cnt=pu.groupby(['item_category']).behavior_type.size(),
            category_buy_cnt=pu[pu.behavior_type==4].groupby(['item_category']).behavior_type.size()
    )).reset_index()
    itf = pd.DataFrame(dict(
            item_cnt=pu.groupby(['item_id']).behavior_type.size(),
            item_buy_cnt=pu[pu.behavior_type==4].groupby(['item_id']).behavior_type.size()
    )).reset_index()
    uf = pd.DataFrame(dict(
            user_cnt=pu.groupby(['user_id']).behavior_type.size(),
            user_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id']).behavior_type.size()
    )).reset_index()
    
    ldf = pd.DataFrame(dict(
            label=lu.groupby(['user_id', 'item_id']).behavior_type.first().map(lambda d: 1)
    )).reset_index()
    
    return pd.merge(traindf, ldf, on=['user_id', 'item_id'], how='left').fillna(0).set_index(['user_id', 'item_category', 'item_id'])

In [459]:
@cache
def rank_data_set(label_date):
    period_end = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(1)).strftime('%Y-%m-%d')
    period_start = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(4)).strftime('%Y-%m-%d')
    
    du = u[u.date == period_end]
    pu = u[(u.date >= period_start) & (u.date <= period_end)]
    lu = u[(u.date == label_date) & (u.behavior_type == 4)]

    uif = pd.DataFrame(dict(
            item_category=pu.groupby(['user_id', 'item_id']).item_category.first(),
            user_item_cnt=pu.groupby(['user_id', 'item_id']).behavior_type.size(),
            last_mark_hr=du[du.behavior_type==2].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_cart_hr=du[du.behavior_type==3].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_buy_hr=du[du.behavior_type==4].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_opr_hr=du.groupby(['user_id', 'item_id']).hour.max() + 1,
            first_opr_hr=du.groupby(['user_id', 'item_id']).hour.min()
    )).reset_index()
    ucf = pd.DataFrame(dict(
            user_category_cnt=pu.groupby(['user_id', 'item_category']).behavior_type.size(),
            user_category_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id', 'item_category']).behavior_type.size()
    )).reset_index()
    cf = pd.DataFrame(dict(
            category_cnt=pu.groupby(['item_category']).behavior_type.size(),
            category_buy_cnt=pu[pu.behavior_type==4].groupby(['item_category']).behavior_type.size()
    )).reset_index()
    itf = pd.DataFrame(dict(
            item_cnt=pu.groupby(['item_id']).behavior_type.size(),
            item_buy_cnt=pu[pu.behavior_type==4].groupby(['item_id']).behavior_type.size()
    )).reset_index()
    uf = pd.DataFrame(dict(
            user_cnt=pu.groupby(['user_id']).behavior_type.size(),
            user_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id']).behavior_type.size()
    )).reset_index()

    ldf = pd.DataFrame(dict(
            label=lu.groupby(['user_id', 'item_id']).behavior_type.first().map(lambda d: 1)
    )).reset_index()

    traindf = pd.merge(
        pd.merge(
            pd.merge(
                pd.merge(
                    uif,
                    uf,
                    on=['user_id'], how='left'
                ).fillna(0),
                itf,
                on=['item_id'], how='left'
            ).fillna(0),
            cf,
            on=['item_category'], how='left'
        ).fillna(0),
        ucf,
        on=['user_id', 'item_category'], how='left'
    ).fillna(0)
    
    # rank
    traindf['item_rank'] = traindf.item_cnt.rank(ascending=False)
    traindf['user_rank'] = traindf.user_cnt.rank(ascending=False)
    traindf['category_rank'] = traindf.category_cnt.rank(ascending=False)
    traindf['user_item_rank'] = traindf.groupby(['user_id', 'item_category']).user_item_cnt.rank(ascending=False)
    traindf['user_category_rank'] = traindf.groupby(['user_id']).user_category_cnt.rank(ascending=False)
    traindf['item_category_rank'] = traindf.groupby(['item_category']).item_cnt.rank(ascending=False)
    traindf['item_category_buy_rank'] = traindf.groupby(['item_category']).item_buy_cnt.rank(ascending=False)
    
#     del traindf['user_item_cnt'], traindf['user_category_cnt'], traindf['user_cnt'], traindf['item_cnt'], traindf['category_cnt']
    
    return pd.merge(traindf, ldf, on=['user_id', 'item_id'], how='left').fillna(0).set_index(['user_id', 'item_category', 'item_id'])

In [464]:
@cache
def ratio_data_set(label_date):
    period_end = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(1)).strftime('%Y-%m-%d')
    period_start = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(4)).strftime('%Y-%m-%d')
    
    du = u[u.date == period_end]
    pu = u[(u.date >= period_start) & (u.date <= period_end)]
    lu = u[(u.date == label_date) & (u.behavior_type == 4)]

    uif = pd.DataFrame(dict(
            item_category=pu.groupby(['user_id', 'item_id']).item_category.first(),
            user_item_cnt=pu.groupby(['user_id', 'item_id']).behavior_type.size(),
            last_mark_hr=du[du.behavior_type==2].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_cart_hr=du[du.behavior_type==3].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_buy_hr=du[du.behavior_type==4].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_opr_hr=du.groupby(['user_id', 'item_id']).hour.max() + 1,
            first_opr_hr=du.groupby(['user_id', 'item_id']).hour.min()
    )).reset_index()
    ucf = pd.DataFrame(dict(
            user_category_cnt=pu.groupby(['user_id', 'item_category']).behavior_type.size(),
            user_category_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id', 'item_category']).behavior_type.size()
    )).reset_index()
    cf = pd.DataFrame(dict(
            category_cnt=pu.groupby(['item_category']).behavior_type.size(),
            category_buy_cnt=pu[pu.behavior_type==4].groupby(['item_category']).behavior_type.size()
    )).reset_index()
    itf = pd.DataFrame(dict(
            item_cnt=pu.groupby(['item_id']).behavior_type.size(),
            item_buy_cnt=pu[pu.behavior_type==4].groupby(['item_id']).behavior_type.size()
    )).reset_index()
    uf = pd.DataFrame(dict(
            user_cnt=pu.groupby(['user_id']).behavior_type.size(),
            user_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id']).behavior_type.size()
    )).reset_index()

    ldf = pd.DataFrame(dict(
            label=lu.groupby(['user_id', 'item_id']).behavior_type.first().map(lambda d: 1)
    )).reset_index()

    traindf = pd.merge(
        pd.merge(
            pd.merge(
                pd.merge(
                    uif,
                    uf,
                    on=['user_id'], how='left'
                ).fillna(0),
                itf,
                on=['item_id'], how='left'
            ).fillna(0),
            cf,
            on=['item_category'], how='left'
        ).fillna(0),
        ucf,
        on=['user_id', 'item_category'], how='left'
    ).fillna(0)
    
    # ratio
    traindf['user_item_ratio'] = traindf.user_item_cnt / traindf.user_category_cnt
    traindf['user_category_ratio'] = traindf.user_category_cnt / traindf.user_cnt
    traindf['item_category_ratio'] = traindf.item_cnt / traindf.category_cnt
    traindf['item_ratio'] = traindf.item_cnt / len(pu)
    traindf['user_ratio'] = traindf.user_cnt / len(pu)
    traindf['category_ratio'] = traindf.category_cnt / len(pu)
    traindf['user_buy_ratio'] = traindf.user_buy_cnt / traindf.user_cnt
    traindf['item_buy_ratio'] = traindf.item_buy_cnt / traindf.item_cnt
    traindf['category_buy_ratio'] = traindf.category_buy_cnt / traindf.category_cnt

#     del traindf['user_item_cnt'], traindf['user_category_cnt'], traindf['user_cnt'], traindf['item_cnt'], traindf['category_cnt']
    
    return pd.merge(traindf, ldf, on=['user_id', 'item_id'], how='left').fillna(0).set_index(['user_id', 'item_category', 'item_id'])

In [557]:
@cache
def pure_ratio_data_set(label_date):
    period_end = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(1)).strftime('%Y-%m-%d')
    period_start = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(4)).strftime('%Y-%m-%d')
    
    du = u[u.date == period_end]
    pu = u[(u.date >= period_start) & (u.date <= period_end)]
    lu = u[(u.date == label_date) & (u.behavior_type == 4)]

    uif = pd.DataFrame(dict(
            item_category=pu.groupby(['user_id', 'item_id']).item_category.first(),
            user_item_cnt=pu.groupby(['user_id', 'item_id']).behavior_type.size(),
            last_mark_hr=(du[du.behavior_type==2].groupby(['user_id', 'item_id']).hour.max() + 1) / 24,
            last_cart_hr=(du[du.behavior_type==3].groupby(['user_id', 'item_id']).hour.max() + 1) / 24,
            last_buy_hr=-(du[du.behavior_type==4].groupby(['user_id', 'item_id']).hour.max() + 1) / 24,
            last_opr_hr=(du.groupby(['user_id', 'item_id']).hour.max() + 1) / 24,
            first_opr_hr=(du.groupby(['user_id', 'item_id']).hour.min()) / 24
    )).reset_index()
    ucf = pd.DataFrame(dict(
            user_category_cnt=pu.groupby(['user_id', 'item_category']).behavior_type.size(),
            user_category_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id', 'item_category']).behavior_type.size()
    )).reset_index()
    cf = pd.DataFrame(dict(
            category_cnt=pu.groupby(['item_category']).behavior_type.size(),
            category_buy_cnt=pu[pu.behavior_type==4].groupby(['item_category']).behavior_type.size()
    )).reset_index()
    itf = pd.DataFrame(dict(
            item_cnt=pu.groupby(['item_id']).behavior_type.size(),
            item_buy_cnt=pu[pu.behavior_type==4].groupby(['item_id']).behavior_type.size()
    )).reset_index()
    uf = pd.DataFrame(dict(
            user_cnt=pu.groupby(['user_id']).behavior_type.size(),
            user_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id']).behavior_type.size()
    )).reset_index()

    ldf = pd.DataFrame(dict(
            label=lu.groupby(['user_id', 'item_id']).behavior_type.first().map(lambda d: 1)
    )).reset_index()

    traindf = pd.merge(
        pd.merge(
            pd.merge(
                pd.merge(
                    uif,
                    uf,
                    on=['user_id'], how='left'
                ).fillna(0),
                itf,
                on=['item_id'], how='left'
            ).fillna(0),
            cf,
            on=['item_category'], how='left'
        ).fillna(0),
        ucf,
        on=['user_id', 'item_category'], how='left'
    ).fillna(0)
    
    # ratio
    traindf['user_item_ratio'] = traindf.user_item_cnt / traindf.user_category_cnt
    traindf['user_category_ratio'] = traindf.user_category_cnt / traindf.user_cnt
    traindf['item_category_ratio'] = traindf.item_cnt / traindf.category_cnt
    traindf['item_ratio'] = traindf.item_cnt / len(pu)
    traindf['user_ratio'] = traindf.user_cnt / len(pu)
    traindf['category_ratio'] = traindf.category_cnt / len(pu)
    traindf['user_buy_ratio'] = traindf.user_buy_cnt / traindf.user_cnt
    traindf['item_buy_ratio'] = traindf.item_buy_cnt / traindf.item_cnt
    traindf['category_buy_ratio'] = traindf.category_buy_cnt / traindf.category_cnt

#     del traindf['user_item_cnt'], traindf['user_category_cnt'], traindf['user_cnt'], traindf['item_cnt'], traindf['category_cnt']
    
    return pd.merge(traindf, ldf, on=['user_id', 'item_id'], how='left').fillna(0).set_index(['user_id', 'item_category', 'item_id'])

In [460]:
@cache
def mixed_data_set(label_date):
    period_end = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(1)).strftime('%Y-%m-%d')
    period_start = (dt.datetime.strptime(label_date, '%Y-%m-%d') - dt.timedelta(4)).strftime('%Y-%m-%d')
    
    du = u[u.date == period_end]
    pu = u[(u.date >= period_start) & (u.date <= period_end)]
    lu = u[(u.date == label_date) & (u.behavior_type == 4)]

    uif = pd.DataFrame(dict(
            item_category=pu.groupby(['user_id', 'item_id']).item_category.first(),
            user_item_cnt=pu.groupby(['user_id', 'item_id']).behavior_type.size(),
            last_mark_hr=du[du.behavior_type==2].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_cart_hr=du[du.behavior_type==3].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_buy_hr=du[du.behavior_type==4].groupby(['user_id', 'item_id']).hour.max() + 1,
            last_opr_hr=du.groupby(['user_id', 'item_id']).hour.max() + 1,
            first_opr_hr=du.groupby(['user_id', 'item_id']).hour.min()
    )).reset_index()
    ucf = pd.DataFrame(dict(
            user_category_cnt=pu.groupby(['user_id', 'item_category']).behavior_type.size(),
            user_category_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id', 'item_category']).behavior_type.size()
    )).reset_index()
    cf = pd.DataFrame(dict(
            category_cnt=pu.groupby(['item_category']).behavior_type.size(),
            category_buy_cnt=pu[pu.behavior_type==4].groupby(['item_category']).behavior_type.size()
    )).reset_index()
    itf = pd.DataFrame(dict(
            item_cnt=pu.groupby(['item_id']).behavior_type.size(),
            item_buy_cnt=pu[pu.behavior_type==4].groupby(['item_id']).behavior_type.size()
    )).reset_index()
    uf = pd.DataFrame(dict(
            user_cnt=pu.groupby(['user_id']).behavior_type.size(),
            user_buy_cnt=pu[pu.behavior_type==4].groupby(['user_id']).behavior_type.size()
    )).reset_index()

    ldf = pd.DataFrame(dict(
            label=lu.groupby(['user_id', 'item_id']).behavior_type.first().map(lambda d: 1)
    )).reset_index()

    traindf = pd.merge(
        pd.merge(
            pd.merge(
                pd.merge(
                    uif,
                    uf,
                    on=['user_id'], how='left'
                ).fillna(0),
                itf,
                on=['item_id'], how='left'
            ).fillna(0),
            cf,
            on=['item_category'], how='left'
        ).fillna(0),
        ucf,
        on=['user_id', 'item_category'], how='left'
    ).fillna(0)
    
    # ratio
    traindf['user_item_ratio'] = traindf.user_item_cnt / traindf.user_category_cnt
    traindf['user_category_ratio'] = traindf.user_category_cnt / traindf.user_cnt
    traindf['item_category_ratio'] = traindf.item_cnt / traindf.category_cnt
    traindf['item_ratio'] = traindf.item_cnt / len(pu)
    traindf['user_ratio'] = traindf.user_cnt / len(pu)
    traindf['category_ratio'] = traindf.category_cnt / len(pu)
    traindf['user_buy_ratio'] = traindf.user_buy_cnt / traindf.user_cnt
    traindf['item_buy_ratio'] = traindf.item_buy_cnt / traindf.item_cnt
    traindf['category_buy_ratio'] = traindf.category_buy_cnt / traindf.category_cnt
    
    # rank
    traindf['item_rank'] = traindf.item_cnt.rank(ascending=False)
    traindf['user_rank'] = traindf.user_cnt.rank(ascending=False)
    traindf['category_rank'] = traindf.category_cnt.rank(ascending=False)
    traindf['user_item_rank'] = traindf.groupby(['user_id', 'item_category']).user_item_cnt.rank(ascending=False)
    traindf['user_category_rank'] = traindf.groupby(['user_id']).user_category_cnt.rank(ascending=False)
    traindf['item_category_rank'] = traindf.groupby(['item_category']).item_cnt.rank(ascending=False)
    traindf['item_category_buy_rank'] = traindf.groupby(['item_category']).item_buy_cnt.rank(ascending=False)
    
#     del traindf['user_item_cnt'], traindf['user_category_cnt'], traindf['user_cnt'], traindf['item_cnt'], traindf['category_cnt']
    
    return pd.merge(traindf, ldf, on=['user_id', 'item_id'], how='left').fillna(0).set_index(['user_id', 'item_category', 'item_id'])

In [560]:
traindf = pure_ratio_data_set('2014-12-16')

In [561]:
len(traindf[traindf.label == 1]) / len(traindf[traindf.label==0])

0.001324006443498025

In [563]:
traindf[traindf.last_cart_hr > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,first_opr_hr,last_buy_hr,last_cart_hr,last_mark_hr,last_opr_hr,user_item_cnt,user_buy_cnt,user_cnt,item_buy_cnt,item_cnt,...,user_item_ratio,user_category_ratio,item_category_ratio,item_ratio,user_ratio,category_ratio,user_buy_ratio,item_buy_ratio,category_buy_ratio,label
user_id,item_category,item_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
120873,3064,95875607,0.666667,-0.875000,0.875000,0.000000,0.875000,14,1,286,2,54,...,0.056225,0.870629,0.002673,0.000237,0.001253,0.088506,0.003497,0.037037,0.013808,0
120873,3064,163309187,0.791667,0.000000,0.833333,0.833333,0.833333,5,1,286,0,5,...,0.020080,0.870629,0.000247,0.000022,0.001253,0.088506,0.003497,0.000000,0.013808,0
120873,3064,297010182,0.791667,0.000000,0.833333,0.000000,0.833333,2,1,286,0,2,...,0.008032,0.870629,0.000099,0.000009,0.001253,0.088506,0.003497,0.000000,0.013808,0
189833,3064,59582657,0.000000,-1.000000,0.041667,0.041667,1.000000,25,5,458,1,30,...,0.173611,0.314410,0.001485,0.000131,0.002006,0.088506,0.010917,0.033333,0.013808,0
189833,3064,163197167,0.000000,-1.000000,0.041667,0.000000,1.000000,10,5,458,1,10,...,0.069444,0.314410,0.000495,0.000044,0.002006,0.088506,0.010917,0.100000,0.013808,0
189833,3064,179175645,0.000000,0.000000,0.041667,0.000000,0.958333,5,5,458,0,5,...,0.034722,0.314410,0.000247,0.000022,0.002006,0.088506,0.010917,0.000000,0.013808,0
189833,3064,260748210,0.000000,0.000000,0.041667,0.000000,0.958333,8,5,458,0,11,...,0.055556,0.314410,0.000544,0.000048,0.002006,0.088506,0.010917,0.000000,0.013808,0
189833,3064,292673677,0.000000,0.000000,0.041667,0.000000,0.041667,8,5,458,0,8,...,0.055556,0.314410,0.000396,0.000035,0.002006,0.088506,0.010917,0.000000,0.013808,0
511319,13985,306490050,0.291667,0.000000,0.333333,0.000000,0.333333,2,0,2,0,2,...,1.000000,1.000000,0.005495,0.000009,0.000009,0.001594,0.000000,0.000000,0.027473,0
632347,4778,100079162,0.500000,0.000000,0.541667,0.000000,0.541667,4,0,22,0,19,...,1.000000,0.181818,0.007233,0.000083,0.000096,0.011507,0.000000,0.000000,0.005710,0


In [454]:
traindata = traindf.values

# Feature selection.

In [421]:
from sklearn import feature_selection as fsel

In [429]:
X_new = fsel.SelectKBest(fsel.chi2, k=8).fit_transform(traindata[:, :-1], traindata[:, -1])

In [430]:
X_new

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          6.14645000e+04,   3.19920000e+04,   2.43000000e+02],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          7.49715000e+04,   3.19920000e+04,   3.52000000e+02],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          2.81660000e+04,   4.43140000e+04,   1.83500000e+02],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          7.28385000e+04,   7.59425000e+04,   5.29800000e+03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          7.75765000e+04,   1.89105000e+04,   1.65000000e+02],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          8.67800000e+03,   1.78415000e+04,   9.50000000e+00]])

# Preparing for the validation...

In [142]:
from sklearn import cross_validation as cv

In [172]:
def sample_validation(clf, traindata):
    X_train, X_test, y_train, y_test = cv.train_test_split(traindata[:, :-1], traindata[:, -1], test_size=0.4, random_state=0)
    print clf
    clf.fit(X_train, y_train)
    result = zip(y_test.tolist(), clf.predict(X_test).tolist())
    print 'Actual:\t', len(filter(lambda d: d[0]==1, result))
    print 'Model:\t', len(filter(lambda d: d[1]==1, result))
    print 'Hit:\t', len(filter(lambda d: d[0] == 1 and d[0] == d[1], result))

In [176]:
def training_validation(clf, traindata):
    print clf
    clf.fit(traindata[:, :-1], traindata[:, -1])
    result = zip(traindata[:, -1].tolist(), clf.predict(traindata[:, :-1]).tolist())
    print 'Actual:\t', len(filter(lambda d: d[0]==1, result))
    print 'Model:\t', len(filter(lambda d: d[1]==1, result))
    print 'Hit:\t', len(filter(lambda d: d[0] == 1 and d[0] == d[1], result))

In [473]:
def cross_date_validation(clf, date, data_set=mixed_data_set):
    test_date = date
    train_date = (dt.datetime.strptime(date, '%Y-%m-%d') - dt.timedelta(1)).strftime('%Y-%m-%d')
    
    train_data = data_set(train_date).values
    test_data = data_set(test_date).values
    
    clf.fit(train_data[:, :-1], train_data[:, -1])
    result = zip(test_data[:, -1].tolist(), clf.predict(test_data[:, :-1]).tolist())
    
    model_actual = len(filter(lambda d: d[0]==1, result))
    model_predicted = len(filter(lambda d: d[1]==1, result))
    model_hit = len(filter(lambda d: d[0] == 1 and d[0] == d[1], result))
    
    actual = len(u[(u.date == date) & (u.behavior_type == 4)].groupby(['user_id', 'item_id']))
    
    prec = model_hit / model_predicted
    recl = model_hit / actual
    f1 = 2 * prec * recl / (prec + recl)
    
    print clf
    print 'Using data set `{0}` at {1}.'.format(data_set.func_name, date)
    print 'For test: positives={0:6d}, predicted\t={1:6d}, hit\t={2:6d}'.format(model_actual, model_predicted, model_hit)
    print 'Actual:   positives={0:6d}'.format(actual)
    print 'Ratio:    precision={0:0.4f}, recall\t={1:0.4f}, f1\t={2:0.4f}'.format(prec, recl, f1)

In [612]:
def cross_date_test(clf, date, data_set=mixed_data_set):
    test_data = data_set(date).values
    
    result = zip(test_data[:, -1].tolist(), clf.predict(test_data[:, :-1]).tolist())
    
    model_actual = len(filter(lambda d: d[0]==1, result))
    model_predicted = len(filter(lambda d: d[1]==1, result))
    model_hit = len(filter(lambda d: d[0] == 1 and d[0] == d[1], result))
    
    actual = len(u[(u.date == date) & (u.behavior_type == 4)].groupby(['user_id', 'item_id']))
    
    prec = model_hit / model_predicted
    recl = model_hit / actual
    f1 = 2 * prec * recl / (prec + recl)
    
    print clf
    print 'Using data set `{0}` at {1}.'.format(data_set.func_name, date)
    print 'For test: positives={0:6d}, predicted\t={1:6d}, hit\t={2:6d}'.format(model_actual, model_predicted, model_hit)
    print 'Actual:   positives={0:6d}'.format(actual)
    print 'Ratio:    precision={0:0.4f}, recall\t={1:0.4f}, f1\t={2:0.4f}'.format(prec, recl, f1)

# Linear Modeling

Linear model is something that we would easily ignore as the nature of many classification problems are not simply a linear model. However, linear model is such a special case that it can engage many other non-linear functions to gain a linear model, which is also the most important factor to be considered in the model.

In [12]:
from sklearn import linear_model as lm

In [309]:
training_validation(lm.LogisticRegressionCV(class_weight={0: 1, 1: 349}), traindata)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 349}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Actual:	130
Model:	4499
Hit:	82


In [310]:
sample_validation(lm.LogisticRegressionCV(class_weight={0: 1, 1: 349}), traindata)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 349}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Actual:	54
Model:	2032
Hit:	35


In [718]:
cross_date_validation(
    lm.LogisticRegressionCV(class_weight={0: 1, 1: 30}, n_jobs=-1),
    '2014-12-18'
)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 30}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Using data set `mixed_data_set` at 2014-12-18.
For test: positives=   109, predicted	=   120, hit	=     8
Actual:   positives=   393
Ratio:    precision=0.0667, recall	=0.0204, f1	=0.0312


## On ratio data set.

In [494]:
cross_date_validation(
    lm.LogisticRegressionCV(class_weight={0: 1, 1: 30}, n_jobs=-1),
    '2014-12-18', data_set=ratio_data_set
)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 30}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Using data set `ratio_data_set` at 2014-12-18.
For test: positives=   109, predicted	=   734, hit	=    27
Actual:   positives=   393
Ratio:    precision=0.0368, recall	=0.0687, f1	=0.0479


In [543]:
lrcv = lm.LogisticRegressionCV(class_weight={0: 1, 1: 30}, n_jobs=-1)
cross_date_validation(
    lrcv,
    '2014-12-17', data_set=ratio_data_set
)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 30}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Using data set `ratio_data_set` at 2014-12-17.
For test: positives=   130, predicted	=   377, hit	=    27
Actual:   positives=   392
Ratio:    precision=0.0716, recall	=0.0689, f1	=0.0702


In [551]:
lrcv.Cs_

array([  1.00000000e-04,   7.74263683e-04,   5.99484250e-03,
         4.64158883e-02,   3.59381366e-01,   2.78255940e+00,
         2.15443469e+01,   1.66810054e+02,   1.29154967e+03,
         1.00000000e+04])

In [474]:
cross_date_validation(
    lm.LogisticRegressionCV(class_weight={0: 1, 1: 30}, n_jobs=-1),
    '2014-12-16', data_set=ratio_data_set
)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 30}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Using data set `ratio_data_set` at 2014-12-16.
For test: positives=   120, predicted	=   645, hit	=    25
Actual:   positives=   445
Ratio:    precision=0.0388, recall	=0.0562, f1	=0.0459


## On pure ration data set.

In [712]:
cross_date_validation(
    lm.LogisticRegressionCV(class_weight={0: 1, 1: 150}, n_jobs=-1),
    '2014-12-17', data_set=pure_ratio_data_set
)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 150}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Using data set `pure_ratio_data_set` at 2014-12-17.
For test: positives=   130, predicted	=  1629, hit	=    54
Actual:   positives=   392
Ratio:    precision=0.0331, recall	=0.1378, f1	=0.0534


In [719]:
cross_date_validation(
    lm.LogisticRegressionCV(class_weight={0: 1, 1: 150}, n_jobs=-1),
    '2014-12-16', data_set=pure_ratio_data_set
)

LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 150}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
Using data set `pure_ratio_data_set` at 2014-12-16.
For test: positives=   120, predicted	=  1480, hit	=    32
Actual:   positives=   445
Ratio:    precision=0.0216, recall	=0.0719, f1	=0.0332


## On stats data set.

# Decision Tree Modeling

In [116]:
from sklearn import tree as dtree

In [318]:
training_validation(dtree.DecisionTreeClassifier(max_depth=5, class_weight={0:1, 1:10}), traindata)

DecisionTreeClassifier(class_weight={0: 1, 1: 10}, criterion='gini',
            max_depth=5, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=None,
            splitter='best')
Actual:	130
Model:	118
Hit:	40


In [319]:
sample_validation(dtree.DecisionTreeClassifier(max_depth=5, class_weight={0:1, 1:10}), traindata)

DecisionTreeClassifier(class_weight={0: 1, 1: 10}, criterion='gini',
            max_depth=5, max_features=None, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=None,
            splitter='best')
Actual:	54
Model:	57
Hit:	12


In [476]:
cross_date_validation(
    dtree.DecisionTreeClassifier(
        class_weight={0:1, 1:20}, criterion='gini',
        max_depth=10, max_features=4
    ),
    '2014-12-18'
)

DecisionTreeClassifier(class_weight={0: 1, 1: 20}, criterion='gini',
            max_depth=10, max_features=4, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=None,
            splitter='best')
Using data set `mixed_data_set` at 2014-12-18.
For test: positives=   109, predicted	=   334, hit	=     9
Actual:   positives=   393
Ratio:    precision=0.0269, recall	=0.0229, f1	=0.0248


In [490]:
cross_date_validation(
    dtree.DecisionTreeClassifier(
        class_weight={0:1, 1:30}, criterion='gini',
        max_depth=8, max_features='log2'
    ),
    '2014-12-18', data_set=ratio_data_set
)

DecisionTreeClassifier(class_weight={0: 1, 1: 30}, criterion='gini',
            max_depth=8, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=None,
            splitter='best')
Using data set `ratio_data_set` at 2014-12-18.
For test: positives=   109, predicted	=   546, hit	=    24
Actual:   positives=   393
Ratio:    precision=0.0440, recall	=0.0611, f1	=0.0511


In [525]:
clf = dtree.DecisionTreeClassifier(
    class_weight={0:1, 1:30}, criterion='gini',
    max_depth=8, max_features='log2'
)
cross_date_validation(
    clf,
    '2014-12-18', data_set=rank_data_set
)

DecisionTreeClassifier(class_weight={0: 1, 1: 30}, criterion='gini',
            max_depth=8, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, random_state=None,
            splitter='best')
Using data set `rank_data_set` at 2014-12-18.
For test: positives=   109, predicted	=   999, hit	=    32
Actual:   positives=   393
Ratio:    precision=0.0320, recall	=0.0814, f1	=0.0460


In [527]:
with open("/home/zhi/tree.dot", 'w') as f:
    f = dtree.export_graphviz(clf, out_file=f)

In [540]:
rdf = rank_data_set('2014-12-18')

In [529]:
clf.feature_importances_

array([ 0.03096362,  0.        ,  0.11697837,  0.02517086,  0.18605409,
        0.05789618,  0.01007507,  0.05280468,  0.0204342 ,  0.130699  ,
        0.01844224,  0.02549645,  0.00914853,  0.08080249,  0.03099779,
        0.03272282,  0.02273027,  0.04455403,  0.04514275,  0.01563525,
        0.04325132])

# Ensemble models

Ensemble models means the ensembling of other classification models, which here includes the following models:

- Random forest model.
- AdaBoost
- Gradient Boost Decision Tree (should be ignored here because it doesn't support class weight.)
- Bagging of logistic regression classification.

The most important models here are the __random forest__ and __bagging of logistic regression__. The random forest will be discussed in the [Random Forest](#Random-forest) section below. The bagging of logistic regression is discussed in the [Ensembling the logistic regression](#Ensembling-the-logistic-regression.) below.

GBDT(Gradient Boost Decision Tree) will be ignored here because of the uneven weight of data set and it performs poorly on such situation.

In [402]:
from sklearn import ensemble

In [183]:
training_validation(ensemble.GradientBoostingClassifier(max_depth=6), traindata)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=6, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Actual:	122
Model:	120
Hit:	120


In [182]:
sample_validation(ensemble.GradientBoostingClassifier(max_depth=6), traindata)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=6, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Actual:	43
Model:	36
Hit:	3


In [198]:
sample_validation(ensemble.RandomForestClassifier(max_depth=6, class_weight={0:1, 1:20}), traindata)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 20},
            criterion='gini', max_depth=6, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Actual:	43
Model:	7
Hit:	4


In [189]:
sample_validation(ensemble.AdaBoostClassifier(), traindata)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
Actual:	43
Model:	12
Hit:	2


In [204]:
sample_validation(ensemble.ExtraTreesClassifier(max_depth=6, class_weight={0:1, 1:20}), traindata)

ExtraTreesClassifier(bootstrap=False, class_weight={0: 1, 1: 20},
           criterion='gini', max_depth=6, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Actual:	43
Model:	18
Hit:	3


## Random forest

Random forest has some advantage in uncertain data classification. You can see the result below. Randomly choosing features sometimes works better than maximizing the profit of information gain(gini or entropy, the same). So when the decision trees with random feature selection sometimes outperform the random forest or other classification technique. 

In the experiment below, I use the manual feature selection(the model used in the last contest season where we win the chance to enter the 2nd season.) as the baseline of the comparison experiments. select 2 models that outperforms the baseline in some data set in the test.(2014-12-04/05/16/17/18) and it has the similar performance in other dates.

The 2 models listed below:

- `clf815` : trains on 2014-12-16 and test on 2014-12-17.
- `clf773` : trains on 2014-12-16 and test on 2014-12-17.

But something has to be noted that random forest training is not **stable** so using the features collected last date to train the model and predict the next date is merely impossible. We can just use the pre-trained model to predict the other date.

In [574]:
cross_date_validation(
    ensemble.RandomForestClassifier(
        max_depth=4, class_weight={0:1, 1:100}
    ),
    '2014-12-18', data_set=rank_data_set
)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Using data set `rank_data_set` at 2014-12-18.
For test: positives=   109, predicted	=   747, hit	=    30
Actual:   positives=   393
Ratio:    precision=0.0402, recall	=0.0763, f1	=0.0526


In [588]:
cross_date_validation(
    ensemble.RandomForestClassifier(
        max_depth=4, class_weight={0:1, 1:100}
    ),
    '2014-12-18', data_set=ratio_data_set
)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Using data set `ratio_data_set` at 2014-12-18.
For test: positives=   109, predicted	=   582, hit	=    29
Actual:   positives=   393
Ratio:    precision=0.0498, recall	=0.0738, f1	=0.0595


In [683]:
clf = ensemble.RandomForestClassifier(
    max_depth=4, class_weight={0:1, 1:100}
)
cross_date_validation(
    clf,
    '2014-12-17', data_set=ratio_data_set
)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Using data set `ratio_data_set` at 2014-12-17.
For test: positives=   130, predicted	=   443, hit	=    31
Actual:   positives=   392
Ratio:    precision=0.0700, recall	=0.0791, f1	=0.0743


In [707]:
cross_date_test(clf815, '2014-12-12', data_set=ratio_data_set)

RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 100},
            criterion='gini', max_depth=4, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Using data set `ratio_data_set` at 2014-12-12.
For test: positives=   621, predicted	=   641, hit	=   131
Actual:   positives=  1647
Ratio:    precision=0.2044, recall	=0.0795, f1	=0.1145


In [656]:
clf773 = cl773

In [681]:
clf815 = clf

In [580]:
cross_date_validation(
    ensemble.GradientBoostingClassifier(max_depth=6),
    '2014-12-17', data_set=pure_ratio_data_set
)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=6, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Using data set `pure_ratio_data_set` at 2014-12-17.
For test: positives=   130, predicted	=    83, hit	=     5
Actual:   positives=   392
Ratio:    precision=0.0602, recall	=0.0128, f1	=0.0211


## Ensembling the logistic regression.

In [710]:
cross_date_validation(
    ensemble.BaggingClassifier(
        base_estimator=lm.LogisticRegressionCV(class_weight={0: 1, 1: 30}, n_jobs=-1),
        n_jobs=1, bootstrap_features=True, max_features=0.8
    ),
    '2014-12-17', data_set=ratio_data_set
)

BaggingClassifier(base_estimator=LogisticRegressionCV(Cs=10, class_weight={0: 1, 1: 30}, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0),
         bootstrap=True, bootstrap_features=True, max_features=0.8,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0)
Using data set `ratio_data_set` at 2014-12-17.
For test: positives=   130, predicted	=    48, hit	=     4
Actual:   positives=   392
Ratio:    precision=0.0833, recall	=0.0102, f1	=0.0182


In [709]:
cross_date_validation(
    ensemble.BaggingClassifier(
        base_estimator=lm.LogisticRegressionCV(class_weight={0: 1, 1: 100}, n_jobs=-1),
        n_jobs=1, bootstrap_features=True, max_features=1
    ),
    '2014-12-17'
)

ZeroDivisionError: division by zero

# Bayes Modeling...

In [206]:
from sklearn import naive_bayes

In [207]:
sample_validation(naive_bayes.GaussianNB(), traindata)

GaussianNB()
Actual:	43
Model:	858
Hit:	22


In [208]:
sample_validation(naive_bayes.BernoulliNB(), traindata)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Actual:	43
Model:	15
Hit:	1


# Neighbor searching?

In [211]:
from sklearn import neighbors

In [215]:
sample_validation(neighbors.KNeighborsClassifier(), traindata)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
Actual:	43
Model:	0
Hit:	0


# SVM modeling...

In [216]:
from sklearn import svm

In [218]:
sample_validation(svm.SVC(class_weight={0:1, 1:10}), traindata)

SVC(C=1.0, cache_size=200, class_weight={0: 1, 1: 10}, coef0=0.0, degree=3,
  gamma=0.0, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
Actual:	43
Model:	0
Hit:	0


In [None]:
cross_date_validation(
    svm.SVC(class_weight={0:1, 1:100}),
    '2014-12-17'
)