In [1]:
# %load 101_user_based_statistics.py
#encoding:utf-8
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path

train = pd.read_pickle('../data/train.pkl')
test = pd.read_pickle('../data/test.pkl')
df = pd.concat([train, test], axis=0)

df_feat = pd.DataFrame({'instance_id': df.instance_id, 'day': df.day})


user_cols = ['user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
id_cols = ['instance_id', 'item_id', 'item_brand_id', 'item_city_id', 'shop_id']
df_cols = df.columns
# grade_cols = ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'shop_review_num_level',
#               'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description']
grade_cols = ['item_price_level']

def get_stats_target(df, group_column, target_column, drop_raw_col=False):
    df_old = df.copy()
    grouped = df_old.groupby(group_column)
    the_stats = grouped[target_column].agg(['mean', 'median', 'max', 'min', 'std', 'skew']).reset_index()
    the_stats.columns = [ group_column,
                            '_%s_groupby_%s_mean' % (target_column, group_column),
                            '_%s_groupby_%s_median' % (target_column, group_column),
                            '_%s_groupby_%s__max' % (target_column, group_column),
                            '_%s_groupby_%s__min' % (target_column, group_column),
                            '_%s_groupby_%s__std' % (target_column, group_column),
                            '_%s_groupby_%s__skew' % (target_column, group_column)
                        ]
    if drop_raw_col:
        df_old.drop(group_column, axis=1, inplace=True)

    return the_stats

def generate_basic_feats(df, feat_df):

    # ============================= 计算关于用户特征的id类特征的数量 =====================
    '''
    如：对于每个男性用户, 他所浏览的各种item_id的数量
    '''
    for user_col in (user_cols):
        cnt_result = df.groupby(user_col)[id_cols].nunique()
        cnt_result = cnt_result.add_prefix(user_col + '_').add_suffix('_cnt')
        cnt_result = cnt_result.reset_index()
        np.sum(feat_df.instance_id - pd.merge(df, cnt_result, on=user_col, how='left').instance_id)
        feat_df[user_col + '_count'] = pd.merge(df, cnt_result, on=user_col, how='left').iloc[:, -1].values
    print('the shape of feat_df {}'.format(feat_df.shape))
    # ============================ 计算关于用户特征的得分类特征的统计特征 ========================
    '''
    如：男性用户中item_price_level的平均数，中位数，最大值，最小值
    '''
    for user_col in (user_cols):
        # statistic feature
        for grade_col in grade_cols:
            the_stats = get_stats_target(df, user_col, grade_col, drop_raw_col=False)
            tmp_df = pd.merge(df, the_stats, on=user_col, how='left')
            feat_df = pd.concat([feat_df, tmp_df.drop(df_cols, axis=1, inplace=True)] , axis=1)
    print('the shape of feat_df {}'.format(feat_df.shape))
    
    # ========================== 计算每个用户下的category2的个数 =========================
    '''
    如：对于每个男性用户，他所浏览的各种category2的数量
    '''
    for user_col in (user_cols):
        # category and property columns
        category_cnt = df.groupby(user_col)['item_category_list2'].nunique().reset_index()
        category_cnt.columns = [user_col, 'category2_groupby_{}_cnt'.format(user_col)]
        tmp_df = pd.merge(df, category_cnt, how='left', on=user_col)
        feat_df = pd.concat([feat_df, tmp_df.drop(df_cols, axis=1, inplace=True)], axis=1)

        del category_cnt
        gc.collect()
    print('the shape of feat_df {}'.format(feat_df.shape))
    return feat_df


df = generate_basic_feats(df, df_feat)
train_feat = df[(df['day'] >= 18) & (df['day'] <= 24)].drop('day', axis=1)
test_feat = df[df['day'] == 25].drop('day', axis=1)
print(train_feat.columns)
print('the shape of train {}'.format(train_feat.shape))
print('the shape of test {}'.format(test_feat.shape))
dump_pickle(train_feat, '../data/train_feature/101_user_based_statistics.pkl')
dump_pickle(test_feat, '../data/test_feature/101_user_based_statistics.pkl')

the shape of feat_df (520975, 7)
the shape of feat_df (520975, 7)
the shape of feat_df (520975, 7)
Index(['instance_id', 'user_id_count', 'user_gender_id_count',
       'user_age_level_count', 'user_occupation_id_count',
       'user_star_level_count'],
      dtype='object')
the shape of train (478087, 6)
the shape of test (42888, 6)


In [2]:
test_feat

Unnamed: 0,instance_id,user_id_count,user_gender_id_count,user_age_level_count,user_occupation_id_count,user_star_level_count
478087,55144604125888,1,3856,3413,3823,2098
478088,221669167755726,29,3856,2734,3823,2730
478089,566644865989395,2,3856,3413,3823,2730
478090,954943998950521,2,3856,3413,3823,2722
478091,1192015136416062,1,3856,2673,3823,2348
478092,1239632009133923,2,3856,3266,3823,2730
478093,1370758425798224,4,3856,2734,3823,2870
478094,1386245957915884,4,3186,2673,3823,2888
478095,1587829942816220,1,3856,3266,3823,2973
478096,1965296257027358,1,3856,2734,3823,2836
