In [2]:
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import time
import xlearn as xl
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')
np.random.seed(2018)

In [2]:
# Load the training file and the test-A file; both are whitespace-delimited text.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True
# and works on old and new pandas versions alike.
train = pd.read_table('../data/round2_train.txt', sep=r'\s+', index_col=None)
test = pd.read_table('../data/round2_ijcai_18_test_a_20180425.txt', sep=r'\s+', index_col=None)

# Drop exact duplicate training rows, then stack train on top of test
# and renumber the rows 0..n-1 so later index-aligned concats line up.
train = train.drop_duplicates().reset_index(drop=True)
data = pd.concat([train, test]).reset_index(drop=True)

In [3]:
# Load the test-B file (whitespace-delimited). sep=r'\s+' replaces the deprecated
# delim_whitespace=True with identical parsing behaviour.
test_b = pd.read_table('../glq/2.data/round2_test_b.txt', sep=r'\s+', index_col=None)
# Appending test_b to `data` was left disabled in the original notebook:
# data = pd.concat([data, test_b]).reset_index(drop=True)

In [4]:
# Display the number of rows in test_b as a quick check on the load.
len(test_b)

1209768

In [3]:
# Read the precomputed time and category feature tables and attach their columns
# to `data` side by side (pd.concat with axis=1 aligns rows on the index labels).
timeFeat = pd.read_csv('featFile/timeFeat.csv')
catFeat = pd.read_csv('featFile/catFeat.csv')

data = pd.concat([data, timeFeat, catFeat], axis=1)

# 并行apply

In [11]:
from multiprocessing import Pool

def multiprocess_apply_func(func,data_now,args):
    """
    Worker-side helper: run ``apply`` on one chunk of data.

    @func: function handed to ``apply``
    @data_now: the chunk, a DataFrame (applied row by row) or a Series (applied element-wise)
    @args: tuple of extra positional arguments forwarded to ``func``
    Returns whatever ``data_now.apply`` returns for the chunk.
    """
    # isinstance (rather than an exact type comparison) so that DataFrame subclasses
    # also get the row-wise apply instead of silently falling into the Series branch.
    if isinstance(data_now, pd.DataFrame):
        return data_now.apply(func,args = args,axis = 1)
    else:
        return data_now.apply(func,args = args)
    
def multiprocess_apply(data_now,cpu_count,func,args = ()):
    """
    Parallel ``apply``: split ``data_now`` into contiguous row chunks, run
    ``multiprocess_apply_func`` on each chunk in a worker process, and concatenate
    the partial results in the original row order.

        @data_now: data to apply over, a DataFrame or a Series
        @cpu_count: number of worker processes to use (must be >= 1)
        @func: function handed to ``apply`` (note: must not be a lambda; it is shipped
               to the worker processes through ``Pool.apply_async``)
        @args: tuple of extra positional arguments forwarded to ``func``

    Returns the concatenated result, with as many rows as ``data_now``.
    Raises ValueError when ``cpu_count`` is smaller than 1.
    """
    if cpu_count < 1:
        raise ValueError('cpu_count must be at least 1')

    total = len(data_now)
    if total == 0:
        # Nothing to distribute: apply in-process so the caller still gets an
        # apply()-shaped (empty) result instead of a crash on a missing chunk.
        return multiprocess_apply_func(func, data_now, args)

    # Never create more chunks than rows. Integer arithmetic gives contiguous,
    # non-overlapping boundaries that always end exactly at `total`; the previous
    # float-based bounds duplicated rows when there were fewer rows than workers.
    n_chunks = min(cpu_count, total)
    bounds = [total * i // n_chunks for i in range(n_chunks + 1)]

    pool = Pool(n_chunks)
    try:
        pending = [
            pool.apply_async(multiprocess_apply_func, args = (func, data_now.iloc[lo:hi], args,))
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]
        pool.close()
        # .get() re-raises any exception thrown inside a worker.
        parts = [job.get() for job in pending]
        pool.join()
    finally:
        # Make sure worker processes are released even when a chunk failed.
        pool.terminate()

    # pd.concat replaces the chained .append calls (removed in pandas 2.0).
    res = pd.concat(parts)
    assert len(res) == len(data_now)
    return res

# 类目属性相关

In [5]:
def _jaccard(a, b):
    _a, _b = set(a), set(b)
    return len(_a&_b)/float(len(_a|_b))

def _makeCatPropFeature(row):
    item_cat_list = row['item_category_list'].split(';')
    item_prop_list = row['item_property_list'].split(';')

    predNull = True
    if (row['predict_category_property']!=np.nan) and (row['predict_category_property']!='-1'):
        predNull = False
        pred_cat_prop_list = row['predict_category_property'].split(';')

    # item_cat_1
    row['item_cat_1'] = int(item_cat_list[1])
    row['item_cat_2'] = int(item_cat_list[2]) if len(item_cat_list)>2 else np.nan

    cat_hit, best_cat_hit = False, False
    max_level_hit, best_max_level_hit = 0, 0
    acc_jac_sim, acc_match_ratio_sim = 0, 0
    best_prop_jac, best_prop_match_ratio = 0, 0
    if not predNull:
        for pred_index, pred_cat_prop in enumerate(pred_cat_prop_list):
            pred_cat = pred_cat_prop.split(':')[0]
            pred_prop_list = pred_cat_prop.split(':')[1].split(',')

            prop_jac = _jaccard(set(item_prop_list), set(pred_prop_list))
            prop_match_ritio = len(set(pred_prop_list)&set(item_prop_list))/float(len(set(pred_prop_list)))

            # cat是否命中/命中的最大level
            if pred_cat in item_cat_list:
                cur_hit = True
                cur_level_hit = item_cat_list.index(pred_cat) + 1
            else:
                cur_hit = False
                cur_level_hit = 0
            # 针对最优预测： cat是否命中/命中的最大level
            if pred_index == 0: # best matched predict
                best_max_level_hit = cur_level_hit
                best_prop_jac = prop_jac
                best_prop_match_ratio = prop_match_ritio
            # 组合property相似度
            acc_jac_sim += int(cur_hit) * 1.0/(pred_index+1) * np.log(1.1/(1.1-prop_jac))
            acc_match_ratio_sim += int(cur_hit) * 1.0/(pred_index+1) * 1.1/(1.1-prop_match_ritio)

            cat_hit = cat_hit or cur_hit
            max_level_hit = max(cur_level_hit, max_level_hit)
    row['predict_cat_hit'] = int(cat_hit) # predict_cat是否命中
    row['best_predict_cat_hit_level'] = best_max_level_hit # predict_cat 命中层级
    row['best_prop_jac'], row['best_prop_match_ratio'] = best_prop_jac, best_prop_match_ratio
    row['acc_jac_sim'], row['acc_match_ratio_sim'] = acc_jac_sim, acc_match_ratio_sim

    return row

def makeCatPropFeature(data):
    '''
    Add the features relating item_category_list / item_property_list to
    predict_category_property (category hits and property-set similarity,
    including the Jaccard coefficient) by applying _makeCatPropFeature to every row.
    The newly created columns are also written to 'featFile-b/catFeat_b.csv'.
    :param data: pandas.DataFrame
    :return: pandas.DataFrame with the new feature columns added
    '''
    columns_before = data.columns.tolist()

    data = data.apply(_makeCatPropFeature, axis=1)

    # Only the columns created by the row function are exported.
    added_columns = [col for col in data.columns.tolist() if col not in columns_before]
    data[added_columns].to_csv('featFile-b/catFeat_b.csv', index=False)
    return data

In [6]:
# Compute the category/property features for test_b (this also writes the feature CSV);
# the returned frame is not assigned, only shown as the cell output.
makeCatPropFeature(test_b)

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,shop_score_delivery,shop_score_description,item_cat_1,item_cat_2,predict_cat_hit,best_predict_cat_hit_level,best_prop_jac,best_prop_match_ratio,acc_jac_sim,acc_match_ratio_sim
0,74080508196716,1774582353536418293,836752724084922533;4911723539855588624,6241534230954727302;367082587220462692;2636395...,5664460045239319053,8072963182326625214,6,12,12,17,...,0.982419,0.988221,4911723539855588624,,1,2,0.022727,1.000000,0.020878,11.333333
1,204576715383250,7207659473071472323,836752724084922533;6670526099037031245,6241534230954727302;367082587220462692;3995998...,2547916549533637024,6219110439660858399,8,10,9,17,...,0.966124,0.967953,6670526099037031245,,1,0,0.000000,0.000000,0.000000,0.533333
2,339754209266814,9197583400310910637,836752724084922533;1852600517265062354,2072967855524022579;5131280576272319091;772325...,8631276550057616546,8240548652619864253,6,14,14,21,...,0.978473,0.973896,1852600517265062354,,1,2,0.041667,1.000000,0.038615,11.333333
3,1232471723234702,5016845371443836749,836752724084922533;7314150500379498593,7126426653086863522;2636395404473730413;365787...,7738101827252981059,3948283326616421003,5,12,10,17,...,0.962932,0.956978,7314150500379498593,,1,0,0.000000,0.000000,0.013159,5.500000
4,1432327895640503,6908632267694168095,836752724084922533;5685690139879409547;7497531...,6241534230954727302;367082587220462692;5131280...,8631276550057616546,8240548652619864253,7,13,15,20,...,0.975740,0.986149,5685690139879409547,7.497531e+18,1,1,0.030303,0.250000,0.027935,1.294118
5,1582628758469245,6033858733414293692,836752724084922533;3434689896486063330;3123955...,8597265790857153347;2636395404473730413;901144...,5382454711692753696,1548565541085191105,6,13,12,18,...,0.975106,0.971193,3434689896486063330,3.123956e+18,1,1,0.024390,0.100000,0.035973,1.817391
6,1779564246041746,3683127832741048020,836752724084922533;768579787521575291,367082587220462692;3540471119339706981;7138596...,1033195780954912143,4918413420989329604,6,13,13,17,...,0.960292,0.978012,768579787521575291,,1,2,0.025000,1.000000,0.022990,11.250000
7,1796710634742147,2952734330252676781,836752724084922533;2211060154630359130;7848078...,7862557370302415437;9011444777478954178;197523...,5178771852012855834,4918413420989329604,6,9,8,15,...,0.980535,0.982482,2211060154630359130,7.848079e+18,1,3,0.000000,0.000000,0.000000,1.166667
8,1842191994190388,4801892925172805380,836752724084922533;768579787521575291,6241534230954727302;367082587220462692;3540471...,725191195555556462,196257267849351217,8,12,11,16,...,0.951866,0.970524,768579787521575291,,1,2,0.040000,0.500000,0.051540,2.200000
9,1923760687276623,8332591259452422812,836752724084922533;8841625760168847319,6241534230954727302;367082587220462692;5977512...,2967766441844533662,7219323489207530724,6,13,13,18,...,0.967004,0.982541,8841625760168847319,,1,2,0.054054,1.000000,0.050388,11.000000


# shop相关

In [14]:
def _makeShopReviewPositiveNum(data):
    data['shop_review_positive_num'] = data['shop_review_positive_rate'] * data['shop_review_num_level']
    return data

def _shopScoreMap(x, _type=0):
    if _type == 0:
        if x < 0.95:
            return 0
        if x >= 0.98:
            return 4
        return int(100 * x - 95) + 1
    if _type == 1:
        if x < 0.98:
            return 0
        return int(100 * x - 98) + 1

def _discreteShopFeature(data):
    """
    Clean the shop score columns and add a discretised ``*_level`` column for each.

    For shop_score_service, shop_score_delivery, shop_score_description and
    shop_review_positive_rate, the value -1 is treated as missing and filled with
    the column median. The three score columns are then bucketed with
    ``_shopScoreMap`` type 0 and the positive-rate column with type 1.

    :param data: pandas.DataFrame containing the four columns above (modified and returned)
    :return: the same DataFrame with cleaned columns and the new ``*_level`` columns
    """
    targetCols = ['shop_score_service', 'shop_score_delivery', 'shop_score_description', 'shop_review_positive_rate']
    for col in targetCols:
        # Assign the result back instead of calling replace(..., inplace=True) on
        # data[col]: that chained in-place call acts on a temporary object and does
        # not update the frame when pandas Copy-on-Write is active.
        cleaned = data[col].replace(-1, np.nan)
        data[col] = cleaned.fillna(cleaned.median())

    for col in targetCols[:-1]:
        data[col + '_level'] = data[col].apply(_shopScoreMap, args=(0,))
    data['shop_review_positive_rate_level'] = data['shop_review_positive_rate'].apply(_shopScoreMap, args=(1,))
    return data

def makeShopFeature(data):
    """
    Add the shop features (positive-review count and discretised score levels)
    and write the newly created columns to 'featFile-b/shopFeat.csv'.

    NOTE(review): the positive-review count is computed before the -1 placeholders
    in shop_review_positive_rate are replaced, so such rows get a negative count --
    confirm whether that is intended.

    :param data: pandas.DataFrame
    :return: pandas.DataFrame with the shop features added
    """
    columns_before = data.columns.tolist()

    # Step 1: number of positive shop reviews; step 2: cleaned + bucketed score columns.
    data = _discreteShopFeature(_makeShopReviewPositiveNum(data))

    # Export only the columns that did not exist on entry.
    added_columns = [col for col in data.columns.tolist() if col not in columns_before]
    data[added_columns].to_csv('featFile-b/shopFeat.csv', index=False)
    return data

In [15]:
# Add the shop features to `data` (this also writes the shop feature CSV).
data= makeShopFeature(data)

# 点击量特征

### 实时时间窗口点击量
* 统计紧邻此次点击之前的一段时间窗口内（15分钟 / 30分钟 / 1小时 / 3小时）的点击次数

In [21]:
def _timeWindowClickCount(row, _data):
    timeWindowInSecond = [x*3600 for x in [15, 30, 60, 180]]
    featPrefixs = ['15Min_', '30Min_', '1hour_', '3hour_']
    cur_time = row['context_timestamp']
    cur_user = row['user_id']
    cur_shop = row['shop_id']
    cur_item = row['item_id']
    cur_cat_1 = row['item_cat_1']
    for i, timeWindow in enumerate(timeWindowInSecond):
        _featPrefix = featPrefixs[i]
        target_data = _data[(_data.context_timestamp<cur_time) & (_data.context_timestamp>=(cur_time - timeWindow))]
        row[_featPrefix + 'userClickCount'] = target_data[target_data.user_id == cur_user].shape[0]
        row[_featPrefix + 'itemClickCount'] = target_data[target_data.item_id == cur_item].shape[0]
        row[_featPrefix + 'shopClickCount'] = target_data[target_data.shop_id == cur_shop].shape[0]
        
        row[_featPrefix + 'userItemClickCount'] = target_data[(target_data.user_id == cur_user) & (target_data.item_id == cur_item)].shape[0]
        row[_featPrefix + 'userShopClickCount'] = target_data[(target_data.user_id == cur_user) & (target_data.shop_id == cur_shop)].shape[0]
        row[_featPrefix + 'userCat1ClickCount'] = target_data[(target_data.user_id == cur_user) & (target_data.item_cat_1 == cur_cat_1)].shape[0]
    return row

def makeRealTimeWindowClickCount(data):
    """
    Add the real-time window click-count features to every row of ``data`` and
    write the newly created columns to 'featFile/realTimeWindowClickCountFeat.csv'.

    The per-row work (_timeWindowClickCount) is spread over 5 worker processes via
    multiprocess_apply; each worker receives only the five columns the row function reads.

    :param data: pandas.DataFrame
    :return: pandas.DataFrame with the click-count columns added
    """
    columns_before = data.columns.tolist()

    lookup = data[['context_timestamp', 'user_id', 'shop_id', 'item_id', 'item_cat_1']]
    data = multiprocess_apply(data, 5, _timeWindowClickCount, args=(lookup,))

    # Export only the columns that did not exist on entry.
    added_columns = [col for col in data.columns.tolist() if col not in columns_before]
    data[added_columns].to_csv('featFile/realTimeWindowClickCountFeat.csv', index=False)
    return data

In [None]:
# Add the time-window click counts to `data` (runs in worker processes and writes the feature CSV).
data = makeRealTimeWindowClickCount(data)