In [1]:
import os
import gc
import time
import pickle
import Geohash
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory prefix for cached intermediate results (pickle / HDF files).
cache_path = '../cache/'
# Input CSV files.
train_path = '../data/train.csv'
test_path = '../data/test.csv'
# Rows whose starttime is before this timestamp provide the history used for
# features and candidates; rows at or after it become the labelled samples.
# The comparison is done on the raw starttime strings.
split_time = '2017-05-23 00:00:00'
# When True, cached results found on disk are reused instead of recomputed.
flag = True

In [3]:
# Attach the ground-truth label to every candidate row.
def get_label(data):
    """Add a binary ``label`` column to ``data`` and return it.

    A row is labelled 1 when its candidate ``geohashed_end_loc`` equals the
    real destination recorded for its ``orderid`` in the training CSV, else 0
    (orders that are not in the training CSV therefore always get 0).
    The orderid -> destination mapping is cached as a pickle under ``cache_path``.

    ``data`` is modified in place (the ``label`` column is added).
    """
    result_path = cache_path + 'true.pkl'
    if os.path.exists(result_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were produced by this notebook.
        with open(result_path, 'rb') as cache_file:
            true = pickle.load(cache_file)
    else:
        train = pd.read_csv(train_path)
        true = dict(zip(train['orderid'].values, train['geohashed_end_loc']))
        with open(result_path, 'wb') as cache_file:
            pickle.dump(true, cache_file)
    real_destination = data['orderid'].map(true)
    data['label'] = (real_destination == data['geohashed_end_loc']).astype('int')
    return data

以下部分，为特征构造部分

In [4]:
# Distance between two (latitude, longitude) points.
def cal_distance(lat1,lon1,lat2,lon2):
    """Return the planar approximation of the distance between two points.

    All four arguments are converted with ``float`` first, so numeric strings
    are accepted. Coordinates are taken to be in degrees: they are divided by
    57.2958 (degrees per radian) before use. The longitude difference is scaled
    by the cosine of the mean latitude, and both legs are multiplied by
    6371004.0 (presumably the Earth radius in metres -- confirm).
    """
    lat_a, lon_a, lat_b, lon_b = (float(value) for value in (lat1, lon1, lat2, lon2))
    delta_lon = np.abs(lon_a - lon_b)   # longitude difference
    delta_lat = np.abs(lat_a - lat_b)   # latitude difference
    mean_lat = (lat_a + lat_b) / 2.0
    east_west = 6371004.0 * (delta_lon / 57.2958) * np.cos(mean_lat / 57.2958)
    north_south = 6371004.0 * (delta_lat / 57.2958)
    return (east_west**2 + north_south**2) ** 0.5

# Manhattan distance between the start location and the candidate end location.
def get_distance(result):
    """Add a ``Manhattan`` column to ``result`` and return it.

    Each geohash in ``geohashed_start_loc`` / ``geohashed_end_loc`` is decoded
    with ``Geohash.decode`` (unpacked here as a ``(lat, lon)`` pair). The
    Manhattan distance is the sum of two ``cal_distance`` legs: first along the
    start longitude from the start latitude to the end latitude, then along the
    end latitude from the start longitude to the end longitude.

    Rows where either location is missing get NaN instead of raising.
    ``result`` is modified in place (the column is added).
    """
    locs = set(result['geohashed_start_loc']) | set(result['geohashed_end_loc'])
    # Decode every distinct geohash once. pd.notnull is used because a plain
    # ``np.nan in locs`` test only matches NaN by identity and can miss NaN
    # objects created elsewhere.
    loc_dict = {loc: Geohash.decode(loc) for loc in locs if pd.notnull(loc)}

    manhattan = []
    for start_loc, end_loc in result[['geohashed_start_loc','geohashed_end_loc']].values:
        if start_loc not in loc_dict or end_loc not in loc_dict:
            manhattan.append(np.nan)
            continue
        lat1, lon1 = loc_dict[start_loc]
        lat2, lon2 = loc_dict[end_loc]
        north_south = cal_distance(lat1, lon1, lat2, lon1)
        east_west = cal_distance(lat2, lon1, lat2, lon2)
        manhattan.append(north_south + east_west)

    result['Manhattan'] = manhattan
    return result

In [5]:
# Number of historical trips of the user.
def get_user_count(train,result):
    """Left-join ``user_count`` onto ``result``.

    ``user_count`` is the number of rows in ``train`` with a non-null
    ``geohashed_end_loc`` for the same ``userid``. Users absent from ``train``
    get NaN.
    """
    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    user_count = (train.groupby('userid')['geohashed_end_loc']
                  .count()
                  .reset_index(name='user_count'))
    return pd.merge(result, user_count, on=['userid'], how='left')

# Number of times the user has arrived at the candidate location.
def get_user_eloc_count(train, result):
    """Left-join ``user_eloc_count`` onto ``result``.

    ``user_eloc_count`` is the number of rows in ``train`` per
    (``userid``, ``geohashed_end_loc``) pair. Unseen pairs get NaN.
    """
    keys = ['userid', 'geohashed_end_loc']
    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    user_eloc_count = (train.groupby(keys)['userid']
                       .count()
                       .reset_index(name='user_eloc_count'))
    return pd.merge(result, user_eloc_count, on=keys, how='left')

# Number of times the user has departed from the candidate location.
def get_user_sloc_count(train,result):
    """Left-join ``user_sloc_count`` onto ``result``.

    Counts rows in ``train`` per (``userid``, ``geohashed_start_loc``) and
    matches that start location against the candidate ``geohashed_end_loc`` of
    ``result``. Unseen pairs get NaN.
    """
    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    user_sloc_count = (train.groupby(['userid', 'geohashed_start_loc'])['userid']
                       .count()
                       .reset_index(name='user_sloc_count')
                       # the historical start location plays the role of the candidate destination
                       .rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}))
    return pd.merge(result, user_sloc_count, on=['userid', 'geohashed_end_loc'], how='left')

In [6]:
# Number of times the user has ridden this exact start -> end route.
def get_user_sloc_eloc_count(train,result):
    """Left-join ``user_sloc_eloc_count`` onto ``result``.

    ``user_sloc_eloc_count`` is the number of rows in ``train`` per
    (``userid``, ``geohashed_start_loc``, ``geohashed_end_loc``). Unseen
    routes get NaN.
    """
    keys = ['userid', 'geohashed_start_loc', 'geohashed_end_loc']
    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    route_count = (train.groupby(keys)['userid']
                   .count()
                   .reset_index(name='user_sloc_eloc_count'))
    return pd.merge(result, route_count, on=keys, how='left')

# Number of times the user has ridden the reverse (end -> start) route.
def get_user_eloc_sloc_count(train,result):
    """Left-join ``user_eloc_sloc_count`` onto ``result``.

    Counts rows in ``train`` per (``userid``, start, end) and swaps the two
    location columns before merging, so a candidate start -> end row receives
    the number of historical end -> start trips of the same user. Unseen
    routes get NaN.
    """
    keys = ['userid', 'geohashed_start_loc', 'geohashed_end_loc']
    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    reverse_count = (train.groupby(keys)['userid']
                     .count()
                     .reset_index(name='user_eloc_sloc_count')
                     .rename(columns={'geohashed_start_loc': 'geohashed_end_loc',
                                      'geohashed_end_loc': 'geohashed_start_loc'}))
    return pd.merge(result, reverse_count, on=keys, how='left')

In [7]:
# Popularity of the candidate location as a destination.
def get_eloc_count(train,result):
    """Left-join ``eloc_count`` onto ``result``.

    ``eloc_count`` is the number of rows in ``train`` (with a non-null
    ``userid``) that end at the candidate ``geohashed_end_loc``. Unseen
    locations get NaN.
    """
    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    eloc_count = (train.groupby('geohashed_end_loc')['userid']
                  .count()
                  .reset_index(name='eloc_count'))
    return pd.merge(result, eloc_count, on='geohashed_end_loc', how='left')

# Popularity of the candidate location as a departure point.
def get_eloc_as_sloc_count(train,result):
    """Left-join ``eloc_as_sloc_count`` onto ``result``.

    ``eloc_as_sloc_count`` is the number of rows in ``train`` (with a non-null
    ``userid``) that *start* at the candidate ``geohashed_end_loc``. Unseen
    locations get NaN.

    The historical start location is renamed to ``geohashed_end_loc`` before
    the merge so the count is keyed on the candidate destination. The previous
    code renamed the column to itself and discarded the result, which keyed
    the feature on the order's own start location instead.
    """
    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    eloc_as_sloc_count = (train.groupby('geohashed_start_loc')['userid']
                          .count()
                          .reset_index(name='eloc_as_sloc_count')
                          .rename(columns={'geohashed_start_loc': 'geohashed_end_loc'}))
    return pd.merge(result, eloc_as_sloc_count, on='geohashed_end_loc', how='left')

In [8]:
# Hour-of-day counts: departures / arrivals per location and per user.
def hour_loc_count(train,result):
    """Add the hour of day and seven hour-based count features to ``result``.

    The hour is taken from ``starttime``. Each feature is the number of rows in
    ``train`` sharing the listed key columns with the candidate row (NaN when
    there is no match):

    * ``hour_sloc_count``      -- hour, start location
    * ``hour_eloc_count``      -- hour, end location
    * ``hour_sloc_eloc_count`` -- hour, start location, end location
    * ``hour_user_count``      -- user, hour
    * ``user_hour_eloc``       -- user, hour, end location
    * ``user_hour_sloc``       -- user, hour, start location
    * ``user_hour_sloc_eloc``  -- user, hour, start location, end location

    The returned frame has an integer ``hour`` column and no ``starttime``
    column. Neither input frame is modified.
    """
    # assign() builds new frames, so the callers' train/result are left untouched.
    result = result.assign(hour=pd.to_datetime(result['starttime']).dt.strftime('%H'))
    train = train.assign(hour=pd.to_datetime(train['starttime']).dt.strftime('%H'))

    # (group keys, feature name), in the order the columns are appended.
    count_specs = [
        (["hour","geohashed_start_loc"], 'hour_sloc_count'),
        (["hour","geohashed_end_loc"], 'hour_eloc_count'),
        (["hour","geohashed_start_loc","geohashed_end_loc"], 'hour_sloc_eloc_count'),
        (["userid","hour"], 'hour_user_count'),
        (["userid","hour","geohashed_end_loc"], 'user_hour_eloc'),
        (["userid","hour","geohashed_start_loc"], 'user_hour_sloc'),
        (["userid","hour","geohashed_start_loc","geohashed_end_loc"], 'user_hour_sloc_eloc'),
    ]
    for keys, name in count_specs:
        # count().reset_index(name=...) replaces the dict-renaming form of
        # SeriesGroupBy.agg, which newer pandas versions reject.
        counts = train.groupby(keys)["userid"].count().reset_index(name=name)
        result = pd.merge(result, counts, on=keys, how='left')

    result["hour"] = result["hour"].astype(int)
    return result.drop(["starttime"], axis=1)

以下部分，为样本构造部分

In [9]:
# Build the candidate (orderid, geohashed_end_loc) samples for training.
def get_train_basic():
    """Return the candidate samples for the validation-period orders.

    The training CSV is split at ``split_time``: earlier rows form the history,
    later rows are the orders to generate candidates for. Candidates from the
    five generators are concatenated and de-duplicated, joined with the order
    columns (without the real destination), and rows whose candidate equals the
    start location or that miss either location are dropped.

    The result is cached as HDF under ``cache_path`` and reused when ``flag``
    is set.
    """
    bs_feat = ['orderid','geohashed_end_loc']
    result_path = cache_path + 'get_train_basic_%s.hdf' %("train_bs")
    if flag and os.path.exists(result_path):
        return pd.read_hdf(result_path, 'w')

    train = pd.read_csv(train_path)
    val = train[(train['starttime'] >= split_time)]
    train = train[(train['starttime'] < split_time)]

    candidate_sets = [
        get_user_end_loc(train, val),    # destinations the user has visited before
        get_user_start_loc(train, val),  # locations the user has departed from before
        get_loc_to_loc(train, val),      # most frequent destinations from the start location
        get_bike_end_loc(train, val),    # destinations the same bike reached repeatedly
        get_user_go_back(train, val),    # locations the user has made round trips between
    ]
    result = pd.concat([candidates[bs_feat] for candidates in candidate_sets]).drop_duplicates()

    # Join the order attributes without the real destination; a non-inplace
    # drop avoids mutating a filtered slice of the CSV frame.
    orders = val.drop(["geohashed_end_loc"], axis=1)
    result = pd.merge(result, orders, on="orderid", how="left")
    result = result[result['geohashed_end_loc'] != result['geohashed_start_loc']]
    result = result[result['geohashed_end_loc'].notnull() & result['geohashed_start_loc'].notnull()]
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

# Build the candidate (orderid, geohashed_end_loc) samples for the test set.
def get_test_basic():
    """Return the candidate samples for the orders in the test CSV.

    History is the part of the training CSV before ``split_time``. Candidates
    from the five generators are concatenated and de-duplicated, joined with
    the test order columns, and rows whose candidate equals the start location
    or that miss either location are dropped.

    The result is cached as HDF under ``cache_path`` and reused when ``flag``
    is set.
    """
    bs_feat = ['orderid','geohashed_end_loc']
    result_path = cache_path + 'get_test_basic_%s.hdf' %("test_bs")
    if flag and os.path.exists(result_path):
        return pd.read_hdf(result_path, 'w')

    train = pd.read_csv(train_path)
    train = train[(train['starttime'] < split_time)]
    test = pd.read_csv(test_path)

    candidate_sets = [
        get_user_end_loc(train, test),    # destinations the user has visited before
        get_user_start_loc(train, test),  # locations the user has departed from before
        get_loc_to_loc(train, test),      # most frequent destinations from the start location
        get_bike_end_loc(train, test),    # destinations the same bike reached repeatedly
        get_user_go_back(train, test),    # locations the user has made round trips between
    ]
    # Collect the distinct (orderid, candidate) pairs.
    result = pd.concat([candidates[bs_feat] for candidates in candidate_sets]).drop_duplicates()

    result = pd.merge(result, test, on="orderid", how="left")
    result = result[result['geohashed_end_loc'] != result['geohashed_start_loc']]
    result = result[result['geohashed_end_loc'].notnull() & result['geohashed_start_loc'].notnull()]
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

# Candidate generator: every destination the user has reached before.
def get_user_end_loc(train,sample):
    """Return (orderid, geohashed_end_loc) candidates from the user's past destinations.

    Every order in ``sample`` is paired with each distinct ``geohashed_end_loc``
    its ``userid`` has in ``train``; orders of users without history keep one
    row with a missing candidate (left join).

    NOTE(review): the cache file name is built only from the product of the two
    row counts, so different inputs with the same product share a cache file.
    """
    result_path = cache_path + 'user_end_loc_%d.hdf' %(train.shape[0]*sample.shape[0])
    if os.path.exists(result_path) and flag:
        return pd.read_hdf(result_path, 'w')

    visited = train[['userid','geohashed_end_loc']].drop_duplicates()
    candidates = sample[['orderid','userid']].merge(visited, on='userid', how='left')
    result = candidates[['orderid', 'geohashed_end_loc']]
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

# Candidate generator: every location the user has departed from before.
def get_user_start_loc(train,sample):
    """Return (orderid, geohashed_end_loc) candidates from the user's past start locations.

    Every order in ``sample`` is paired with each distinct
    ``geohashed_start_loc`` its ``userid`` has in ``train``; that location is
    returned under the name ``geohashed_end_loc``. Orders of users without
    history keep one row with a missing candidate (left join).

    NOTE(review): the cache file name is built only from the product of the two
    row counts, so different inputs with the same product share a cache file.
    """
    result_path = cache_path + 'user_start_loc_%d.hdf' %(train.shape[0]*sample.shape[0])
    if os.path.exists(result_path) and flag:
        return pd.read_hdf(result_path, 'w')

    departed = train[['userid', 'geohashed_start_loc']].drop_duplicates()
    candidates = sample[['orderid', 'userid']].merge(departed, on='userid', how='left')
    candidates = candidates.rename(columns={'geohashed_start_loc':'geohashed_end_loc'})
    result = candidates[['orderid', 'geohashed_end_loc']]
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

# Candidate generator: the 5 most frequent destinations from the start area.
def get_loc_to_loc(train,sample):
    """Return (orderid, geohashed_end_loc) candidates from popular routes.

    Start locations are compared on their first 6 geohash characters. For each
    such start prefix the 5 destinations with the highest trip count in
    ``train`` are kept and joined to the orders in ``sample`` (left join, so an
    order without a match keeps one row with a missing candidate).

    The truncation is applied to local copies only. It used to be written back
    into ``train`` and ``sample``, which shortened ``geohashed_start_loc`` for
    every later step whenever this function ran without a cache hit.

    NOTE(review): the cache file name is built only from the product of the two
    row counts, so different inputs with the same product share a cache file.
    """
    result_path = cache_path + 'loc_to_loc_%d.hdf' %(train.shape[0]*sample.shape[0])
    if os.path.exists(result_path) and flag:
        return pd.read_hdf(result_path, 'w')

    # Narrow copies: only the start location is truncated to its first 6 characters.
    history = train[['geohashed_start_loc', 'geohashed_end_loc']].copy()
    history['geohashed_start_loc'] = history['geohashed_start_loc'].str[:6]
    orders = sample[['orderid', 'geohashed_start_loc']].copy()
    orders['geohashed_start_loc'] = orders['geohashed_start_loc'].str[:6]

    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    sloc_eloc_count = (history.groupby(['geohashed_start_loc', 'geohashed_end_loc'])['geohashed_end_loc']
                       .count()
                       .reset_index(name='sloc_eloc_count'))
    # Ascending sort + tail(5) keeps the five largest counts per start prefix.
    sloc_eloc_count = sloc_eloc_count.sort_values('sloc_eloc_count')
    sloc_eloc_count = sloc_eloc_count.groupby('geohashed_start_loc').tail(5)

    result = pd.merge(orders, sloc_eloc_count, on='geohashed_start_loc', how='left')
    result = result[['orderid', 'geohashed_end_loc']]
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

# Candidate generator: destinations the same bike has reached repeatedly.
def get_bike_end_loc(train,sample):
    """Return (orderid, geohashed_end_loc) candidates from the bike's history.

    A destination qualifies when the order's ``bikeid`` ended there more than
    twice in ``train``. Orders whose bike has no such destination keep one row
    with a missing candidate (left join).

    NOTE(review): the cache file name is built only from the product of the two
    row counts, so different inputs with the same product share a cache file.
    """
    result_path = cache_path + 'bike_end_loc_%d.hdf' %(train.shape[0]*sample.shape[0])
    if os.path.exists(result_path) and flag:
        return pd.read_hdf(result_path, 'w')

    # count().reset_index(name=...) replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas versions reject.
    bike_eloc = (train.groupby(["bikeid","geohashed_end_loc"])["geohashed_end_loc"]
                 .count()
                 .reset_index(name="bike_count"))
    bike_eloc = bike_eloc.loc[bike_eloc["bike_count"] > 2, ["bikeid","geohashed_end_loc"]]
    result = pd.merge(sample[['orderid','bikeid']], bike_eloc, on='bikeid', how='left')
    result = result[['orderid', 'geohashed_end_loc']]
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

# Candidate generator: the other end of round trips the user has made.
def get_user_go_back(train,sample):
    """Return (orderid, geohashed_end_loc) candidates from the user's round trips.

    A round trip is a (start, end) pair for which the same user also has a trip
    in the opposite direction in ``train``. An order in ``sample`` that starts
    at either end of such a pair gets the other end as a candidate. Duplicates
    are removed.

    NOTE(review): the cache file name is built only from the product of the two
    row counts, so different inputs with the same product share a cache file.
    """
    result_path = cache_path + 'user_go_back_%d.hdf' %(train.shape[0]*sample.shape[0])
    if os.path.exists(result_path) and flag:
        return pd.read_hdf(result_path, 'w')

    trips = train[["orderid","userid","geohashed_start_loc","geohashed_end_loc"]]
    # Self-join: a trip matches another trip of the same user in the opposite direction.
    round_trips = pd.merge(trips, trips,
                           left_on=["userid","geohashed_start_loc","geohashed_end_loc"],
                           right_on=["userid","geohashed_end_loc","geohashed_start_loc"],
                           how ="inner")
    round_trips = round_trips[["userid","geohashed_start_loc_x","geohashed_end_loc_x"]].drop_duplicates()

    orders = sample[['orderid','userid',"geohashed_start_loc"]]

    # Order starts at the start of a round trip -> candidate is its end.
    from_start = pd.merge(orders, round_trips,
                          left_on = ["userid","geohashed_start_loc"],
                          right_on= ["userid","geohashed_start_loc_x"],
                          how="inner")
    from_start = from_start.rename(columns={'geohashed_end_loc_x':'geohashed_end_loc'})

    # Order starts at the end of a round trip -> candidate is its start.
    from_end = pd.merge(orders, round_trips,
                        left_on = ["userid","geohashed_start_loc"],
                        right_on= ["userid","geohashed_end_loc_x"],
                        how="inner")
    from_end = from_end.rename(columns={'geohashed_start_loc_x':'geohashed_end_loc'})

    combined = pd.concat([from_start, from_end], axis=0)
    result = combined[['orderid', 'geohashed_end_loc']].drop_duplicates()
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

In [10]:
def make_tr_data():
    """Build (or load from cache) the labelled training feature table.

    History is the part of the training CSV before ``split_time``. Starting
    from the candidates of ``get_train_basic``, the history-based count
    features are joined one after another, then the Manhattan distance is
    added, missing values are replaced by 0 and the label is attached.
    """
    result_path = cache_path + 'make_tr_data.hdf'
    if os.path.exists(result_path) and flag:
        return pd.read_hdf(result_path, 'w')

    train = pd.read_csv(train_path)
    train = train[(train['starttime'] < split_time)]

    # Feature builders sharing the (train, result) -> result signature, in application order.
    feature_steps = (
        hour_loc_count,             # hour-of-day features
        get_user_count,             # number of historical trips of the user
        get_user_eloc_count,        # times the user arrived at the candidate
        get_user_sloc_count,        # times the user departed from the candidate
        get_user_sloc_eloc_count,   # times the user rode this route
        get_user_eloc_sloc_count,   # times the user rode the reverse route
        get_eloc_count,             # popularity of the candidate as a destination
        get_eloc_as_sloc_count,     # start-location based popularity count
    )
    result = get_train_basic()
    for add_feature in feature_steps:
        result = add_feature(train, result)

    result = get_distance(result)   # Manhattan distance between start and candidate
    result = result.fillna(0)
    result = get_label(result)
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

In [11]:
def make_ts_data():
    """Build (or load from cache) the test feature table.

    History is the part of the training CSV before ``split_time``. Starting
    from the candidates of ``get_test_basic``, the history-based count features
    are joined one after another, then the Manhattan distance is added, missing
    values are replaced by 0 and ``get_label`` is applied.
    """
    result_path = cache_path + 'make_ts_data.hdf'
    if os.path.exists(result_path) and flag:
        return pd.read_hdf(result_path, 'w')

    train = pd.read_csv(train_path)
    train = train[(train['starttime'] < split_time)]

    # Feature builders sharing the (train, result) -> result signature, in application order.
    feature_steps = (
        hour_loc_count,             # hour-of-day features
        get_user_count,             # number of historical trips of the user
        get_user_eloc_count,        # times the user arrived at the candidate
        get_user_sloc_count,        # times the user departed from the candidate
        get_user_sloc_eloc_count,   # times the user rode this route
        get_user_eloc_sloc_count,   # times the user rode the reverse route
        get_eloc_count,             # popularity of the candidate as a destination
        get_eloc_as_sloc_count,     # start-location based popularity count
    )
    result = get_test_basic()
    for add_feature in feature_steps:
        result = add_feature(train, result)

    result = get_distance(result)   # Manhattan distance between start and candidate
    result = result.fillna(0)
    result = get_label(result)
    result.to_hdf(result_path, 'w', complib='blosc', complevel=5)
    return result

In [12]:
# Rank rows inside each group.
def rank(data, feat1, feat2):
    """Return ``data`` sorted by (``feat1``, ``feat2``) descending with a ``rank`` column.

    ``rank`` is the 0-based position of a row inside its ``feat1`` group, so
    the row with the largest ``feat2`` of each group gets rank 0. The returned
    frame has a fresh 0..n-1 index; ``data`` itself is not modified.
    """
    ordered = data.sort_values([feat1, feat2], ascending=False).reset_index(drop=True)
    # cumcount numbers the rows of each group in their sorted order.
    ordered['rank'] = ordered.groupby(feat1).cumcount()
    return ordered

In [13]:
# Turn scored candidates into one row per order with its top-3 destinations.
def reshape(pred):
    """Return a frame with columns ``orderid, 0, 1, 2``.

    ``pred`` needs the columns ``orderid``, ``geohashed_end_loc`` and ``pred``.
    Candidates are ranked per order by ``pred`` (highest first) and the three
    best are spread into the columns 0, 1 and 2. Orders with fewer than three
    candidates get missing values in the remaining columns.
    """
    ranked = rank(pred.copy(), 'orderid', 'pred')
    top3 = ranked.loc[ranked['rank'] < 3, ['orderid', 'geohashed_end_loc', 'rank']]
    wide = top3.set_index(['orderid', 'rank'])['geohashed_end_loc'].unstack()
    # Force all three rank columns to exist; without this the column
    # assignment below fails when no order has three candidates.
    wide = wide.reindex(columns=[0, 1, 2]).reset_index()
    wide['orderid'] = wide['orderid'].astype('int')
    wide.columns = ['orderid', 0, 1, 2]
    return wide

In [None]:
# Train the candidate classifier.
from xgboost import XGBClassifier
train_feat = make_tr_data()
test_feat = make_ts_data()

# Identifier / raw location columns that are not fed to the model, and the target column.
feat = ["orderid","geohashed_end_loc","geohashed_start_loc","userid","bikeid"]
label = ["label"]

predictors = [col for col in train_feat.columns if col not in feat+label]

clf = XGBClassifier(objective ='binary:logistic',
                    learning_rate = 0.1,
                    silent=0,
                    n_estimators= 500,
                    max_depth = 10,
                    min_child_weight = 2,
                    gamma = 30,
                    reg_alpha = 10,
                    reg_lambda = 50,
                    subsample = 0.886,
                    colsample_bytree = 0.886,
                    scale_pos_weight = 10,
                    nthread =4)

# NOTE(review): the split is row-wise, so candidates of one order can land in
# both parts; a split grouped by orderid may give a more honest validation score.
# ravel() hands the target over as a 1-D array instead of an (n, 1) column vector.
X_train, X_test, y_train, y_test = train_test_split(train_feat[predictors].values,
                                                    train_feat[label].values.ravel(),
                                                    test_size=0.2,
                                                    random_state=201709)

# Both parts are evaluated each round; training stops after 50 rounds without improvement.
clf.fit(X_train, y_train,
        eval_set=[(X_train,y_train),(X_test, y_test)],
        eval_metric= ["auc","map"],
        early_stopping_rounds=50)

In [None]:
# Score every test candidate. The model was fitted on a plain array
# (``.values``), so prediction uses the same form; passing the DataFrame
# triggers a feature-name mismatch on xgboost versions that validate names.
test_feat['pred'] = clf.predict_proba(test_feat[predictors].values)[:,1]
result = reshape(test_feat)
# Re-attach every test order so that orders without any candidate still get a row.
test = pd.read_csv(test_path)
result = pd.merge(test[['orderid']],result,on='orderid',how='left')
result = result.fillna('0')
result.to_csv('../sub/result.csv',index=False,header=False)

In [None]:
import matplotlib.pylab as plt
plt.figure(figsize=(20,4))
# Older xgboost exposes the trained booster as clf.booster(); newer releases
# use clf.get_booster() (there ``booster`` is a plain parameter, not a method).
booster = clf.get_booster() if hasattr(clf, 'get_booster') else clf.booster()
feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
# The model was trained on a plain array, so the booster reports positional
# names (f0, f1, ...); map them back to the predictor column names.
positional_names = {'f%d' % position: column for position, column in enumerate(predictors)}
feat_imp = feat_imp.rename(index=lambda key: positional_names.get(key, key))
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()