# 1.导包

In [57]:
import os
#数据处理
import pandas as pd
import numpy as np
import random
import sklearn.preprocessing as preprocessing
#可视化
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#ML
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (GradientBoostingClassifier, GradientBoostingRegressor, 
                              RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve


from scipy.special import boxcox1p


import warnings
warnings.filterwarnings('ignore')

# 2.数据载入

In [88]:
# 对于test，需要改文件名
# 载入数据，需要5GB内存
feature_type = (np.int64,np.int64,(np.str,60),(np.str,1990),np.int64,np.int64,np.int8,np.int8,np.int8,np.int8,np.int64,np.int8,np.int64,np.int64,np.int64,np.int64,np.int64,np.int32,(np.str,385),np.int64,np.int8,float,np.int16,float,float,float,np.int8)
feature_type = None
data = np.genfromtxt('data/round1_train.csv',dtype = feature_type,delimiter=' ',skip_header=1)
print(data.shape)
data = pd.DataFrame(data)

(18371,)


In [89]:
# 对于test用
# data = pd.concat([data,pd.Series([0] * len(data))], axis = 1)
# data = data.rename({0:'f26'},axis = 'columns')

In [60]:
# 对于test要注释掉
data.sort_values(['f16'],inplace = True)
data.reset_index(drop=True,inplace = True)

# 3.特征转换
## 3.1离散数据
### 3.1.1将出现次数少的值合并到统一类别中
- f6：将10（456）、2（347）、1（85）、11（21）、0（12）、17（1）、16（1）这几个取值单独拉出一个类。
- f9：将8（449）、7（245）、0（123）、6（116）、5（63）、4（33）3（11）、2（5）、1（1）这几个取值单独拉出一个类。
- f20：将23（353）、4（266）、2（87）、3（80）、1（20）、0（7）、25（4）这几个取值单独拉出一个类。
- f22：将5002（477）、5020（357）、5000（81）、5019（70）、5001（60）、4999（7）这几个取值单独拉出一个类。

In [61]:
# TODO

### 3.1.2onehot/dummy-trap
- 如果类别特征本身有顺序（例：优秀、良好、合格、不合格），那么可以保留单列自然数编码。如果类别特征没有明显的顺序（例：红、黄、蓝）则使用one-hot。
- f6广告商品的价格等级、f9广告商品被展示次数的等级、f11用户的预测性别编号、f12用户的预测年龄等级、f13用户的预测职业编号、f14用户的星级编号
- f7、f8、f17、f20、f22不确定
- sklearn.preprocessing.OneHotEncoder()

#### 3.1.2.1f11

In [90]:
temp = pd.get_dummies(data['f11'])  
temp = temp.rename({0.0:'f11:woman',1.0:'f11:man',2.0:'f11:family',3.0:'f11:other'},axis = 'columns')
data = pd.concat([data.loc[:,'f0':'f11'],temp,data.loc[:,'f12':'f26']], axis = 1)

#### 3.1.2.2f13

In [91]:
temp = pd.get_dummies(data['f13'])  
temp = temp.rename({2002.0:'f13:2002.0',2003.0:'f13:2003.0',2004.0:'f13:2004.0',2005.0:'f13:2005.0'},axis = 'columns')
data = pd.concat([data.loc[:,'f0':'f13'],temp,data.loc[:,'f14':'f26']], axis = 1)

- 将f11、f13进行onehot编码后，xgboost的验证集loss由0.084306减小到0.084273


#### 3.1.2.3f16

In [92]:
now = pd.to_datetime(data['f16'],unit='s')

size = len(data)
# year = pd.Series([-1] * 477303)
# month = pd.Series([-1] * 477303)
day = pd.Series([-1] * size)
hour = pd.Series([-1] * size)
minute = pd.Series([-1] * size)
second = pd.Series([-1] * size)
dayofweek = pd.Series([-1] * size)
dayofyear = pd.Series([-1] * size)

for i in range(len(data['f16'])):
#     year[i] = data['f16'][0].year
#     month[i] = data['f16'][0].month
    day[i] = now[i].day
    hour[i] = now[i].hour
    minute[i] = now[i].minute
    second[i] = now[i].second
    dayofweek[i] = now[i].dayofweek
    dayofyear[i] = now[i].dayofyear
data = pd.concat([data.loc[:,'f0':'f16'],day,hour,minute,second,dayofweek,dayofyear,data.loc[:,'f17':'f26']], axis = 1)
data.rename({'f26':'label'}, axis='columns',inplace = True)
data.rename({0:'f16:day',1:'f16:hour',2:'f16:minute',3:'f16:second',4:'f16:dayofweek',5:'f16:dayofyear'}, axis='columns',inplace = True)

- 转换时间f16为多个特征并删除特征f16后，xgboost的验证集loss较明显的降低
- 加入dayofweek特征后，xgboost的验证集loss没变，没用

### 3.1.3自然数编码
- 消耗内存小，训练时间快，但是相比one-hot特征的质量不高，含了一个假设：不同的类别之间，存在一种顺序关系。
- pd.Factorize()
- sklearn.LabelEncoder()

### 3.1.4聚类编码
- 和独热编码相比，聚类编码试图充分利用每一列0与1的信息表达能力。聚类编码时一般需要特定的专业知识（domain knowledge），例如ZIP码可以根据精确度分层为ZIP3、ZIP4、ZIP5、ZIP6，然后按层次进行编码。

- （我个人尚未使用过这种方法，仅在读论文的时候看到了类似的思路，所以暂时不知道对于各种算法而言效果如何。）


## 3.2连续数据
### 3.2.1scaling：分布太宽，做一下scaling，如：标准化、归一化

In [65]:
# f12
for i in range(0,8):
    data['f12'] = data['f12'].replace(1000. + i,0 + i)

In [66]:
# f14
for i in range(0,11):
    data['f14'] = data['f14'].replace(3000. + i,0 + i)

In [67]:
# f17
for i in range(0,20):
    data['f17'] = data['f17'].replace(4001. + i,0 + i)

In [68]:
# f22
for i in range(0,22):
    data['f22'] = data['f22'].replace(4999. + i,0 + i)

- 将f112、f14、f17、f22进行范围缩放后，xgboost的验证集loss由0.0775没变，没用。

### 3.2.2Binning：连续变量离散化
- 只有在了解属性的领域知识的基础，确定属性能够划分成简洁的范围时分箱才有意义，即所有的数值落入一个分区时能够呈现出共同的特征。
- 当不想让模型总是尝试区分值之间是否太近时，分区可以避免出现过拟合。

### 3.2.3正态化：对偏度大于0.75的数值特征（长尾分布）
- 用log1p函数进行转化使其更加服从高斯分布
np.log1p(train.SalePrice)
- Box-Cox变换

In [69]:
# f7:2 f8:2.5 f9:5 f20:1.5
data['f7'] = boxcox1p(data['f7'],2)
data['f8'] = boxcox1p(data['f8'],2.5)
data['f9'] = boxcox1p(data['f9'],5)
data['f20'] = boxcox1p(data['f20'],1.5)

- 正态化后，xgboost的验证集loss不变，没用

## 3.3高势集数据
### 3.3.1拆分题目中的属性（f2、f3、f18）
#### 3.3.1.1f2

In [93]:
data['f2'] = data['f2'].str.decode('utf-8')
data = pd.concat([data.loc[:,'f0':'f2'],data['f2'].astype(np.str).str.split(';', expand=True),data.loc[:,'f3':'label']], axis = 1)

data.rename({0:'f2:1',1:'f2:2',2:'f2:3'}, axis='columns',inplace = True)

data['f2:3'].fillna('-1',inplace = True)
data['f2:1'] = data['f2:1'].astype('int')
data['f2:2'] =data['f2:2'].astype('int')
data['f2:3'] = data['f2:3'].astype('int')

In [94]:
temp = pd.get_dummies(data['f2:2'],prefix = 'f2:2')  
data = pd.concat([data.loc[:,'f0':'f2:2'],temp,data.loc[:,'f2:3':'label']], axis = 1)

In [95]:
temp = pd.get_dummies(data['f2:3'],prefix = 'f2:3')  
data = pd.concat([data.loc[:,'f0':'f2:3'],temp,data.loc[:,'f3':'label']], axis = 1)

In [73]:
data

Unnamed: 0,f0,f1,f2,f2:1,f2:2,f2:2_22731265849056483,f2:2_509660095530134768,f2:2_1968056100269760729,f2:2_2011981573061447208,f2:2_2436715285093487584,...,f17,f18,f19,f20,f21,f22,f23,f24,f25,label
0,8034098185799936802,2709839111509701702,7908382889764677758;5755694407684602296,7908382889764677758,5755694407684602296,0,0,0,0,0,...,3,b'509660095530134768:-1;5755694407684602296:18...,1932086538588242822,54.546053,1.000000,16,0.960338,0.962638,0.978433,0
1,8481863804826437414,671302156973073367,7908382889764677758;509660095530134768,7908382889764677758,509660095530134768,0,1,0,0,0,...,0,"b'5755694407684602296:1667949271803926094,5131...",4413707559864701260,54.546053,1.000000,16,0.976033,0.976015,0.982944,0
2,890046314781699653,5997981559301046248,7908382889764677758;2642175453151805566;886888...,7908382889764677758,2642175453151805566,0,0,0,0,0,...,0,b'8868887661186419229:9148482949976129397;7908...,2527079776459244580,58.961813,1.000000,17,0.971999,0.973024,0.980329,0
3,3153022805452915440,2744506471184009586,7908382889764677758;7258015885215914736,7908382889764677758,7258015885215914736,0,0,0,0,0,...,0,b'7258015885215914736:-1;7822717283490579102:8...,7298587034305908007,20.415184,0.979905,8,0.965576,0.970909,0.959273,0
4,8788910857530634777,2456148350299163893,7908382889764677758;5755694407684602296,7908382889764677758,5755694407684602296,0,0,0,0,0,...,0,b'5755694407684602296:2636395404473730413;7908...,6331496105287532881,34.255469,1.000000,12,0.963868,0.962429,0.985334,0
5,7432754354590006338,8280752807027804716,7908382889764677758;8277336076276184272,7908382889764677758,8277336076276184272,0,0,0,0,0,...,6,b'509660095530134768:2636395404473730413;82773...,4421491403970608068,42.000000,1.000000,14,0.966251,0.968789,0.980320,0
6,8129478961168479132,4027507401524164069,7908382889764677758;2642175453151805566;886888...,7908382889764677758,2642175453151805566,0,0,0,0,0,...,0,b'8868887661186419229:-1;7908382889764677758:-1',5134635740200692273,17.333333,0.991914,7,0.981935,0.971613,0.976774,0
7,8534762917833874555,5997981559301046248,7908382889764677758;2642175453151805566;886888...,7908382889764677758,2642175453151805566,0,0,0,0,0,...,0,b'8868887661186419229:-1;7908382889764677758:-1',2527079776459244580,58.961813,1.000000,17,0.971999,0.973024,0.980329,0
8,4589579122849864554,8280752807027804716,7908382889764677758;8277336076276184272,7908382889764677758,8277336076276184272,0,0,0,0,0,...,11,b'509660095530134768:2636395404473730413;87107...,4421491403970608068,42.000000,1.000000,14,0.966251,0.968789,0.980320,0
9,8415421001838258338,2709839111509701702,7908382889764677758;5755694407684602296,7908382889764677758,5755694407684602296,0,0,0,0,0,...,0,b'5755694407684602296:-1;7999314233231522439:3...,1932086538588242822,54.546053,1.000000,16,0.960338,0.962638,0.978433,0


- 将f19进行转换为点击概率后，xgboost的验证集loss由0.084273减小到0.080542
- 将f1进行转换为点击概率后，xgboost的验证集loss由0.080542减小到0.0775
- 将f2;2用onehot表示后，xgboost的验证集loss由0.0775减小到0.07373

### 3.3.3高势集类别（High Categorical）进行经验贝叶斯转换成数值feature

In [74]:
# 'Embarked'dummy化
# dummies_df = pd.get_dummies(df['Embarked'])

# dummy化生成的列重命名 'Embarked_S', 'Embarked_C', 'Embarked_Q'，（其实get_dummies有一个prefix的参数，比这个更方便）
# dummies_df = dummies_df.rename(columns=lambda x: 'Embarked_' + str(x))

#  加到原数据中
# data = pd.concat([data, dummies_df], axis=1)
# data.drop(['Embarked'], axis=1,inplace=True)

### 3.3.4平均数编码
- 平均数编码（mean encoding），针对高基数类别特征的有监督编码。当一个类别特征列包括了极多不同类别时（如家庭地址，动辄上万）时，可以采用。优点：和独热编码相比，节省内存、减少算法计算时间、有效增强模型表现。

- 将f2属性拆分成三个子属性后，xgboost的验证集loss不变，没用

# 4.特征组合
## 4.1加入统计比例数据
### 4.1.1f19

In [75]:
# 对于test，这里要用train的data
dic_appear1 = {}
dic_ok1 = {}

for num in range(len(data['f19'])):
    if(data['f19'][num] not in dic_appear1):
        dic_appear1[data['f19'][num]] = 1
        dic_ok1[data['f19'][num]] = 0
    else:
        dic_appear1[data['f19'][num]] += 1
    if(data['label'][num] == 1):
        dic_ok1[data['f19'][num]] += 1

In [96]:
temp = pd.Series([-1.00] * len(data))
for i in range(len(temp)):
    try:
        temp[i] = dic_ok1[data['f19'][i]] * 1.0 / dic_appear1[data['f19'][i]]
    except:
        temp[i] = -1
        print(i)

565
1875
3701
4122
4173
4815
5279
5535
5888
6273
6386
6444
7435
7437
7737
7984
8001
8458
8689
8690
8692
8995
9353
10410
10581
11079
12027
12147
12154
12269
12270
12271
12739
12954
13017
13123
13407
13863
14039
14489
14490
14491
14981
15328
15472
15820
15853
16246
16443
16482
16864
16920
17115
17123
17126
17505
17548
17560
17561
17577
17578
18294


In [97]:
data_combination = pd.concat([temp], axis = 1)
data_combination.rename({0:'f19-label'}, axis='columns',inplace = True)

In [98]:
data_combination.columns

Index(['f19-label'], dtype='object')

### 4.1.2f1

In [79]:
dic_appear2 = {}
dic_ok2 = {}

for num in range(len(data['f1'])):
    if(data['f1'][num] not in dic_appear2):
        dic_appear2[data['f1'][num]] = 1
        dic_ok2[data['f1'][num]] = 0
    else:
        dic_appear2[data['f1'][num]] += 1
    if(data['label'][num] == 1):
        dic_ok2[data['f1'][num]] += 1

In [99]:
temp = pd.Series([-1.00] * len(data))
for i in range(len(temp)):
    try:
        temp[i] = dic_ok2[data['f1'][i]] * 1.0 / dic_appear2[data['f1'][i]]
    except:
        temp[i] = -1
        print(i)

data_combination = pd.concat([temp,data_combination], axis = 1)
data_combination.rename({0:'f1-label'}, axis='columns',inplace = True)

565
1154
1156
1157
1158
1649
1746
1832
1841
1875
2014
2035
2481
2483
2505
2511
2513
2702
2704
3318
3618
3701
3756
3842
3912
3915
3917
3919
3924
3925
3928
4122
4173
4783
4801
4815
4996
5276
5279
5441
5483
5485
5493
5519
5535
5888
6273
6375
6386
6444
6621
6830
7182
7300
7301
7435
7437
7737
7754
7761
7863
7928
7984
8001
8215
8458
8472
8509
8514
8689
8690
8692
8995
9106
9107
9353
9686
9781
10173
10410
10540
10542
10581
10598
10599
10835
10837
10845
10931
11000
11049
11079
11177
11339
11444
11450
11527
11678
11842
12027
12052
12092
12147
12148
12154
12269
12270
12271
12419
12487
12696
12726
12739
12769
12954
13004
13005
13007
13008
13009
13010
13011
13013
13017
13108
13123
13197
13198
13199
13381
13389
13407
13429
13611
13796
13798
13800
13860
13863
13883
13892
13991
14039
14489
14490
14491
14616
14645
14727
14730
14731
14781
14824
14981
15006
15007
15100
15159
15161
15328
15472
15535
15571
15592
15593
15595
15596
15597
15598
15600
15601
15602
15603
15604
15677
15678
15694
15702
15719
15748

### 4.1.3f2

In [81]:
dic_appear3 = {}
dic_ok3 = {}

for num in range(len(data['f2:2'])):
    if(data['f2:2'][num] not in dic_appear3):
        dic_appear3[data['f2:2'][num]] = 1
        dic_ok3[data['f2:2'][num]] = 0
    else:
        dic_appear3[data['f2:2'][num]] += 1
    if(data['label'][num] == 1):
        dic_ok3[data['f2:2'][num]] += 1

In [100]:
temp = pd.Series([-1.00] * len(data))
for i in range(len(temp)):
    try:
        temp[i] = dic_ok3[data['f2:2'][i]] * 1.0 / dic_appear3[data['f2:2'][i]]
    except:
        temp[i] = -1
        print(i)

In [101]:
data_combination = pd.concat([data_combination.loc[:,'f1-label'],temp,data_combination['f19-label']], axis = 1)
data_combination.rename({0:'f2-label'}, axis='columns',inplace = True)

## 4.2时间序列：
- 把昨天的特征加入今天的特征，或者把和昨天相比，特征数值的改变量加入今天的特征。

## 4.3特征合并

In [102]:
data = pd.concat([data.loc[:,'f0':'f25'],data_combination,data.loc[:,'label']],axis = 1)

# 5.特征选择
- 除非万不得已，不要用PCA或者LDA降维，直接减原始特征就行了。

## 5.1质量不好的特征
- 缺失的行特别多，弃用该列，超过15%缺失的特征应该予以删除！
- 质量都不错，最多的f12（0.027）

## 5.2冗余特征（相关性强的保留一个）
- 有些 Feature 之间可能存在线性关系，影响 Model 的性能。
- Feature越少，训练越快。

## 5.3无关特征
- f0样本编号：近似唯一
- f1广告商品编号
- f10用户编号
- f15上下文信息编号：完全唯一
- f19店铺编号

In [103]:
# data.drop('f16',axis = 1)
data.drop(['f0','f2','f2:1','f3','f18'], axis=1,inplace = True)

# 7.标签处理
- 上采样、下采样、分层采样。

# 8.保存结果

In [86]:
data.to_pickle('data/round1_train')

In [104]:
# test数据用
# data.drop('label',axis = 1).to_pickle('data/round1_test')