# 性别预测

In [1]:
# coding=utf-8
"""
Created by Liao Jialing
"""
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# 1. 读入清理好的用户信息宽表

In [2]:
dataset = pd.read_csv('E:/user_all_cleared.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## 将target放在第一列

In [3]:
gender_label = dataset.pop('gender')

In [4]:
dataset.insert(0,'gender',gender_label)

In [5]:
dataset.shape

(503862, 152)

In [6]:
dataset.head()

Unnamed: 0,gender,customerid,accountcash,accountbalance,integralbalance,consumetotal,consumetimes,lastconsume_diff_create_day,lastconsume_diff_create_hour,lastconsume_diff_now,...,age_from_song_language_3.0,age_from_song_language_not_known,shop_com_regioncode_310000.0,shop_com_regioncode_350000.0,shop_com_regioncode_440000.0,shop_com_regioncode_500000.0,shop_com_regioncode_not_known,shop_com_managetype_1.0,shop_com_managetype_2.0,shop_com_managetype_not_known
0,2,5792237,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1,2,5792238,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
2,2,5792239,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
3,2,5792241,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
4,2,5792242,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1


# 2. 分出测试集和训练集

In [7]:
train = dataset[dataset['gender']!=2]
test = dataset[dataset['gender']==2]
test_id = test.customerid
test = test.drop('customerid',axis=1)
X = train.drop(['customerid','gender'],axis=1)
y = train.gender.values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1026)
print(X_train.shape,X_test.shape,test.shape)

((15592, 150), (3899, 150), (484371, 151))


In [8]:
train.shape

(19491, 152)

In [9]:
test.shape

(484371, 151)

In [10]:
type(test_id)

pandas.core.series.Series

# 3. 进行特征选择

In [11]:
model = RandomForestClassifier(n_estimators=500)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### 特征选择

In [12]:
important_ones = zip(X_train.columns.values, model.feature_importances_)
important_ones = sorted(important_ones, key=lambda x:x[1], reverse=True)

In [13]:
for r in important_ones:
    print r[0], r[1]

lastconsume_diff_now 0.226945471017
member_age 0.148385573714
consumetotal 0.110599538522
integralbalance 0.0899071371628
accountbalance 0.0447531409436
accountcash 0.0438568331387
consumetimes 0.0426879442794
lastconsume_diff_create_hour 0.0390689061362
coupon_get_sum 0.035282542133
lastconsume_diff_create_day 0.0316004879997
source_company_id 0.0313438491892
get_com_roomnum 0.027783646547
get_coupon_type_cnt 0.00859714782871
coupon_get_cnt 0.00857971924204
source_company_cnt 0.00854672778479
shop_companyid 0.00347262766886
shop_original_sum 0.00328078408933
shop_com_roomnum 0.00325817966732
shop_original_sum_max 0.00321522907008
use_coupon_order_sum 0.00319140318304
shop_real_sum 0.00318094240256
shop_original_sum_avg 0.00315749447191
shop_real_sum_avg 0.00300797011151
shop_original_sum_min 0.00295141525503
shop_real_sum_max 0.00294015796771
shop_real_sum_min 0.00285870621531
shop_weekday_sum 0.00284652264328
wechat_province_广东 0.00274780713328
shop_night_sum 0.00262390546092
use_com

In [14]:
len(important_ones)

150

## 使用1e-5阈值选出特征个数

In [15]:
THRESHOLD = 1e-5
important_features = pd.DataFrame(important_ones, columns=['feature', 'score'])
filtered_features = important_features.loc[important_features['score']>THRESHOLD, 'feature'].values

In [16]:
len(filtered_features)

119

## 使用1e-6阈值选出特征

In [17]:
THRESHOLD = 1e-6
important_features = pd.DataFrame(important_ones, columns=['feature', 'score'])
filtered_features = important_features.loc[important_features['score']>THRESHOLD, 'feature'].values

In [18]:
filtered_features

array(['lastconsume_diff_now', 'member_age', 'consumetotal',
       'integralbalance', 'accountbalance', 'accountcash', 'consumetimes',
       'lastconsume_diff_create_hour', 'coupon_get_sum',
       'lastconsume_diff_create_day', 'source_company_id',
       'get_com_roomnum', 'get_coupon_type_cnt', 'coupon_get_cnt',
       'source_company_cnt', 'shop_companyid', 'shop_original_sum',
       'shop_com_roomnum', 'shop_original_sum_max', 'use_coupon_order_sum',
       'shop_real_sum', 'shop_original_sum_avg', 'shop_real_sum_avg',
       'shop_original_sum_min', 'shop_real_sum_max', 'shop_real_sum_min',
       'shop_weekday_sum', 'wechat_province_\xe5\xb9\xbf\xe4\xb8\x9c',
       'shop_night_sum', 'use_company_id', 'use_coupon_sum',
       'wechat_province_\xe6\x9c\xaa\xe7\x9f\xa5', 'shop_weekend_sum',
       'get_com_regioncode_500000.0', 'shop_afternoon_sum',
       'wechat_province_\xe4\xb8\x8a\xe6\xb5\xb7',
       'wechat_country_\xe4\xb8\xad\xe5\x9b\xbd', 'get_coupon_crosscom',
      

In [19]:
len(filtered_features)

129

## 按选出的特征分别处理训练集、测试集和验证集

In [20]:
X_train = X_train[filtered_features]
test = test[filtered_features]
X_test = X_test[filtered_features]

# 3. 用Xgboost进行模型训练

In [21]:
params={
'booster':'gbtree',
'objective': 'binary:logistic', # 这里用户群体性别只有两个值，故选用binary
'gamma':0.05,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 [0:]
'max_depth':12, # 构建树的深度 [1:]
'subsample':0.5, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
'colsample_bytree':0.6, # 构建树树时的采样比率 (0:1]
'min_child_weight':2, # 节点的最少特征数
'silent':1 ,
'eta': 0.03, # ste size shrinkage,如同学习率
'seed':710,
'nthread':4,# cpu 线程数,根据自己U的个数适当调整
}

In [22]:
plst = list(params.items())

In [23]:
num_rounds = 600 # 迭代次数
xgtest = xgb.DMatrix(test)
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgval = xgb.DMatrix(X_test, label=y_test)

### return训练和验证的错误

In [24]:
watchlist = [(xgtrain, 'train'),(xgval,'val')]

### 训练模型

In [25]:
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)

Will train until val error hasn't decreased in 50 rounds.
[0]	train-error:0.376988	val-error:0.428571
[1]	train-error:0.358902	val-error:0.408566
[2]	train-error:0.351526	val-error:0.406258
[3]	train-error:0.347101	val-error:0.403693
[4]	train-error:0.341329	val-error:0.409592
[5]	train-error:0.339982	val-error:0.407027
[6]	train-error:0.338635	val-error:0.406514
[7]	train-error:0.337930	val-error:0.407027
[8]	train-error:0.337930	val-error:0.400616
[9]	train-error:0.337096	val-error:0.401641
[10]	train-error:0.336326	val-error:0.398051
[11]	train-error:0.336583	val-error:0.397025
[12]	train-error:0.334274	val-error:0.401129
[13]	train-error:0.335942	val-error:0.401641
[14]	train-error:0.333184	val-error:0.399846
[15]	train-error:0.333440	val-error:0.402667
[16]	train-error:0.330426	val-error:0.399590
[17]	train-error:0.329143	val-error:0.400359
[18]	train-error:0.329271	val-error:0.400359
[19]	train-error:0.327860	val-error:0.400103
[20]	train-error:0.327476	val-error:0.397025
[21]	tr

### 目前最佳 72次达到最优0.384458

In [39]:
# params={
# 'booster':'gbtree',
# 'objective': 'binary:logistic', # 这里用户群体年龄是连续的，因此采用了默认的线性分类器
# 'gamma':0.05,  # 在树的叶子节点下一个分区的最小损失，越大算法模型越保守 [0:]
# 'max_depth':12, # 构建树的深度 [1:]
# 'subsample':0.5, # 采样训练数据，设置为0.5，随机选择一般的数据实例 (0:1]
# 'colsample_bytree':0.6, # 构建树树时的采样比率 (0:1]
# 'min_child_weight':2, # 节点的最少特征数
# 'silent':1 ,
# 'eta': 0.03, # ste size shrinkage,如同学习率
# 'seed':710,
# 'nthread':4,# cpu 线程数,根据自己U的个数适当调整
# }

In [110]:
# model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)

Will train until val error hasn't decreased in 50 rounds.
[0]	train-error:0.378078	val-error:0.421390
[1]	train-error:0.363712	val-error:0.407797
[2]	train-error:0.355439	val-error:0.401898
[3]	train-error:0.350180	val-error:0.396512
[4]	train-error:0.342804	val-error:0.395486
[5]	train-error:0.344087	val-error:0.397025
[6]	train-error:0.343445	val-error:0.395999
[7]	train-error:0.339790	val-error:0.398051
[8]	train-error:0.340752	val-error:0.395486
[9]	train-error:0.339661	val-error:0.396512
[10]	train-error:0.338956	val-error:0.395486
[11]	train-error:0.340110	val-error:0.393434
[12]	train-error:0.339148	val-error:0.392152
[13]	train-error:0.338250	val-error:0.393434
[14]	train-error:0.337224	val-error:0.390613
[15]	train-error:0.335428	val-error:0.388818
[16]	train-error:0.333120	val-error:0.390613
[17]	train-error:0.334659	val-error:0.391895
[18]	train-error:0.332863	val-error:0.391126
[19]	train-error:0.330811	val-error:0.392152
[20]	train-error:0.329913	val-error:0.397025
[21]	tr

### 保存模型

In [121]:
model.save_model('E:/gender.model')

In [122]:
preds = model.predict(xgtest,ntree_limit=model.best_iteration)

In [123]:
preds

array([ 0.45050052,  0.45050052,  0.45050052, ...,  0.43804327,
        0.4322716 ,  0.29275906], dtype=float32)

# 4. 对预测结果进行处理和保存

In [125]:
df = pd.DataFrame(preds,columns=['gender'])

In [126]:
df.head()

Unnamed: 0,gender
0,0.450501
1,0.450501
2,0.450501
3,0.450501
4,0.450501


In [143]:
df[df > 0.5] = 1

In [144]:
df[~(df > 0.5)] = 0

In [145]:
pd.value_counts(df['gender'])

0    432005
1     52366
Name: gender, dtype: int64

In [163]:
df_id = pd.DataFrame(test_id)

In [164]:
df.shape

(484371, 1)

In [165]:
df_id.shape

(484371, 1)

In [184]:
id_index = range(0,484371)

In [185]:
df_id = df_id.reindex(index=id_index)

In [186]:
df_id.shape

(484371, 1)

In [187]:
gender_predicted = pd.concat([df_id,df],axis=1)

In [188]:
gender_predicted.head()

Unnamed: 0,customerid,gender
0,5792237,0
1,5792238,0
2,5792239,0
3,5792241,0
4,5792242,0


## 最终预测了484371项

In [189]:
gender_predicted.shape

(484371, 2)

In [190]:
pd.value_counts(gender_predicted['gender'])

0    432005
1     52366
Name: gender, dtype: int64

## 保存预测结果到gender_predicted.csv

In [191]:
gender_predicted.to_csv('E:/gender_predicted.csv')