In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import warnings
import datetime
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

In [2]:
train = pd.read_csv('./data/train_data.csv')
test = pd.read_csv('./data/test_a.csv')

In [3]:
# 初步清洗 0.87410 为了节省空间只保留一个测试模型代码，后面都使用相同参数
target = 'tradeMoney'
test[target] = -1
data = pd.concat([train,test])

columns = test.columns.tolist()
columns.remove(target)
columns.remove("ID")
object_col = ['buildYear','city','communityName','houseDecoration','houseFloor','houseToward','houseType',
             'plate','region','rentType','tradeTime'] # object型特征
num_col = [x for x in columns if x not in object_col] # 数值型特征

# 缺失值处理
data['pv'] = data['pv'].fillna(data['pv'].mean())
data['uv'] = data['uv'].fillna(data['uv'].mean())

median_year = data[data['buildYear'] != '暂无信息']['buildYear'].median()
data['buildYear'][data['buildYear'] == '暂无信息'] = median_year
data['buildYear'] = data['buildYear'].astype(int)
object_col.remove('buildYear')
columns.remove('houseDecoration')
object_col.remove('houseDecoration')
data['houseToward'][data['houseToward']=='暂无数据'] = '南'

# 异常值处理
data.drop(data[(data[target]>50000)].index,inplace=True) 
data.drop(data[data['houseType']=='0室0厅1卫'].index,inplace=True)

In [6]:
%%time 
params = {
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'min_child_samples':20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 4,
}

train = data[data[target] != -1][columns]
test = data[data[target] == -1][columns]
tar = data[data[target] != -1][target]

for col in object_col:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')
    
folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, train.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], 
                           label=tar.iloc[trn_idx],
                           categorical_feature=object_col)
    val_data = lgb.Dataset(train.iloc[val_idx], 
                           label=tar.iloc[val_idx],
                           categorical_feature=object_col)


    num_round = 10000
    clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=2000, early_stopping_rounds = 200)
    
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
    
print("CV Score: {:<8.5f}".format(r2_score(tar, oof_lgb))) 


fold 0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1620]	training's rmse: 726.934	valid_1's rmse: 1443.18
fold 1
Training until validation scores don't improve for 200 rounds.
[2000]	training's rmse: 742.211	valid_1's rmse: 1250.52
Early stopping, best iteration is:
[2426]	training's rmse: 691.525	valid_1's rmse: 1249.43
fold 2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1782]	training's rmse: 745.24	valid_1's rmse: 1450.89
fold 3
Training until validation scores don't improve for 200 rounds.
[2000]	training's rmse: 730.869	valid_1's rmse: 1273.37
Early stopping, best iteration is:
[2039]	training's rmse: 725.527	valid_1's rmse: 1273.01
fold 4
Training until validation scores don't improve for 200 rounds.
[2000]	training's rmse: 728.229	valid_1's rmse: 1316.78
Early stopping, best iteration is:
[2067]	training's rmse: 719.951	valid_1's rmse: 1316.42
CV Score: 0.87410 
CPU times

### 特征工程

In [5]:
# 分割特征 0.88120
train = train[train['area'] < 200]
train = train[train['area'] > 10]

data['room'] = data['houseType'].apply(lambda x : x.split('室')[0][-1]).astype(int)
data['living'] = data['houseType'].apply(lambda x : x.split('厅')[0][-1]).astype(int)
data['toilet'] = data['houseType'].apply(lambda x : x.split('卫')[0][-1]).astype(int)
columns.remove('houseType')
object_col.remove('houseType')
columns.extend(['room','living','toilet'])
data['tradeYear'] = data['tradeTime'].apply(lambda x : x.split('/')[0]).astype(int)
data['tradeMonth'] = data['tradeTime'].apply(lambda x : x.split('/')[1]).astype(int)
data['tradeDate'] = data['tradeTime'].apply(lambda x : x.split('/')[2]).astype(int)
columns.remove('tradeTime')
object_col.remove('tradeTime')
columns.extend(['tradeYear','tradeMonth','tradeDate'])
columns.remove('city')
object_col.remove('city')
columns.remove('tradeYear')
print(columns)
print(object_col)

['area', 'rentType', 'houseFloor', 'totalFloor', 'houseToward', 'communityName', 'region', 'plate', 'buildYear', 'saleSecHouseNum', 'subwayStationNum', 'busStationNum', 'interSchoolNum', 'schoolNum', 'privateSchoolNum', 'hospitalNum', 'drugStoreNum', 'gymNum', 'bankNum', 'shopNum', 'parkNum', 'mallNum', 'superMarketNum', 'totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney', 'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum', 'supplyNewNum', 'supplyLandNum', 'supplyLandArea', 'tradeLandNum', 'tradeLandArea', 'landTotalPrice', 'landMeanPrice', 'totalWorkers', 'newWorkers', 'residentPopulation', 'pv', 'uv', 'lookNum', 'room', 'living', 'toilet', 'tradeMonth', 'tradeDate']
['communityName', 'houseFloor', 'houseToward', 'plate', 'region', 'rentType']


In [6]:
# 合并特征  0.88099
data['numRooms'] = data['room']+data['living']+data['toilet']
data['numTansportEquipment'] = data['subwayStationNum']+data['busStationNum']
data['numMedical'] = data['hospitalNum']+data['drugStoreNum']
data['numEducation'] = data['interSchoolNum']+data['schoolNum']+data['privateSchoolNum']
data['numLiving'] = data['gymNum']+data['parkNum']+data['bankNum']
data['numShop'] = data['shopNum']+data['mallNum']+data['superMarketNum']
columns.extend(['numRooms','numTansportEquipment','numMedical','numEducation','numLiving','numShop'])

In [7]:
# 根据房间数，所在楼层，朝向进行组合  0.88163 
gp = data.groupby('numRooms')['houseToward'].value_counts().rename('numRooms_houseToward_count',inplace=True)     
data = pd.merge(data, gp, how='left', on=['numRooms','houseToward'])
gp = data.groupby('numRooms')['houseFloor'].value_counts().rename('numRooms_houseFloor_count',inplace=True)     
data = pd.merge(data, gp, how='left', on=['numRooms','houseFloor'])
gp = data.groupby('houseToward')['houseFloor'].value_counts().rename('houseToward_houseFloor_count',inplace=True)     
data = pd.merge(data, gp, how='left', on=['houseToward','houseFloor'])
columns.extend(['numRooms_houseToward_count','numRooms_houseFloor_count','houseToward_houseFloor_count'])

In [8]:
# 根据区域groupby 0.88168 
gp = data.groupby('region')['numRooms'].mean().rename('region_numRooms_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numTansportEquipment'].mean().rename('region_numTansportEquipment_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numMedical'].mean().rename('region_numMedical_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numEducation'].mean().rename('region_numEducation_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numLiving'].mean().rename('region_numLiving_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numShop'].mean().rename('region_numShop_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['area'].mean().rename('region_area_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['area'].std().rename('region_area_std',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['area'].median().rename('region_area_median',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['houseToward'].value_counts().rename('region_houseToward_count',inplace=True)
data = pd.merge(data, gp, how='left', on=['region','houseToward'])
columns.extend(['region_numRooms_mean','region_numTansportEquipment_mean','region_numMedical_mean','region_numEducation_mean',
               'region_numLiving_mean','region_numShop_mean','region_houseToward_count','region_area_mean','region_area_std',
               'region_area_median'])

In [9]:
# 根据小区信息groupby 0.88933 
gp = data.groupby('communityName')['numRooms'].mean().rename('communityName_numRooms_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numTansportEquipment'].mean().rename('communityName_numTansportEquipment_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numMedical'].mean().rename('communityName_numMedical_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numEducation'].mean().rename('communityName_numEducation_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numLiving'].mean().rename('communityName_numLiving_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numShop'].mean().rename('communityName_numShop_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['area'].mean().rename('communityName_area_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['area'].std().rename('communityName_area_std',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['area'].median().rename('communityName_area_median',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')

gp = data.groupby('communityName')['houseToward'].value_counts().rename('communityName_houseToward_count',inplace=True)
data = pd.merge(data, gp, how='left', on=['communityName','houseToward'])
columns.extend(['communityName_numRooms_mean','communityName_numTansportEquipment_mean','communityName_numMedical_mean',
                'communityName_numEducation_mean','communityName_numLiving_mean','communityName_numShop_mean',
                'communityName_area_mean','communityName_area_std','communityName_area_median','communityName_houseToward_count'])


In [10]:
# Word2vec特征 0.89278 
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
save_path = './w2v' 

L = 10
sentence = []
for line in list(data[['communityName', 'plate', 'region', 'houseToward']].values):
    sentence.append([str(l) for idx, l in enumerate(line)])
    
model = Word2Vec(sentence, size=L, window=2, min_count=1, workers=multiprocessing.cpu_count(),iter=10)
for fea in ['communityName', 'plate', 'region', 'houseToward']:
    values = []
    for line in list(data[fea].values):
        values.append(line)
    values = set(values)
    w2v = []
    for i in values:
        a = [i]
        a.extend(model[str(i)])
        w2v.append(a)
    out_df = pd.DataFrame(w2v)

    name = [fea]
    for i in range(L):
        name.append(name[0] + 'W' + str(i))
    out_df.columns = name
    out_df.to_csv(save_path + '/' + fea + '.csv', index=False)

w2v_features = []
for col in ['communityName', 'plate', 'region', 'houseToward']:
    df = pd.read_csv(save_path + '/' + col + '.csv')
    df = df.drop_duplicates([col])
    fs = list(df)
    fs.remove(col)
    w2v_features += fs
    data = pd.merge(data, df, on=col, how='left')
columns.extend(w2v_features)

In [13]:
# lebel编码  0.89278 对lgb来说指定了categorical feature就相当于编码了
from sklearn.preprocessing import LabelEncoder
for i in object_col:
    lbl = LabelEncoder()
    data[i] = lbl.fit_transform(data[i])

In [16]:
# 聚类特征  0.89295 提升一点点不知道这么用对不对
from sklearn.mixture import GaussianMixture  
cv_types = ['spherical', 'tied', 'diag', 'full']
res = []
for cv_type in cv_types:
    gmm = GaussianMixture(n_components=3, covariance_type=cv_type)
    gmm.fit(data[columns].fillna(0))
    res.append(gmm.predict(data[columns].fillna(0)))
gm_df = pd.DataFrame(res).T
gm_df.rename(columns={0:'gm_spherical',1:'gm_tied',2:'gm_diag',3:'gm_full'},inplace=True)
data = pd.concat([data,gm_df],axis=1)
columns.extend(gm_df.columns.tolist())

spherical
tied
diag
full


### 特征选择

之前有对特征选择做过一些研究，推荐一下Jungdong Li的论文Feature Selection: A Data Perspective，他做了一个库https://github.com/jundongl/scikit-feature 以及他的网站http://featureselection.asu.edu (里面有数据集和算法说明) py3安装时可能需要根据报错提示做一些源码修改，后面也有不少bug，不过聊胜于无。 这里随便挑了几个

In [18]:
from skfeature.function.statistical_based import f_score
from skfeature.function.statistical_based import chi_square
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

In [19]:
train = data[data['tradeMoney'] != -1]
test = data[data['tradeMoney'] == -1][columns]
train_Y = train['tradeMoney']
train_X = train[columns]

In [20]:
# 存在空值的特征
for i in train_X.columns:
    if train_X[i].isnull().sum() > 0:
        print(i)

region_area_std
communityName_area_std


In [21]:
train_X = train_X.fillna(0)

In [22]:
%%time
# RF
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf = clf.fit(train_X, train_Y)
rf_w = clf.feature_importances_
sort_idx = np.argsort(rf_w)
feature_rank_RF = np.array(columns)[sort_idx]

CPU times: user 15.3 s, sys: 60.1 ms, total: 15.4 s
Wall time: 15.4 s


In [23]:
%%time
# RFE默认只用了一个核，跑的时间会长一些
estimator = RandomForestRegressor()
selector = RFE(estimator, 1, step=1)
selector = selector.fit(train_X, train_Y)
RFE_w = selector.ranking_
feature_rank_RFE = np.array(columns)[RFE_w-1]

CPU times: user 19min 51s, sys: 1.35 s, total: 19min 52s
Wall time: 19min 52s


In [24]:
%%time
# f_score
f_w = f_score.f_score(train_X,train_Y)
sort_idx = np.argsort(f_w)
feature_rank_fscore = np.array(columns)[sort_idx]

CPU times: user 258 ms, sys: 40.3 ms, total: 299 ms
Wall time: 296 ms


In [25]:
# 卡方检验 报了个错，X非正定
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
model1 = SelectKBest(chi2, k='all')#选择k个最佳特征
model1.fit_transform(train_X, train_Y)
chi2_w = model1.scores_ 

ValueError: Input X must be non-negative.

In [26]:
'''取排名前50的特征进行训练，比较结果'''
# RF CV Score: 0.33206
# RFE CV Score: 0.87362
# f_score CV Score: 0.67830 
select_num = 50
rank_list = [feature_rank_RF,feature_rank_RFE,feature_rank_fscore]
for i in rank_list:
    use_cols = i[:select_num]
    params = {
        'num_leaves': 31,
        'min_data_in_leaf': 20,
        'min_child_samples':20,
        'objective': 'regression',
        'learning_rate': 0.01,
        "boosting": "gbdt",
        "feature_fraction": 0.8,
        "bagging_freq": 1,
        "bagging_fraction": 0.85,
        "bagging_seed": 23,
        "metric": 'rmse',
        "lambda_l1": 0.2,
        "nthread": 4,
    }

    train = data[data['tradeMoney'] != -1][use_cols]
    test = data[data['tradeMoney'] == -1][use_cols]
    tar = data[data['tradeMoney'] != -1]['tradeMoney']
    cat_cols = [x for x in use_cols if x in object_col]
    for col in cat_cols:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

    folds = KFold(n_splits=5, shuffle=True, random_state=2333)
    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, train.values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx], 
                               label=tar.iloc[trn_idx],
                               categorical_feature=cat_cols)
        val_data = lgb.Dataset(train.iloc[val_idx], 
                               label=tar.iloc[val_idx],
                               categorical_feature=cat_cols)


        num_round = 10000
        clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=10000, early_stopping_rounds = 200)

        oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = use_cols
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

    print("CV Score: {:<8.5f}".format(r2_score(tar, oof_lgb)))

fold 0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[393]	training's rmse: 3050.91	valid_1's rmse: 3081.6
fold 1
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[382]	training's rmse: 3042.09	valid_1's rmse: 3125.06
fold 2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[629]	training's rmse: 3006.47	valid_1's rmse: 3192.35
fold 3
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[407]	training's rmse: 3059.12	valid_1's rmse: 3039.78
fold 4
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[370]	training's rmse: 3047.98	valid_1's rmse: 3098.05
CV Score: 0.33206 
fold 0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1577]	training's rmse: 858.759	valid_1's rmse: 1485
fold 1
Training until va

In [27]:
'''取排名前100的特征进行训练，比较结果'''
# RF CV Score: 0.83902
# RFE CV Score: 0.89320
# f_score CV Score: 0.70658
select_num = 100
rank_list = [feature_rank_RF,feature_rank_RFE,feature_rank_fscore]
for i in rank_list:
    use_cols = i[:select_num]
    params = {
        'num_leaves': 31,
        'min_data_in_leaf': 20,
        'min_child_samples':20,
        'objective': 'regression',
        'learning_rate': 0.01,
        "boosting": "gbdt",
        "feature_fraction": 0.8,
        "bagging_freq": 1,
        "bagging_fraction": 0.85,
        "bagging_seed": 23,
        "metric": 'rmse',
        "lambda_l1": 0.2,
        "nthread": 4,
    }

    train = data[data['tradeMoney'] != -1][use_cols]
    test = data[data['tradeMoney'] == -1][use_cols]
    tar = data[data['tradeMoney'] != -1]['tradeMoney']
    cat_cols = [x for x in use_cols if x in object_col]
    for col in cat_cols:
        train[col] = train[col].astype('category')
        test[col] = test[col].astype('category')

    folds = KFold(n_splits=5, shuffle=True, random_state=2333)
    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, train.values)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx], 
                               label=tar.iloc[trn_idx],
                               categorical_feature=cat_cols)
        val_data = lgb.Dataset(train.iloc[val_idx], 
                               label=tar.iloc[val_idx],
                               categorical_feature=cat_cols)


        num_round = 10000
        clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=10000, early_stopping_rounds = 200)

        oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = use_cols
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

    print("CV Score: {:<8.5f}".format(r2_score(tar, oof_lgb)))

fold 0
Training until validation scores don't improve for 200 rounds.
[10000]	training's rmse: 622.566	valid_1's rmse: 1651.07
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 622.566	valid_1's rmse: 1651.07
fold 1
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[7591]	training's rmse: 739.806	valid_1's rmse: 1444.66
fold 2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[4210]	training's rmse: 865.076	valid_1's rmse: 1710.15
fold 3
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[6890]	training's rmse: 762.482	valid_1's rmse: 1361.59
fold 4
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[8292]	training's rmse: 713.44	valid_1's rmse: 1430.75
CV Score: 0.83902 
fold 0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[