In [1]:
# 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import warnings
import datetime
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']

In [2]:
train = pd.read_csv('./data/train_data.csv')
test = pd.read_csv('./data/test_a.csv')
print(test.shape)

(2469, 50)


In [3]:
# 初步清洗 0.87410 r2 0.9505601666837905为了节省空间只保留一个测试模型代码，后面都使用相同参数
target = 'tradeMoney'
test[target] = -1
data = pd.concat([train,test])

columns = test.columns.tolist()
columns.remove(target)
columns.remove("ID")
object_col = ['buildYear','city','communityName','houseDecoration','houseFloor','houseToward','houseType',
             'plate','region','rentType','tradeTime'] # object型特征
num_col = [x for x in columns if x not in object_col] # 数值型特征

# 缺失值处理
data['pv'] = data['pv'].fillna(data['pv'].mean())
data['uv'] = data['uv'].fillna(data['uv'].mean())

median_year = data[data['buildYear'] != '暂无信息']['buildYear'].median()
data['buildYear'][data['buildYear'] == '暂无信息'] = median_year
data['buildYear'] = data['buildYear'].astype(int)
object_col.remove('buildYear')
columns.remove('houseDecoration')
object_col.remove('houseDecoration')
data['houseToward'][data['houseToward']=='暂无数据'] = '南'

# 异常值处理
train = data[data[target] != -1]
test = data[data[target] == -1]

train.drop(train[(train[target]>50000)].index,inplace=True) 
train.drop(train[train['houseType']=='0室0厅1卫'].index,inplace=True)
data = pd.concat([train,test])

In [4]:
# 分割特征 0.88120 r2 0.9523926133806397
train = train[train['area'] < 200]
train = train[train['area'] > 10]

data['room'] = data['houseType'].apply(lambda x : x.split('室')[0][-1]).astype(int)
data['living'] = data['houseType'].apply(lambda x : x.split('厅')[0][-1]).astype(int)
data['toilet'] = data['houseType'].apply(lambda x : x.split('卫')[0][-1]).astype(int)
columns.remove('houseType')
object_col.remove('houseType')
columns.extend(['room','living','toilet'])
data['tradeYear'] = data['tradeTime'].apply(lambda x : x.split('/')[0]).astype(int)
data['tradeMonth'] = data['tradeTime'].apply(lambda x : x.split('/')[1]).astype(int)
data['tradeDate'] = data['tradeTime'].apply(lambda x : x.split('/')[2]).astype(int)
columns.remove('tradeTime')
object_col.remove('tradeTime')
columns.extend(['tradeYear','tradeMonth','tradeDate'])
columns.remove('city')
object_col.remove('city')
columns.remove('tradeYear')
# print(columns)
# print(object_col)

In [5]:
# # 合并特征  
data['numRooms'] = data['room']+data['living']+data['toilet']
data['numTansportEquipment'] = data['subwayStationNum']+data['busStationNum']
data['numMedical'] = data['hospitalNum']+data['drugStoreNum']
data['numEducation'] = data['interSchoolNum']+data['schoolNum']+data['privateSchoolNum']
data['numLiving'] = data['gymNum']+data['parkNum']+data['bankNum']
data['numShop'] = data['shopNum']+data['mallNum']+data['superMarketNum']
# columns.extend(['numRooms','numTansportEquipment','numMedical','numEducation','numLiving','numShop'])

In [None]:
# # 根据房间数，所在楼层，朝向进行组合  0.88163  r2  0.9505506582265562
# gp = data.groupby('numRooms')['houseToward'].value_counts().rename('numRooms_houseToward_count',inplace=True)     
# data = pd.merge(data, gp, how='left', on=['numRooms','houseToward'])
# gp = data.groupby('numRooms')['houseFloor'].value_counts().rename('numRooms_houseFloor_count',inplace=True)     
# data = pd.merge(data, gp, how='left', on=['numRooms','houseFloor'])
# gp = data.groupby('houseToward')['houseFloor'].value_counts().rename('houseToward_houseFloor_count',inplace=True)     
# data = pd.merge(data, gp, how='left', on=['houseToward','houseFloor'])
# columns.extend(['numRooms_houseToward_count','numRooms_houseFloor_count','houseToward_houseFloor_count'])

In [6]:
# 根据区域groupby 0.88168  r2 0.9503471924087608 -> 0.952271276439161
gp = data.groupby('region')['numRooms'].mean().rename('region_numRooms_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numTansportEquipment'].mean().rename('region_numTansportEquipment_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numMedical'].mean().rename('region_numMedical_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numEducation'].mean().rename('region_numEducation_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numLiving'].mean().rename('region_numLiving_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['numShop'].mean().rename('region_numShop_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['area'].mean().rename('region_area_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['area'].std().rename('region_area_std',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['area'].median().rename('region_area_median',inplace=True)     
data = pd.merge(data, gp, how='left', on='region')
gp = data.groupby('region')['houseToward'].value_counts().rename('region_houseToward_count',inplace=True)
data = pd.merge(data, gp, how='left', on=['region','houseToward'])
columns.extend(['region_numRooms_mean','region_numTansportEquipment_mean','region_numMedical_mean','region_numEducation_mean',
               'region_numLiving_mean','region_numShop_mean','region_houseToward_count','region_area_mean','region_area_std',
               'region_area_median'])

In [7]:
# 根据小区信息groupby 0.88933  r2 0.9524753772367441 -> 0.9528857144503925
gp = data.groupby('communityName')['numRooms'].mean().rename('communityName_numRooms_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numTansportEquipment'].mean().rename('communityName_numTansportEquipment_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numMedical'].mean().rename('communityName_numMedical_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numEducation'].mean().rename('communityName_numEducation_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numLiving'].mean().rename('communityName_numLiving_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['numShop'].mean().rename('communityName_numShop_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['area'].mean().rename('communityName_area_mean',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['area'].std().rename('communityName_area_std',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')
gp = data.groupby('communityName')['area'].median().rename('communityName_area_median',inplace=True)     
data = pd.merge(data, gp, how='left', on='communityName')

gp = data.groupby('communityName')['houseToward'].value_counts().rename('communityName_houseToward_count',inplace=True)
data = pd.merge(data, gp, how='left', on=['communityName','houseToward'])
columns.extend(['communityName_numRooms_mean','communityName_numTansportEquipment_mean','communityName_numMedical_mean',
                'communityName_numEducation_mean','communityName_numLiving_mean','communityName_numShop_mean',
                'communityName_area_mean','communityName_area_std','communityName_area_median','communityName_houseToward_count'])


In [8]:
# Word2vec特征 0.89278 r2 0.9550027597748323 -> 0.9573779127898738  
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
save_path = './w2v' 

L = 10
sentence = []
for line in list(data[['communityName', 'plate', 'region', 'houseToward']].values):
    sentence.append([str(l) for idx, l in enumerate(line)])
    
model = Word2Vec(sentence, size=L, window=2, min_count=1, workers=multiprocessing.cpu_count(),iter=10)
for fea in ['communityName', 'plate', 'region', 'houseToward']:
    values = []
    for line in list(data[fea].values):
        values.append(line)
    values = set(values)
    w2v = []
    for i in values:
        a = [i]
        a.extend(model[str(i)])
        w2v.append(a)
    out_df = pd.DataFrame(w2v)

    name = [fea]
    for i in range(L):
        name.append(name[0] + 'W' + str(i))
    out_df.columns = name
    out_df.to_csv(save_path + '/' + fea + '.csv', index=False)

w2v_features = []
for col in ['communityName', 'plate', 'region', 'houseToward']:
    df = pd.read_csv(save_path + '/' + col + '.csv')
    df = df.drop_duplicates([col])
    fs = list(df)
    fs.remove(col)
    w2v_features += fs
    data = pd.merge(data, df, on=col, how='left')
columns.extend(w2v_features)

In [9]:
# 聚类特征  0.89295 r2 0.9555599679001301 -> 0.9561665920565924 提升一点点不知道这么用对不对
from sklearn.preprocessing import LabelEncoder
for i in object_col:
    lbl = LabelEncoder()
    data[i] = lbl.fit_transform(data[i])
    
from sklearn.mixture import GaussianMixture  
cv_types = ['spherical', 'tied', 'diag', 'full']
res = []
for cv_type in cv_types:
    gmm = GaussianMixture(n_components=3, covariance_type=cv_type)
    gmm.fit(data[columns].fillna(0))
    res.append(gmm.predict(data[columns].fillna(0)))
gm_df = pd.DataFrame(res).T
gm_df.rename(columns={0:'gm_spherical',1:'gm_tied',2:'gm_diag',3:'gm_full'},inplace=True)
data = pd.concat([data,gm_df],axis=1)
columns.extend(gm_df.columns.tolist())

In [10]:
%%time
params = {
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'min_child_samples':20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 1,
}

train = data[data[target] != -1][columns]
test = data[data[target] == -1][columns]
tar = data[data[target] != -1][target]

for col in object_col:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')
    
folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, train.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx], 
                           label=tar.iloc[trn_idx],
                           categorical_feature=object_col)
    val_data = lgb.Dataset(train.iloc[val_idx], 
                           label=tar.iloc[val_idx],
                           categorical_feature=object_col)


    num_round = 10000
    clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=2000, early_stopping_rounds = 200)
    
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
    
print("CV Score: {:<8.5f}".format(r2_score(tar, oof_lgb))) 

eval_data = pd.read_csv('./data/评分文件/sub_a_913.csv')
r2score = r2_score(eval_data['pre'],predictions_lgb)
print("线上R^2 score:",r2score)

fold 0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1409]	training's rmse: 783.083	valid_1's rmse: 1386.1
fold 1
Training until validation scores don't improve for 200 rounds.
[2000]	training's rmse: 795.757	valid_1's rmse: 1173.67
Early stopping, best iteration is:
[2790]	training's rmse: 732.228	valid_1's rmse: 1168.94
fold 2
Training until validation scores don't improve for 200 rounds.
[2000]	training's rmse: 764.426	valid_1's rmse: 1340.26
Early stopping, best iteration is:
[2216]	training's rmse: 746.356	valid_1's rmse: 1339.72
fold 3
Training until validation scores don't improve for 200 rounds.
[2000]	training's rmse: 787.021	valid_1's rmse: 1145.89
Early stopping, best iteration is:
[3464]	training's rmse: 684.332	valid_1's rmse: 1136.68
fold 4
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[1435]	training's rmse: 846.07	valid_1's rmse: 1263.08
CV Score: 0.88976 
线上R^2 scor

In [25]:
params_lgb = {
    'max_depth': range(5,15,2),
    'num_leaves': range(10,40,5),
    'learning_rate':[0.01,0.05,0.1]
}
estimator = lgb.LGBMRegressor(
    n_estimators = 1000, 
    objective = 'regression', 
    min_child_weight = 1, 
    subsample = 0.8,
    colsample_bytree=0.8,
    nthread = 8,
)
gsearch = GridSearchCV(estimator , param_grid = params_lgb, scoring='r2', cv=3,n_jobs=10)
gsearch.fit(train, tar)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.8,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=1, min_split_gain=0.0,
       n_estimators=1000, n_jobs=-1, nthread=8, num_leaves=31,
       objective='regression', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'max_depth': range(5, 15, 2), 'learning_rate': [0.01, 0.05, 0.1], 'num_leaves': range(10, 40, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=0)

In [26]:
print(gsearch.best_params_)
print(gsearch.best_score_)
est = gsearch.best_estimator_
eval_data = pd.read_csv('./data/评分文件/sub_a_913.csv')
r2score = r2_score(eval_data['pre'],est.predict(test))
print(r2score)

{'max_depth': 11, 'learning_rate': 0.05, 'num_leaves': 20}
0.8722676161713834
0.9515421967765794


xgb

In [33]:
for col in object_col:
    train[col] = train[col].astype('float64')
    test[col] = test[col].astype('float64')
    
params_xgb = {
    'max_depth': range(5,15,2),  # 最大深度
    'eta': [0.01,0.05,0.1],  # 学习率

    }

estimator = xgb.XGBRegressor(
    n_estimators = 1000, 
    objective = 'reg:gamma', 
    min_child_weight = 1, 
    subsample = 0.8,
    nthread = 8,
    colsample_bytree = 0.7,
    silent = 0,
    eval_metric = 'rmse'
)
gsearch = GridSearchCV(estimator , param_grid = params_xgb, scoring='r2', cv=3, n_jobs=10)
gsearch.fit(train, tar)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, eval_metric='rmse', gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=8, objective='reg:gamma', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=0,
       subsample=0.8),
       fit_params=None, iid='warn', n_jobs=10,
       param_grid={'max_depth': range(5, 15, 2), 'eta': [0.01, 0.05, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=0)

In [35]:
print(gsearch.best_params_)
print(gsearch.best_score_)
est = gsearch.best_estimator_
eval_data = pd.read_csv('./data/评分文件/sub_a_913.csv')
r2score = r2_score(eval_data['pre'],est.predict(test))
print(r2score)

{'max_depth': 5, 'eta': 0.01}
0.8634675175801168
0.9457857720926782


In [34]:
### 真的是太慢了，甚至还没有开始指定的参数效果好