In [26]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import scipy.special as jn
warnings.filterwarnings('ignore')
%matplotlib inline

# sklearn 预测
from sklearn import linear_model,preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

# 数据降维度

from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import xgboost as xgb
import lightgbm as lgb
# 模型评估指标，参数搜索
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [27]:
train_data = pd.read_csv('./data/train_v1.csv')
test_data = pd.read_csv('./data/testA_v1.csv')
features = [col for col in train_data if col not in ['SaleID','name','price','brand']]

In [28]:
x_train = train_data[features]
y_train = train_data['price']
x_test = test_data[features]

In [31]:
def xgb_model(x_data,y_data):
    xgbModel = xgb.XGBRegressor(n_estimators=3000,learning_rate= 0.03,max_depth=7,
                                subsample=0.6,colsample_bytree=0.6,
                                gamma=0.5,reg_alpha=1,reg_lambda = 1,
                                objective='reg:squarederror')
#     param = {
#         'learning_rate':[0.01,0.1]
#     }
#     xgbModel = GridSearchCV(estimate,param)
    xgbModel.fit(x_data,y_data)
    return xgbModel
def lgb_model(x_data,y_data):
    y_data = np.log(1+y_data)
    lgbModel = lgb.LGBMRegressor(n_estimators=300,learning_rate = 0.03,max_depth = 11,num_leaves= 200,
                                 subsample=0.6,colsample_bytree=0.6,lambda_l2 = 1,lambda_l1 = 1)
#     param = {
#         'learning_rate':[0.01,0.1]
#     }
#     lgbModel = GridSearchCV(estimate,param)
    lgbModel.fit(x_data,y_data)
    return lgbModel    

In [32]:
# 5 折交叉验证
score_train = []
score = []
sk = StratifiedKFold(n_splits= 5,shuffle=True,random_state=0)
for train_ind,val_ind in sk.split(x_train,y_train):
    x_data = x_train.iloc[train_ind,:]
    y_data = y_train.iloc[train_ind]
    x_val = x_train.iloc[val_ind,:]
    y_val = y_train.iloc[val_ind]
    
    lgbReg = lgb_model(x_data,y_data)
    score_train.append(mean_absolute_error(y_data,np.exp(lgbReg.predict(x_data))-1))
    score.append(mean_absolute_error(y_val,np.exp(lgbReg.predict(x_val))-1))
    print(score_train)
    print(score)
print("training MAE",np.mean(score_train))
print('val MAE',np.mean(score))


[532.663805337315]
[600.4327619049561]
[532.663805337315, 535.1792437403592]
[600.4327619049561, 605.2740693953159]
[532.663805337315, 535.1792437403592, 532.7023820151663]
[600.4327619049561, 605.2740693953159, 601.6219618685772]
[532.663805337315, 535.1792437403592, 532.7023820151663, 534.4891041564232]
[600.4327619049561, 605.2740693953159, 601.6219618685772, 603.3614241691288]
[532.663805337315, 535.1792437403592, 532.7023820151663, 534.4891041564232, 536.2094598017835]
[600.4327619049561, 605.2740693953159, 601.6219618685772, 603.3614241691288, 589.3310259098486]
training MAE 534.2487990102094
val MAE 600.0042486495653


In [10]:
# 划分单一验证集合，做验证
train_x ,val_x,train_y,val_y = train_test_split(x_train,y_train,test_size = 0.3)

In [18]:
xgbReg = xgb_model(train_x,train_y)
pre_xgb = xgbReg.predict(val_x)
MAE_xgb = mean_absolute_error(val_y,pre_xgb)
print('XGB_MAE',MAE_xgb)

XGB_MAE 530.6346084688955


In [19]:
lgbReg = lgb_model(train_x,train_y)
pre_lgb = lgbReg.predict(val_x
                        )
MAE_lgb = mean_absolute_error(val_y,pre_lgb)
print('LGB_MAE',MAE_lgb)

LGB_MAE 518.5449946351138


In [20]:
val_blend = pre_xgb * (MAE_lgb/(MAE_lgb+MAE_xgb)) + pre_lgb * (MAE_xgb/(MAE_xgb+MAE_lgb))
MAE_blend = mean_absolute_error(val_y,val_blend)
print("Blend_MAE",MAE_blend)

Blend_MAE 511.1442566400626


In [21]:
#全部训练集合训练一遍，在测集合上做测试 
print('Training XGB')
xgbReg = xgb_model(x_train,y_train)
testA_xgb = xgbReg.predict(x_test)
print("Training LGB")
lgbReg = lgb_model(x_train,y_train)
testB_lgb = lgbReg.predict(x_test)

Training XGB
Training LGB


In [24]:
testA_blend = testA_xgb * (MAE_lgb/(MAE_lgb+MAE_xgb)) + testB_lgb * (MAE_xgb/(MAE_xgb+MAE_lgb))
testA_blend[testA_blend<0] = 10

submissionA_v4 = pd.DataFrame(np.concatenate([test_data['SaleID'][:,np.newaxis],
                                             testA_blend[:,np.newaxis]],axis = 1),
                              columns = ['SaleID','price'])


In [25]:
submissionA_v4.head()
submissionA_v4.to_csv('./data/submissionA_v4.csv',index = False)