删除分布异常的特征：V5、V9、V11、V17、V21、V22、V28。

先用 Lasso 在二次多项式特征上拟合，再用随机森林在原始（一次）特征上拟合，最后将两个模型的预测结果加权融合。

每个模型在训练之前，都要先删除由该模型判定出的异常点。

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [24]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [25]:
# Load the data and drop the abnormally distributed features.
# Returns the training X, training y and the test set.
def load_data(cols=None, train_path='zhengqi_train.txt', test_path='zhengqi_test.txt'):
    """Read the train/test files and remove the selected feature columns.

    Args:
        cols: Column names to drop from both frames. ``None`` (the default)
            drops the original fixed set V5, V9, V11, V17, V21, V22, V28,
            which the notebook describes as abnormally distributed. Pass an
            empty list to keep every feature. The list must not contain
            'V0' or 'V37', because those two labels bound the feature slice.
        train_path: Tab-separated training file; must contain a ``target``
            column.
        test_path: Tab-separated test file.

    Returns:
        A tuple ``(x_train, y_train, test)``: the training columns from
        'V0' through 'V37' that survive the drop, the ``target`` column, and
        the test frame after the same drop.
    """
    if cols is None:
        cols = ['V5', 'V9', 'V11', 'V17', 'V21', 'V22', 'V28']
    # Alternative, wider candidate list kept for reference:
    # ['V5','V9','V11','V14','V17','V21','V22','V25','V26','V28','V32','V33','V34']
    cols = list(cols)
    train = pd.read_csv(train_path, sep='\t')
    test = pd.read_csv(test_path, sep='\t')
    train = train.drop(cols, axis=1)
    test = test.drop(cols, axis=1)
    x_train = train.loc[:, 'V0':'V37']
    y_train = train['target']
    return x_train, y_train, test

In [26]:
# Given a model, X and y, return the index labels of the outlier rows.
def outliers(model, x, y, sigma=3):
    """Fit ``model`` on ``(x, y)`` and flag rows with extreme residuals.

    The residual ``y - prediction`` is standardised with its own mean and
    standard deviation; rows whose absolute z-score exceeds ``sigma`` are
    reported. The flagged labels are also printed.

    Args:
        model: Estimator exposing ``fit(x, y)`` and ``predict(x)``; it is
            fitted in place.
        x: Training features.
        y: Training target as a pandas Series (its index labels the rows).
        sigma: Z-score threshold above which a row counts as an outlier.

    Returns:
        The pandas Index of the flagged rows.
    """
    model.fit(x, y)
    fitted = pd.Series(model.predict(x), index=y.index)
    residual = y - fitted
    z_score = (residual - residual.mean()) / residual.std()
    flagged = z_score.loc[z_score.abs() > sigma].index
    print(len(flagged), 'outliers:')
    print(flagged)
    print()
    return flagged

In [27]:
# Given a model, a candidate parameter grid and the training set, return the
# tuned model and print the selected parameters.
def train_model(model, params, x_train, y_train):
    """Grid-search ``params`` for ``model`` and return the best estimator.

    The search uses 4-fold cross-validation scored by negative mean squared
    error. The estimator class name, the best score rounded to four decimals
    and the best parameters are printed.

    Args:
        model: Estimator to tune.
        params: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
        x_train: Training features.
        y_train: Training target.

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(model, param_grid=params,
                          scoring='neg_mean_squared_error', cv=4)
    search.fit(x_train, y_train)
    tuned = search.best_estimator_
    print(tuned.__class__.__name__, ':', round(search.best_score_, 4), search.best_params_)
    print()
    return tuned

In [28]:
# Given a tuned model, the test set and the training set, return predictions.
def y_pre(model, test, x_train, y_train):
    """Fit ``model`` on the training data and predict ``test``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; fitted in place.
        test: Feature rows to predict.
        x_train: Training features.
        y_train: Training target.

    Returns:
        A pandas DataFrame wrapping the predictions.
    """
    model.fit(x_train, y_train)
    return pd.DataFrame(model.predict(test))

In [29]:
# Drop the outlier rows and return the reduced training set.
def del_outs(outs, x, y):
    """Remove the rows labelled ``outs`` from both ``x`` and ``y``.

    Args:
        outs: Index labels of the rows to drop.
        x: Training features (pandas object supporting ``drop``).
        y: Training target (pandas object supporting ``drop``).

    Returns:
        A tuple ``(x, y)`` of new objects without the dropped rows; the
        inputs are not modified.
    """
    return tuple(part.drop(outs) for part in (x, y))

In [30]:
# Apply a polynomial expansion (degree 2 by default) to the features and
# return the expanded training and test sets.
def poly_features(x, t, deg=2):
    """Expand training and test features with one shared transformer.

    The transformer is fitted on the training features only and then applied
    to the test features with ``transform``. The original code called
    ``fit_transform`` on the test set as well, which re-fits on test data
    and would silently accept a test set with a different column layout.

    Row labels of the inputs are preserved when they have an ``index``
    attribute, so label-based operations on the matching target still line
    up; inputs without an index get the default integer index.

    Args:
        x: Training features.
        t: Test features with the same columns as ``x``.
        deg: Polynomial degree.

    Returns:
        A tuple ``(x, t)`` of DataFrames holding the expanded features.
    """
    poly = PolynomialFeatures(degree=deg)
    x_poly = pd.DataFrame(poly.fit_transform(x), index=getattr(x, 'index', None))
    # Reuse the transformer fitted on the training data; never re-fit on test.
    t_poly = pd.DataFrame(poly.transform(t), index=getattr(t, 'index', None))
    return x_poly, t_poly

In [41]:
######################### Lasso
# First training pass: start from a Lasso with default hyperparameters.
lasso_model=Lasso()

In [42]:
# Reload the data, expand both sets to degree-2 polynomial features, then
# drop the training rows that the current Lasso flags as residual outliers.
x_train,y_train,test=load_data()
x_train, test = poly_features(x_train, test, deg=2)
outs=outliers(lasso_model, x_train, y_train)
x_train,y_train=del_outs(outs,x_train,y_train)

27 outliers:
Int64Index([ 344,  401,  430,  843,  874,  884,  903,  909,  921, 1038, 1077,
            1147, 1485, 1547, 1548, 1679, 1864, 1870, 1882, 1894, 1933, 2590,
            2652, 2767, 2768, 2790, 2840],
           dtype='int64')



In [43]:
# Candidate alphas from 0.011 up to (but not including) 0.020 in steps of
# 0.001; the exact end point is subject to floating-point rounding.
p={'alpha':np.arange(0.011, 0.020, 0.001)}

# Tune on the outlier-free training set and keep the best estimator.
lasso_model=train_model(lasso_model,p,x_train,y_train)

Lasso : -0.1291 {'alpha': 0.015000000000000003}



In [44]:
# Use the tuned model to detect outliers again, then train a second time.
# alpha=0.015 is the best value printed by the first grid search.
lasso_model=Lasso(alpha=0.015)

In [45]:
# Start again from the full data so outliers are judged by the tuned Lasso
# rather than by the default one used in the first pass.
x_train,y_train,test=load_data()
x_train, test = poly_features(x_train, test, deg=2)
outs=outliers(lasso_model, x_train, y_train)
x_train,y_train=del_outs(outs,x_train,y_train)

36 outliers:
Int64Index([ 321,  344,  693,  776,  777, 1046, 1069, 1085, 1141, 1145, 1164,
            1310, 1311, 1523, 1704, 1874, 1934, 1979, 2002, 2160, 2211, 2264,
            2274, 2279, 2620, 2645, 2647, 2667, 2668, 2669, 2696, 2697, 2769,
            2807, 2842, 2863],
           dtype='int64')



In [46]:
# Wider alpha range for the second search: 0.005 up to (but not including)
# 0.020 in steps of 0.001, subject to floating-point rounding.
p={'alpha':np.arange(0.005, 0.020, 0.001)}

lasso_model=train_model(lasso_model,p,x_train,y_train)

Lasso : -0.1067 {'alpha': 0.012}



In [47]:
# Refit the tuned Lasso on the cleaned training set and predict the
# polynomial-expanded test set.
Lasso_pre=y_pre(lasso_model,test,x_train,y_train)

In [48]:
########################################### RFR ######
# First pass: random forest with default hyperparameters.
rf_model=RandomForestRegressor()

In [52]:
# Reload the data (no polynomial expansion for the forest) and drop the
# training rows that the current random forest flags as residual outliers.
x_train,y_train,test=load_data()
outs=outliers(rf_model,x_train,y_train)
x_train,y_train=del_outs(outs,x_train,y_train)

37 outliers:
Int64Index([  70,  321,  693,  715,  776,  805,  809, 1036, 1064, 1125, 1140,
            1145, 1146, 1294, 1511, 1523, 1684, 1704, 1878, 1901, 1936, 1950,
            2158, 2159, 2160, 2255, 2270, 2279, 2607, 2619, 2620, 2647, 2667,
            2696, 2770, 2801, 2807],
           dtype='int64')



In [55]:
# First random-forest search: n_estimators in {120, 130, 140} and max_depth
# in {None, 11, 12, 13, 14}. The commented entries are other grids kept for
# reference.
p={'n_estimators':range(120,150,10),
    'max_depth':[None,11,12,13,14,],
    #'n_estimators':[120],'max_depth':[13],
    #'min_samples_split':range(6,12,1)
  }

rf_model=train_model(rf_model,p,x_train,y_train)

RandomForestRegressor : -0.1269 {'max_depth': 14, 'n_estimators': 140}



In [56]:
# Use the tuned model to detect outliers again, then train a second time.
# max_depth and n_estimators match the first grid-search output;
# NOTE(review): min_samples_split=6 was not part of that search and appears
# to be set by hand.
rf_model=RandomForestRegressor(max_depth=14, min_samples_split=6, n_estimators=140)

In [57]:
# Start again from the full data so outliers are judged by the configured
# random forest rather than by the default one.
x_train,y_train,test=load_data()
outs=outliers(rf_model,x_train,y_train)
x_train,y_train=del_outs(outs,x_train,y_train)

39 outliers:
Int64Index([ 344,  348,  376,  419,  771,  776,  777,  843,  884, 1128, 1140,
            1145, 1164, 1311, 1412, 1458, 1476, 1704, 1934, 2166, 2211, 2264,
            2274, 2279, 2592, 2620, 2645, 2647, 2655, 2667, 2668, 2669, 2696,
            2697, 2769, 2800, 2801, 2807, 2863],
           dtype='int64')



In [60]:
# Second random-forest search: n_estimators and max_depth are pinned to a
# single value each, so only min_samples_split (3 through 11) is searched.
# The commented entries are a wider grid kept for reference.
p={#'n_estimators':range(80,130,10),
    #'max_depth':[None,11,12,13,14,15,],
    'n_estimators':[110],
    'max_depth':[14],
    'min_samples_split':range(3,12,1)
  }

rf_model=train_model(rf_model,p,x_train,y_train)

RandomForestRegressor : -0.1179 {'max_depth': 14, 'min_samples_split': 4, 'n_estimators': 110}



In [61]:
# Refit the tuned random forest on the cleaned training set and predict the
# test set.
RF_pre=y_pre(rf_model,test,x_train,y_train)

In [62]:
# Weighted blend: 70% Lasso prediction + 30% random-forest prediction.
# NOTE(review): the trailing 0.1220 is presumably the score obtained with
# this blend — confirm.
y=0.7*Lasso_pre+0.3*RF_pre#0.1220

In [None]:
# Write the blended predictions as a tab-separated file without the index
# column or a header row.
y.to_csv('average2.0.txt',sep='\t',index=False,header=False)

第二种方案：不删除任何特征，用二次多项式特征上的 Lasso 与原始（未做二次化）特征上的 ETR（ExtraTreesRegressor）做加权融合。

In [None]:
def load_data_():
    """Load the train/test sets without dropping any feature.

    Same as ``load_data`` except that the column-drop step is disabled, so
    every column from 'V0' through 'V37' is kept.

    Returns:
        A tuple ``(x_train, y_train, test)``: the training columns 'V0'
        through 'V37', the ``target`` column, and the unmodified test frame.
    """
    train_df = pd.read_csv('zhengqi_train.txt', sep='\t')
    test_df = pd.read_csv('zhengqi_test.txt', sep='\t')
    # No columns are removed here; the drop lists used by the other variant
    # (['V5','V9','V11','V17','V21','V22','V28'] and the wider alternative)
    # are intentionally not applied.
    features = train_df.loc[:, 'V0':'V37']
    return features, train_df['target'], test_df

In [None]:
######################### Lasso, degree-2 features
# Starting model for the all-features run, with a hand-set alpha; the
# commented line is the default-hyperparameter alternative.
lasso_model=Lasso(alpha=0.009)
#lasso_model=Lasso()

In [None]:
# Load all features, expand both sets to degree-2 polynomial features, then
# drop the training rows that the Lasso flags as residual outliers.
x_train,y_train,test=load_data_()
x_train, test = poly_features(x_train, test, deg=2)
outs=outliers(lasso_model, x_train, y_train)
x_train,y_train=del_outs(outs,x_train,y_train)

In [None]:
# Candidate alphas from 0.005 up to (but not including) 0.020 in steps of
# 0.001, subject to floating-point rounding.
p={'alpha':np.arange(0.005, 0.020, 0.001)}

lasso_model=train_model(lasso_model,p,x_train,y_train)

In [None]:
# Refit the tuned Lasso on the cleaned training set and predict the
# polynomial-expanded test set.
Lasso_pre=y_pre(lasso_model,test,x_train,y_train)

In [None]:
################################ ETR
# ExtraTreesRegressor is not imported by the notebook's import cells, so it
# is imported here; without this the cell fails with a NameError.
from sklearn.ensemble import ExtraTreesRegressor

# Hand-set starting hyperparameters, used for the outlier-detection fit.
et_model=ExtraTreesRegressor(max_depth=18,n_estimators=200,
                             max_features=0.9,min_samples_split=5,min_samples_leaf=2)
# Default-hyperparameter alternative:
#et_model=ExtraTreesRegressor()

In [None]:
# Load all features (no polynomial expansion for the tree model) and drop
# the training rows that the extra-trees model flags as residual outliers.
x_train,y_train,test=load_data_()
outs=outliers(et_model,x_train,y_train)
x_train,y_train=del_outs(outs,x_train,y_train)

In [None]:
# Extra-trees search: max_depth, n_estimators and max_features are pinned to
# one value each; min_samples_split (2 through 7) and min_samples_leaf
# (1 through 6) are searched. The commented entries are other grids kept for
# reference.
p={
#    'max_depth':range(16,20,1),'n_estimators':np.arange(180,230,10),
    'max_depth':[19],'n_estimators':[200],
    'max_features':[0.8,],'min_samples_split':range(2,8,1),'min_samples_leaf':range(1,7,1),
#    'max_features':[0.7],'min_samples_split':[4],'min_samples_leaf':[2],
  }

et_model=train_model(et_model,p,x_train,y_train)

In [None]:
# Refit the tuned extra-trees model on the cleaned training set and predict
# the test set.
ETR_pre=y_pre(et_model,test,x_train,y_train)

In [None]:
# Weighted blend: 70% Lasso prediction + 30% extra-trees prediction.
y=0.7*Lasso_pre+0.3*ETR_pre

In [None]:
# Write the blended predictions as a tab-separated file without the index
# column or a header row.
y.to_csv('average3.0.txt',sep='\t',index=False,header=False)