In [51]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor 
from sklearn import cross_validation
from scipy import sparse
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV
import warnings
import time
import sys
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

In [52]:
train = pd.read_csv('jinnan_round1_train_20181227.csv',encoding='gbk')
test = pd.read_csv('jinnan_round1_testA_20181227.csv',encoding='gbk')

In [53]:
# 删除类别唯一的特征
for df in [train, test]:
    df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)

In [54]:
# 删除某一类别占比超过90%的列
good_cols = list(train.columns)
for col in train.columns:
    rate = train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.9:
        good_cols.remove(col)
        print(col,rate)

# 删除异常值
train = train[train['收率']>0.87]
        
train = train[good_cols]
good_cols.remove('收率')
test  = test[good_cols]

A1 0.9863896848137536
A2 0.9699140401146131
A3 0.9570200573065902
A4 0.9570200573065902
B2 0.9842406876790831


In [55]:
# 合并数据集
target = train['收率']
del train['收率']
data = pd.concat([train,test],axis=0,ignore_index=True)
data = data.fillna(-1)

In [56]:
def timeTranSecond(t):
    try:
        t,m,s=t.split(":")
    except:
        if t=='1900/1/9 7:00':
            return 7*3600/3600
        elif t=='1900/1/1 2:30':
            return (2*3600+30*60)/3600
        elif t==-1:
            return -1
        else:
            return 0
    
    try:
        tm = (int(t)*3600+int(m)*60+int(s))/3600
    except:
        return (30*60)/3600
    
    return tm
for f in ['A5','A7','A9','A11','A14','A16','A24','A26','B5','B7']:
    try:
        data[f] = data[f].apply(timeTranSecond)
    except:
        print(f,'应该在前面被删除了！')

def getDuration(se):
    try:
        sh,sm,eh,em=re.findall(r"\d+\.?\d*",se)
    except:
        if se == -1:
            return -1 
        
    try:
        if int(sh)>int(eh):
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600 + 24
        else:
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600
    except:
        if se=='19:-20:05':
            return 1
        elif se=='15:00-1600':
            return 1
    
    return tm
for f in ['A20','A28','B4','B9','B10','B11']:
    data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)

In [57]:
data['样本id'] = data['样本id'].apply(lambda x: int(x.split('_')[1]))

categorical_columns = [f for f in data.columns if f not in ['样本id']]
numerical_columns = [f for f in data.columns if f not in categorical_columns]

In [58]:
#label encoder
for f in categorical_columns:
    data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
train = data[:train.shape[0]]
test  = data[train.shape[0]:]
print(train.shape)
print(test.shape)

(1381, 33)
(150, 33)


In [59]:
#train['target'] = list(target) 
train['target'] = target
train['intTarget'] = pd.cut(train['target'], 5, labels=False)
train = pd.get_dummies(train, columns=['intTarget'])
li = ['intTarget_0.0','intTarget_1.0','intTarget_2.0','intTarget_3.0','intTarget_4.0']
mean_columns = []
for f1 in categorical_columns:
    cate_rate = train[f1].value_counts(normalize=True, dropna=False).values[0]
    if cate_rate < 0.90:
        for f2 in li:
            col_name = 'B14_to_'+f1+"_"+f2+'_mean'
            mean_columns.append(col_name)
            order_label = train.groupby([f1])[f2].mean()
            train[col_name] = train['B14'].map(order_label)
            miss_rate = train[col_name].isnull().sum() * 100 / train[col_name].shape[0]
            if miss_rate > 0:
                train = train.drop([col_name], axis=1)
                mean_columns.remove(col_name)
            else:
                test[col_name] = test['B14'].map(order_label)
                
train.drop(li+['target'], axis=1, inplace=True)
print(train.shape)
print(test.shape)

(1381, 108)
(150, 108)


In [60]:
X_train = train[mean_columns+numerical_columns].values
X_test = test[mean_columns+numerical_columns].values
# one hot
enc = OneHotEncoder()
for f in categorical_columns:
    enc.fit(data[f].values.reshape(-1, 1))
    X_train = sparse.hstack((X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
    X_test = sparse.hstack((X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')
print(X_train.shape)
print(X_test.shape)

(1381, 1245)
(150, 1245)


In [61]:
y_train = target.values

In [62]:
param = {'num_leaves': 125,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.008,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}
folds = KFold(n_splits=3, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target)))

fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000229624	valid_1's l2: 0.000273472
[400]	training's l2: 0.00015443	valid_1's l2: 0.000198242
[600]	training's l2: 0.000126734	valid_1's l2: 0.000169483
[800]	training's l2: 0.000114587	valid_1's l2: 0.000157977
[1000]	training's l2: 0.000107763	valid_1's l2: 0.000152152
[1200]	training's l2: 0.00010316	valid_1's l2: 0.000148404
[1400]	training's l2: 9.99909e-05	valid_1's l2: 0.000145559
[1600]	training's l2: 9.7583e-05	valid_1's l2: 0.000143689
[1800]	training's l2: 9.54951e-05	valid_1's l2: 0.000142104
[2000]	training's l2: 9.37515e-05	valid_1's l2: 0.000140864
[2200]	training's l2: 9.23157e-05	valid_1's l2: 0.000139799
[2400]	training's l2: 9.15814e-05	valid_1's l2: 0.00013928
Early stopping, best iteration is:
[2312]	training's l2: 9.15814e-05	valid_1's l2: 0.00013928
fold n°2
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000246916	valid_1's l2: 0.0

In [63]:
def get_best_params(x_,y_,params_):
    gbp = GridSearchCV(estimator = lgb.LGBMModel(boosting_type='gbdt',learning_rate=0.03,n_estimator=10,num_leaves=35,
                                                  objective='regression',subsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0,
                                                   random_state=None, n_jobs=-1, silent=True),
                        param_grid = params_, scoring='neg_mean_squared_error',cv=5)
    gbp.fit(x_,y_)
    print(gbp.best_score_,gbp.grid_scores_,gbp.best_params_)
    return gbp.best_params_


def get_rmse(x_t,y_t,model_name):
    y_p = model_name.predict(x_t)
    rmse =  mean_squared_error(y_t, y_p)
    return rmse

In [64]:
model_lgb = lgb.LGBMModel(boosting_type='gbdt',learning_rate=0.04,n_estimator=10,num_leaves=35,max_depth = 20,
                           objective='regression',subsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0,
                           random_state=None, n_jobs=-1, silent=True).fit(X_train,y_train)

cross_validation.cross_val_score(model_lgb, X_train,y_train, n_jobs=-1,scoring='neg_mean_squared_error')

array([-0.00015122, -0.00012757, -0.00016375])

In [65]:
xgb_pre = clf.predict(X_train)
lgb_pre = model_lgb.predict(X_train)
stack = pd.DataFrame()
stack['xgb'] = xgb_pre
stack['lgb'] = lgb_pre

rg = RidgeCV(cv = 5).fit(stack,y_train)
print('mse =  %.8f' %mean_squared_error(y_train,rg.predict(stack)))
-cross_validation.cross_val_score(rg,stack,y_train, n_jobs=-1,scoring='neg_mean_squared_error')

mse =  0.00008140


array([9.78751662e-05, 7.17664605e-05, 8.31716716e-05])

In [67]:
xgb_test = clf.predict(X_test)
lgb_test = model_lgb.predict(X_test)

stack_pre = pd.DataFrame()
stack_pre['xgb'] = xgb_test
stack_pre['lgb'] = lgb_test

result = rg.predict(stack_pre)
result = pd.DataFrame(result)
result = result.apply(lambda x:round(x,3))
result.to_csv('fff2.csv',header=None,index=False)