In [74]:

import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
from dateutil.parser import parse
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)
        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")
        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
       
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue `The rows are marked `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out
data_path = 'datas/'

train = pd.read_csv(data_path+'train.csv',encoding='gb2312')
test = pd.read_csv(data_path+'testA.csv',encoding='gb2312')

def make_feat(train,test):
    data = pd.concat([train,test])
    data.drop("id",axis=1,inplace=True)
    data['性别'] = data['性别'].map({'男':1,'女':0})
    data['体检日期'] = (pd.to_datetime(data['体检日期']) - parse('2017-09-20')).dt.days
    data.fillna(data.median(axis=0),inplace=True)
    train_feat = data[:5642]
    test_feat = data[5642:]
    
    return train_feat,test_feat

def evalerror(pred, df):
    label = df.get_label().values.copy()
    score = mean_squared_error(label,pred)*0.5
    return ('0.5mse',score,False)

In [36]:
train.columns.values

array(['id', '性别', '年龄', '体检日期', '*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶',
       '*r-谷氨酰基转换酶', '*总蛋白', '白蛋白', '*球蛋白', '白球比例', '甘油三酯', '总胆固醇',
       '高密度脂蛋白胆固醇', '低密度脂蛋白胆固醇', '尿素', '肌酐', '尿酸', '乙肝表面抗原', '乙肝表面抗体',
       '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体', '白细胞计数', '红细胞计数', '血红蛋白', '红细胞压积',
       '红细胞平均体积', '红细胞平均血红蛋白量', '红细胞平均血红蛋白浓度', '红细胞体积分布宽度', '血小板计数',
       '血小板平均体积', '血小板体积分布宽度', '血小板比积', '中性粒细胞%', '淋巴细胞%', '单核细胞%',
       '嗜酸细胞%', '嗜碱细胞%', '血糖'], dtype=object)

In [14]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [15]:
train_feat,test_feat = make_feat(train,test)

In [75]:
train_feat,test_feat = make_feat(train,test)
predictors = [f for f in train_feat.columns if f not in ['血糖']]

In [77]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class FeaturesSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
class age_Stratifie(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X["年龄分段"] = np.ceil(X["年龄"] / 10)
        X["年龄分段"].where(X["年龄分段"] <7, 7.0, inplace=True)
        X["年龄分段"].where(X["年龄分段"] >3, 3.0, inplace=True)
        return X[self.attribute_names].values
class date_encoded(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        enc = OneHotEncoder()
        if len(X[attribute_names])>5000:
            pass
        
        
        
        return X[self.attribute_names].values

In [80]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
num_feature=['*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶',
       '*r-谷氨酰基转换酶', '*总蛋白', '白蛋白', '*球蛋白', '白球比例', '甘油三酯', '总胆固醇',
       '高密度脂蛋白胆固醇', '低密度脂蛋白胆固醇', '尿素', '肌酐', '尿酸', '白细胞计数', '红细胞计数', '血红蛋白', '红细胞压积',
       '红细胞平均体积', '红细胞平均血红蛋白量', '红细胞平均血红蛋白浓度', '红细胞体积分布宽度', '血小板计数',
       '血小板平均体积', '血小板体积分布宽度', '血小板比积', '中性粒细胞%', '淋巴细胞%', '单核细胞%',
       '嗜酸细胞%', '嗜碱细胞%']
non_mun_feature=['性别','体检日期']
age_feature=['年龄']
from sklearn.pipeline import Pipeline
data_clean=Pipeline([
    ("FeaturesSelector",FeaturesSelector(num_feature)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
data_encode = Pipeline([
        ('FeaturesSelector', FeaturesSelector(non_mun_feature)),
        ('encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])
age_clean=Pipeline([
    ("age_Stratifie",age_Stratifie(age_feature)),
    ('encoder', CategoricalEncoder(encoding="onehot-dense")),
])
full_pipeline = FeatureUnion(
    transformer_list=[
        ("data_clean", data_clean),
        ("data_encode", data_encode),
    ])

In [82]:
train_labels=train["血糖"]
train.drop("血糖",axis=1,inplace=True)

In [93]:
train_labels[25]

4.6399999999999997

In [83]:
data=pd.concat([train,test])

In [84]:
data.shape

(6641, 41)

In [85]:
data_prepared = full_pipeline.fit_transform(data)

In [88]:
X_train=data_prepared[:5642]
X_test=data_prepared[5642:]

In [97]:
X_train.shape

(5642, 64)

In [87]:
data_prepared.shape

(6641, 64)

In [90]:
print('开始训练...')
params = {
     'num_leaves': 50,
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'sub_feature': 0.7,
    'colsample_bytree': 0.7,
    'feature_fraction': 0.7,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': -1,
}

print('开始CV 5折训练...')

train_preds = np.zeros(train_feat.shape[0])

test_preds = np.zeros((test_feat.shape[0], 5))

kf = KFold(len(train_feat), n_folds = 5, shuffle=True, random_state=520)

for i, (train_index, test_index) in enumerate(kf):
    print('第{}次训练...'.format(i))
    train_feat1 = X_train[train_index]
    train_feat2 = X_train[test_index]
    lgb_train1 = lgb.Dataset(train_feat1, train_labels[train_index])
    lgb_train2 = lgb.Dataset(train_feat2, train_labels[test_index])
    gbm = lgb.train(params,
                    lgb_train1,
                    num_boost_round=3000,
                    valid_sets=lgb_train2,
                    verbose_eval=100,
                    feval=evalerror,
                    early_stopping_rounds=100)
    feat_imp = pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False)
    train_preds[test_index] += gbm.predict(train_feat2)
    test_preds[:,i] = gbm.predict(X_test)
print('线下得分：    {}'.format(mean_squared_error(train_labels[train_index],train_preds)*0.5))


submission = pd.DataFrame({'pred':test_preds.mean(axis=1)})


submission.to_csv("submit.csv",header=None,index=False)

开始训练...
开始CV 5折训练...
第0次训练...


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 35.2216	valid_0's 0.5mse: 17.6108
Early stopping, best iteration is:
[1]	valid_0's l2: 35.2216	valid_0's 0.5mse: 17.6108


ValueError: Wrong number of items passed 64, placement implies 40