1. 编写一个类，支持读取文件，自动将train和test的数据集合并在一起，并且通过flag区分，支持指定label来自动生成y
2. 可以定义几种常用的处理方式来处理特征，例如标准化和onehotcode等
3. 自动分析变量和画图、慢慢来
4. 能够设定评价指标后，自动进行CV测试
5. 能够自动搜索最佳超参数，然后选取最好的那个
6. 支持导出结果文件
7. 支持多模型比较和融合

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection  import cross_val_score
import xgboost as xgb

In [68]:
class MLPipe:
    # type_c_or_r: c  表示分类模型，r 表示回归模型
    def __init__(self, train_file, test_file, y_col, id_col, type_c_or_r):
        self.train_file = train_file
        self.test_file = test_file
        self.y_col = y_col
        self.id_col = id_col
        self.type_c_or_r = type_c_or_r
        self.col_is_train = 'col_is_train'
        self.mod_list = [] 
        self.rand_state = 43
        self.ft_name = '' # 数据集名称 
    
    # 用来标识数据集的名字，主要用来输出结果文件名
    def set_ft_name(self, ft_name):
        self.ft_name = ft_name
        
    # 读取文件，然后合并两个文件
    def parse_file(self):
        self.src_train_df = pd.read_csv(self.train_file)
        self.src_test_df = pd.read_csv(self.test_file)
        
        self.train_df = self.src_train_df.copy()
        self.test_df = self.src_test_df.copy()
        
        # 设置test数据的y值为0，并且区分
        self.test_df[self.y_col] = 0
        self.test_df[self.col_is_train] = 0 
        self.train_df[self.col_is_train] = 1
        # 合并两个数据集，后续所有操作都在all_df中
        self.all_df = pd.concat([self.train_df, self.test_df],ignore_index=True)
        
        self.print_shape()
            
    def print_shape(self):
        print("Train Shape: %d:%d" %(self.train_df.shape))
        print("Test  Shape: %d:%d" %(self.test_df.shape))
        print("All   Shape: %d:%d" %(self.all_df.shape))
        
    def split_X_y(self):
        self.X_train = self.all_df[self.all_df[self.col_is_train] == 1]
        self.y_train = self.X_train[self.y_col]
        self.X_train = self.X_train.drop(self.y_col, axis=1)

        self.X_test = self.all_df[self.all_df[self.col_is_train] == 0]
        self.y_test = self.X_test[self.y_col]
        self.X_test = self.X_test.drop(self.y_col, axis=1)
        
    #print("MSE:",mean_squared_error(y_eval, y_pred))      
    def get_model(self, model_name):
        if model_name == 'RFR':
            from sklearn.ensemble import RandomForestRegressor
            # Fit regression model
            mod = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=self.rand_state)
            res_file = 'f%s_m%s_n%d_d%s.csv' %(self.ft_name, model_name, mod.n_estimators,
                                               mod.max_depth)
            
        if model_name == 'XGB_R':
            params={
                'n_estimators':100,
                'objective': 'reg:linear', #回归问题 reg:linear 分类问题  binary:logistic  排序 rank:pairwise
                'learning_rate': 0.007, # 学习率，默认0.1
                'max_depth':6, # 构建树的深度，越大越容易过拟合.默认6 
                'eval_metric': 'auc', # rmse mae logloss error auc 
                'gamma':0,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
                'lambda':2,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
                'subsample':1, # 随机采样训练样本，默认1
                'min_child_weight':1, # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
                #，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
                #这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。 
                'seed':819
                }
            #from sklearn.ensemble import XGBRegressor
            mod = xgb.XGBRegressor(**params)
            res_file = 'f%s_m%s_n%d_d%s.csv'  %(self.ft_name, model_name, mod.n_estimators,
                                               mod.max_depth)
            
            if model_name == 'XGB_C':
                params={
                'n_estimators':100,
                'objective': 'binary:logistic', #回归问题 reg:linear 分类问题  binary:logistic  排序 rank:pairwise
                'learning_rate': 0.1, # 学习率，默认0.3
                'max_depth':6, # 构建树的深度，越大越容易过拟合.默认6 
                'eval_metric': 'auc', # rmse mae logloss error auc 
                'gamma':0,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
                'lambda':2,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
                'subsample':1, # 随机采样训练样本，默认1
                'min_child_weight':1, # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
                #，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
                #这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。 
                'seed':819
                }
                #from sklearn.ensemble import XGBRegressor
                mod = xgb.XGBClassifier(**params)
                res_file = 'f%s_m%s_n%d_d%s.csv'  %(self.ft_name, model_name, mod.n_estimators,
                                               mod.max_depth)
            
        mod_parm = {}
        mod_parm['name'] = model_name
        mod_parm['mod'] = mod 
        mod_parm['res_file'] = res_file 
        return mod_parm
    
    def add_model(self, model_name):
        mod_parm = self.get_model(model_name)
        self.mod_list.append(mod_parm)
        
    def set_model(self, model_name):
        mod_parm = self.get_model(model_name)
        self.mod_list = []
        self.mod_list.append(mod_parm)
        
    def cv_model(self):
        for mod_parm in self.mod_list:
            scores = cross_val_score(mod_parm['mod'], self.X_train, self.y_train, cv=5, scoring='r2')
            mod_parm['score_mean'] = scores.mean()
            mod_parm['score_std'] = scores.std()
            print("Mod[%s]  R2: %0.6f (+/- %0.6f)" % (mod_parm['name'], mod_parm['score_mean'], mod_parm['score_std']))
            
    def out_res_file(self):
        for mod_parm in self.mod_list:
            mod =  mod_parm['mod'].fit(self.X_train, self.y_train)
            y_pred =  mod.predict(self.X_test)
            output = pd.DataFrame({'id': self.X_test[self.id_col].astype(np.int32), self.y_col: y_pred})
            out_file = mod_parm['res_file']
            print('out_res_file to %s' %out_file)
            output.to_csv(out_file, index=False)

In [69]:
def fs_drop_col(df):
    count_dict = {}
    for col in df.columns:
        #print(col)
        arr_count = len(df[col].unique())
        count_dict.setdefault(arr_count, [])
        count_dict[arr_count].append(col)
     
    #for key,val in count_dict.items():
     #   print key,val
    
    # 计算每个特征中最多变量所占比例
    lc = [ df[col].value_counts().max() for col in df.columns ]
    val_count = pd.Series(lc, index= df.columns )

    # 删除不要的特征
    #drop_col = count_dict[1]
    drop_col = []
    # drop_col.append('X4')
    drop_col = drop_col + val_count.index[val_count>len(df)*0.98].tolist()

    drop_col = list(set(drop_col))
    df.drop(drop_col, axis=1, inplace=True)
    #for col in set(drop_col):
        #print col 
        #df.drop(col, axis=1, inplace=True)

    df.info()

In [70]:
def fs_drop_row(df):
    df.drop(df.index[df.y == df.y.max()], inplace=True)
    
def fs_dummpy(df):
    return pd.get_dummies(df)

In [71]:
ml = MLPipe(train_file='./car/train.csv', test_file='./car/test.csv', 
            y_col='y', id_col='ID', type_c_or_r='r')
print(ml.id_col)
ml.parse_file()

ID


Train Shape: 4209:379
Test  Shape: 4209:379
All   Shape: 8418:379


In [73]:
fs_drop_col(ml.all_df)
#fs_drop_row(ml.all_df)
ml.all_df = fs_dummpy(ml.all_df)
ml.set_ft_name('dummy')
ml.print_shape()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8418 entries, 0 to 8417
Columns: 200 entries, ID to y
dtypes: float64(1), int64(192), object(7)
memory usage: 12.8+ MB
Train Shape: 4209:379
Test  Shape: 4209:379
All   Shape: 8418:400


In [74]:
ml.split_X_y()
#ml.set_model('RFR')
ml.set_model('XGB')
#ml.cv_model()

In [75]:
ml.out_res_file()

out_res_file to fdummy_mXGB_n1000_d2.csv


In [78]:
for i, ind in enumerate(list(range(4))):
    print i,ind

0 0
1 1
2 2
3 3


In [131]:
ml.X_test.shape

(4209, 399)

In [45]:
ml.mod_list[1]['mod'].get_params()

{'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'nthread': -1,
 'objective': 'reg:linear',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 1}

In [61]:
from sklearn.model_selection import GridSearchCV
cv_params = {'max_depth': [2,3,4,5,6], 'min_child_weight': [1,5]}
ind_params = {'learning_rate': 0.01, 'n_estimators': 1000, 
              'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8,
             'reg_alpha':0, 'reg_lambda':1} #regularization => L1 : alpha, L2 : lambda
clt = xgb.XGBRegressor(**ind_params)

optimized_GBM = GridSearchCV(clt, 
                             cv_params,  
                             cv = 5, verbose=10,
                             n_jobs = 2)
optimized_GBM.fit(ml.X_train, ml.y_train)
#optimized_GBM.grid_scores_
optimized_GBM

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   23.1s


[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   59.4s


[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.8min


[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  2.9min


[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  4.6min


[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  6.5min


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  9.4min


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 12.7min


[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed: 14.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'max_depth': [2, 3, 4, 5, 6], 'min_child_weight': [1, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [67]:
optimized_GBM.best_params_
#optimized_GBM.best_estimator_

ml.mod_list[1]['mod']

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [64]:
optimized_GBM.cv_results_

{'mean_fit_time': array([ 22.13940001,  19.81980009,  27.28840003,  26.69040003,
         31.84960003,  33.41399999,  40.33579998,  39.01080003,
         46.83200006,  48.64400001]),
 'mean_score_time': array([ 0.0566    ,  0.04359989,  0.04319997,  0.04139996,  0.03980002,
         0.05380006,  0.05660005,  0.04879999,  0.05159998,  0.05079999]),
 'mean_test_score': array([ 0.49864456,  0.52326292,  0.44414339,  0.48678984,  0.41074115,
         0.46114118,  0.37334339,  0.43046364,  0.27229416,  0.42463209]),
 'mean_train_score': array([ 0.60930927,  0.60262171,  0.65474553,  0.63972517,  0.70270241,
         0.6786426 ,  0.74995286,  0.71821972,  0.79545233,  0.75475748]),
 'param_max_depth': masked_array(data = [2 2 3 3 4 4 5 5 6 6],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'param_min_child_weight': masked_array(data = [1 5 1 5 1 5 1 5 1 5],
              mask = [False False False False False False False False Fal

In [65]:
print("Grid scores on development set:")
print()
means = optimized_GBM.cv_results_['mean_test_score']
stds = optimized_GBM.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, optimized_GBM.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

Grid scores on development set:
()
0.499 (+/-0.182) for {'max_depth': 2, 'min_child_weight': 1}
0.523 (+/-0.194) for {'max_depth': 2, 'min_child_weight': 5}
0.444 (+/-0.236) for {'max_depth': 3, 'min_child_weight': 1}
0.487 (+/-0.280) for {'max_depth': 3, 'min_child_weight': 5}
0.411 (+/-0.298) for {'max_depth': 4, 'min_child_weight': 1}
0.461 (+/-0.348) for {'max_depth': 4, 'min_child_weight': 5}
0.373 (+/-0.402) for {'max_depth': 5, 'min_child_weight': 1}
0.430 (+/-0.444) for {'max_depth': 5, 'min_child_weight': 5}
0.272 (+/-0.770) for {'max_depth': 6, 'min_child_weight': 1}
0.425 (+/-0.451) for {'max_depth': 6, 'min_child_weight': 5}
()


In [132]:
feat_names = ml.X_train.columns.values
importances = ml.mod_list[0]['mod'].feature_importances_
indices = np.argsort(importances)
indices[:20].tolist()
ml.X_train.columns.values[indices[:20].tolist()]

sort_list = sorted(zip(map(lambda x: round(x, 4), importances), feat_names), reverse=True)
sort_list[:20]

[(0.5982, 'X314'),
 (0.1099, 'X315'),
 (0.044, 'X118'),
 (0.0409, 'X263'),
 (0.0398, 'X119'),
 (0.0196, 'X136'),
 (0.0194, 'ID'),
 (0.015, 'X29'),
 (0.014, 'X127'),
 (0.0121, 'X279'),
 (0.0091, 'X189'),
 (0.0068, 'X76'),
 (0.0068, 'X5_ag'),
 (0.0061, 'X232'),
 (0.0047, 'X54'),
 (0.0034, 'X1_f'),
 (0.0032, 'X5_q'),
 (0.0022, 'X6_c'),
 (0.002, 'X6_k'),
 (0.0015, 'X345')]

In [90]:
ml.mod_list

[{'mod': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, n_jobs=1, oob_score=False, random_state=43,
             verbose=0, warm_start=False), 'name': 'RFR'}]

In [47]:
ml.all_df.describe()
ml.all_df.get_dtype_counts()

float64      1
int64      128
object       7
dtype: int64

In [28]:
ml.test_df.describe()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X377,X378,X379,X380,X382,X383,X384,X385,y,col_is_train
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4211.039202,0.019007,0.000238,0.074364,0.06106,0.427893,0.000713,0.002613,0.008791,0.010216,...,0.311951,0.019244,0.011879,0.008078,0.008791,0.000475,0.000713,0.001663,0.0,0.0
std,2423.078926,0.136565,0.015414,0.262394,0.239468,0.494832,0.026691,0.051061,0.093357,0.10057,...,0.463345,0.137399,0.108356,0.089524,0.093357,0.021796,0.026691,0.040752,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4202.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6310.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8416.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
