1. 编写一个类，支持读取文件，自动将train和test的数据集合并在一起，并且通过flag区分，支持指定label来自动生成y
2. 可以定义几种常用的处理方式来处理特征，例如标准化和onehotcode等
3. 自动分析变量和画图、慢慢来
4. 能够设定评价指标后，自动进行CV测试
5. 能够自动搜索最佳超参数，然后选取最好的那个
6. 支持导出结果文件
7. 支持多模型比较和融合

In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection  import cross_val_score

In [57]:
class MLPipe:
    # type_c_or_r: c  表示分类模型，r 表示回归模型
    def __init__(self, train_file, test_file, y_col, id_col, type_c_or_r):
        self.train_file = train_file
        self.test_file = test_file
        self.y_col = y_col
        self.id_col = id_col
        self.type_c_or_r = type_c_or_r
        self.col_is_train = 'col_is_train'
        self.mod_list = [] 
        
    # 读取文件，然后合并两个文件
    def parse_file(self):
        self.src_train_df = pd.read_csv(self.train_file)
        self.src_test_df = pd.read_csv(self.test_file)
        
        self.train_df = self.src_train_df.copy()
        self.test_df = self.src_test_df.copy()
        
        # 设置test数据的y值为0，并且区分
        self.test_df[self.y_col] = 0
        self.test_df[self.col_is_train] = 0 
        self.train_df[self.col_is_train] = 1
        # 合并两个数据集，后续所有操作都在all_df中
        self.all_df = pd.concat([self.train_df, self.test_df])
        
        self.print_shape()
            
    def print_shape(self):
        print("Train Shape: %d:%d" %(self.train_df.shape))
        print("Test  Shape: %d:%d" %(self.test_df.shape))
        print("All   Shape: %d:%d" %(self.all_df.shape))
        
    def split_X_y(self):
        self.X_train = self.all_df[self.all_df[self.col_is_train] == 1]
        self.y_train = self.X_train[self.y_col]
        self.X_train = self.X_train.drop(self.y_col, axis=1)

        self.X_test = self.all_df[self.all_df[self.col_is_train] == 0]
        self.y_test = self.X_test[self.y_col]
        self.X_test = self.X_test.drop(self.y_col, axis=1)
        
    #print("MSE:",mean_squared_error(y_eval, y_pred))      
    def add_model(self, model_name):
        if model_name == 'RFR':
            from sklearn.ensemble import RandomForestRegressor
            # Fit regression model
            mod = RandomForestRegressor(n_estimators=50, max_depth=5)
        
        mod_parm = {}
        mod_parm['name'] = model_name
        mod_parm['mod'] = mod 
        self.mod_list.append(mod_parm)
        
    def cv_model(self):
        for mod_parm in self.mod_list:
            scores = cross_val_score(mod_parm['mod'], self.X_train, self.y_train, cv=5, scoring='r2')
            mod_parm['score_mean'] = scores.mean()
            mod_parm['score_std'] = scores.std()
            print("Mod[%s]  R2: %0.2f (+/- %0.2f)" % (mod_parm['name'], mod_parm['score_mean'], mod_parm['score_std']))

In [38]:
def fs_drop_col(df):
    count_dict = {}
    for col in df.columns:
        #print(col)
        arr_count = len(df[col].unique())
        count_dict.setdefault(arr_count, [])
        count_dict[arr_count].append(col)
     
    #for key,val in count_dict.items():
     #   print key,val
    
    # 计算每个特征中最多变量所占比例
    lc = [ df[col].value_counts().max() for col in df.columns ]
    val_count = pd.Series(lc, index= df.columns )

    # 删除不要的特征
    #drop_col = count_dict[1]
    drop_col = []
    # drop_col.append('X4')
    drop_col = drop_col + val_count.index[val_count>len(df)*0.95].tolist()

    drop_col = list(set(drop_col))
    df.drop(drop_col, axis=1, inplace=True)
    #for col in set(drop_col):
        #print col 
        #df.drop(col, axis=1, inplace=True)

    df.info()

In [48]:
def fs_drop_row(df):
    df.drop(df.index[df.y == df.y.max()], inplace=True)
    
def fs_dummpy(df):
    return pd.get_dummies(df)

In [59]:
ml = MLPipe(train_file='./car/train.csv', test_file='./car/test.csv', 
            y_col='y', id_col='ID', type_c_or_r='r')
print(ml.id_col)
ml.parse_file()

ID


Train Shape: 4209:379
Test  Shape: 4209:379
All   Shape: 8418:379


In [60]:
fs_drop_col(ml.all_df)
fs_drop_row(ml.all_df)
ml.all_df = fs_dummpy(ml.all_df)
ml.print_shape()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8418 entries, 0 to 4208
Columns: 136 entries, ID to y
dtypes: float64(1), int64(128), object(7)
memory usage: 8.8+ MB
Train Shape: 4209:379
Test  Shape: 4209:379
All   Shape: 8416:336


In [62]:
ml.split_X_y()
ml.add_model('RFR')
ml.cv_model()

Mod[RFR]  R2: 0.51 (+/- 0.08)


In [47]:
ml.all_df.describe()
ml.all_df.get_dtype_counts()

float64      1
int64      128
object       7
dtype: int64

In [28]:
ml.test_df.describe()

Unnamed: 0,ID,X10,X11,X12,X13,X14,X15,X16,X17,X18,...,X377,X378,X379,X380,X382,X383,X384,X385,y,col_is_train
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4211.039202,0.019007,0.000238,0.074364,0.06106,0.427893,0.000713,0.002613,0.008791,0.010216,...,0.311951,0.019244,0.011879,0.008078,0.008791,0.000475,0.000713,0.001663,0.0,0.0
std,2423.078926,0.136565,0.015414,0.262394,0.239468,0.494832,0.026691,0.051061,0.093357,0.10057,...,0.463345,0.137399,0.108356,0.089524,0.093357,0.021796,0.026691,0.040752,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4202.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6310.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8416.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
