In [1]:
import os
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

import xlwt
import itertools

  from numpy.core.umath_tests import inner1d


# 数据及参数

In [2]:
# Shared experiment settings.
random_seed = 42       # seed value kept for reproducibility
cv = 5                 # number of cross-validation folds passed to kftrain
score = 'f1_weighted'  # metric identifier; NOTE(review): not referenced by the cells shown here

In [3]:
# Read the CSV that sits in ../data relative to the current working directory.
path = os.getcwd() + '/../data/20122018freshwater_four_feature.csv'
data = pd.read_csv(path, na_values=np.nan)

In [4]:
# Split the frame into the feature matrix and the label column.
X = data.drop(['本周水质'], axis=1).values  # ndarray of feature columns (not a Series)
# Labels are kept as a column vector and shifted by -1
# (presumably the raw grades are 1-based -- confirm against the CSV).
y = data['本周水质'].values.reshape(-1,1) - 1

# 1. Fill missing values with the column median, 2. z-score standardisation.
# preprocessing.Imputer was removed from scikit-learn; prefer SimpleImputer and fall back
# to the legacy class only when running on an old installation that lacks sklearn.impute.
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values=np.nan, strategy="median")
except ImportError:
    imputer = preprocessing.Imputer(missing_values='NaN', strategy="median")

clean_pipeline = Pipeline([('imputer', imputer),
                           ('std_scaler', preprocessing.StandardScaler())])
# NOTE(review): the imputer and scaler are fitted on every row here, i.e. before the
# cross-validation split, so held-out rows contribute to the medians / means / variances.
X = clean_pipeline.fit_transform(X)

In [5]:
# Display the dimensions of the preprocessed feature matrix as a sanity check.
X.shape

(33612, 4)

In [6]:
# Enumerate every 3-column and every 2-column subset of the four feature indices.
comb_3, comb_2 = (
    list(itertools.combinations([0, 1, 2, 3], subset_size))
    for subset_size in (3, 2)
)

# k折交叉验证

In [7]:
# kftrain: run stratified k-fold cross-validation and collect per-class metrics for every fold.
#
# Input:
#     X: feature matrix of all samples
#     y: labels of all samples
#     model: estimator instance providing fit / predict
#     cv: number of cross-validation folds
# Output (ten arrays, each of shape (n_class, cv); held-out-fold results first, then training-fold results):
#     Acc_matrix, Pr_matrix, Rc_matrix, F1_matrix, Support_matrix,
#     _Acc_matrix, _Pr_matrix, _Rc_matrix, _F1_matrix, _Support_matrix
#     Acc: per-class accuracy, Pr: precision, Rc: recall, F1: F1 score,
#     Support: share of the fold's samples that belong to each class


def kftrain(X, y, model, cv):
    """Cross-validate ``model`` with stratified k-fold and collect per-class metrics.

    For every fold the model is re-fitted on the training split and then scored on
    both the held-out split and the training split itself.

    Parameters
    ----------
    X : ndarray
        Feature matrix, indexed by row with the fold indices.
    y : array-like
        One label per row of ``X``. A column vector is accepted and flattened.
    model : estimator
        Object exposing ``fit`` and ``predict``. An estimator whose class is named
        ``'GCForest'`` is fitted through ``fit_transform`` instead of ``fit``.
    cv : int
        Number of folds.

    Returns
    -------
    tuple of ten ndarrays, each of shape (n_class, cv)
        ``(Acc_matrix, Pr_matrix, Rc_matrix, F1_matrix, Support_matrix,
        _Acc_matrix, _Pr_matrix, _Rc_matrix, _F1_matrix, _Support_matrix)``.
        The first five describe the held-out split of each fold, the underscore-prefixed
        five describe the training split. Rows follow the sorted unique labels of ``y``.
        ``Support`` holds each class's share of the samples in that split, so
        ``(F1 * Support).sum(axis=0)`` is the support-weighted F1 per fold.
        Metric values are full precision (not rounded to four decimals).
    """
    model_name = model.__class__.__name__

    # Estimators and metric functions expect 1-D labels; the caller may pass a column vector.
    y = np.asarray(y).ravel()
    labels = np.unique(y)
    n_class = labels.shape[0]

    # Five result matrices for the held-out splits and five for the training splits.
    Acc_matrix, Pr_matrix, Rc_matrix, F1_matrix, Support_matrix = (
        np.zeros((n_class, cv)) for _ in range(5))
    _Acc_matrix, _Pr_matrix, _Rc_matrix, _F1_matrix, _Support_matrix = (
        np.zeros((n_class, cv)) for _ in range(5))

    skf = StratifiedKFold(n_splits=cv)

    for k, (train_index, test_index) in enumerate(skf.split(X, y)):
        K_train_x, K_test_x = X[train_index], X[test_index]
        K_train_y, K_test_y = y[train_index], y[test_index]

        if model_name == 'GCForest':
            model.fit_transform(K_train_x, K_train_y)
        else:
            model.fit(K_train_x, K_train_y)

        K_test_y_pred = model.predict(K_test_x)
        K_train_y_pred = model.predict(K_train_x)

        (Acc_matrix[:, k], Pr_matrix[:, k], Rc_matrix[:, k],
         F1_matrix[:, k], Support_matrix[:, k]) = _fold_metrics(K_test_y, K_test_y_pred, labels)

        (_Acc_matrix[:, k], _Pr_matrix[:, k], _Rc_matrix[:, k],
         _F1_matrix[:, k], _Support_matrix[:, k]) = _fold_metrics(K_train_y, K_train_y_pred, labels)

    return Acc_matrix, Pr_matrix, Rc_matrix, F1_matrix, Support_matrix, _Acc_matrix, _Pr_matrix, _Rc_matrix, _F1_matrix, _Support_matrix


def _fold_metrics(y_true, y_pred, labels):
    """Return per-class (accuracy, precision, recall, F1, support share) for one split."""
    # Imported here so this cell does not depend on an edit to the import cell.
    from sklearn.metrics import precision_recall_fscore_support

    # Passing ``labels`` pins the row order and keeps the shape fixed even if a class
    # is missing from this split.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    class_totals = cm.sum(axis=1).astype(float)
    # Per-class accuracy: correctly predicted samples of a class / all samples of that class.
    # Classes absent from the split get 0 instead of a division-by-zero NaN.
    acc = np.divide(np.diag(cm).astype(float), class_totals,
                    out=np.zeros(len(labels)), where=class_totals > 0)

    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels)

    # Normalise by the real split size so the shares sum to 1.
    support_share = support / float(len(y_true))
    return acc, precision, recall, f1, support_share

In [8]:
# Seed the forest with the notebook-wide seed so repeated runs give the same metrics.
model = RandomForestClassifier(random_state=random_seed)

In [9]:
# Sweep every 3-feature subset: cross-validate the model on the selected columns and
# export the per-class / per-fold metrics to one .xls workbook per subset.

# Make sure the output directory exists before any (slow) training starts.
os.makedirs("../res", exist_ok=True)

# Highlight style for the summary cells. xlwt only renders the foreground colour when a
# fill pattern is set, so SOLID_PATTERN is required for the yellow to show up.
style = xlwt.XFStyle()
pattern = xlwt.Pattern()
pattern.pattern = xlwt.Pattern.SOLID_PATTERN
pattern.pattern_fore_colour = xlwt.Style.colour_map['yellow']
style.pattern = pattern

for cols in comb_3:
    print("RF"+str(cols))
    X_new = X[:, list(cols)]
    workbook = xlwt.Workbook(encoding = 'utf-8')

    model_name = model.__class__.__name__
    print(model_name)
    (Acc_matrix, Pr_matrix, Rc_matrix, F1_matrix, Support_matrix,
     _Acc_matrix, _Pr_matrix, _Rc_matrix, _F1_matrix, _Support_matrix) = kftrain(X_new, y, model, cv)

    # One summary value per fold. Acc/Pr/Rc are unweighted means over the classes;
    # F1 is weighted by each class's share of the fold.
    Acc_cv = np.mean(Acc_matrix, axis=0)
    Pr_cv = np.mean(Pr_matrix, axis=0)
    Rc_cv = np.mean(Rc_matrix, axis=0)
    F1_weighted_cv = np.sum(F1_matrix*Support_matrix, axis=0)
    _Acc_cv = np.mean(_Acc_matrix, axis=0)
    _Pr_cv = np.mean(_Pr_matrix, axis=0)
    _Rc_cv = np.mean(_Rc_matrix, axis=0)
    _F1_weighted_cv = np.sum(_F1_matrix*_Support_matrix, axis=0)

    worksheet = workbook.add_sheet(model_name, cell_overwrite_ok=True)

    # Take the class count from the data instead of assuming six classes.
    n_class = Acc_matrix.shape[0]

    # Column layout per table:
    #   train folds | train mean | train SD | test folds | test mean | test SD
    train_col, test_col = 0, cv + 2

    # (title stem, train matrix, test matrix, train per-fold summary, test per-fold summary)
    tables = [
        ("Acc", _Acc_matrix, Acc_matrix, _Acc_cv, Acc_cv),
        ("Pr", _Pr_matrix, Pr_matrix, _Pr_cv, Pr_cv),
        ("Rc", _Rc_matrix, Rc_matrix, _Rc_cv, Rc_cv),
        ("F1", _F1_matrix, F1_matrix, _F1_weighted_cv, F1_weighted_cv),
    ]

    row = 0
    for metric, train_mat, test_mat, train_cv, test_cv in tables:
        worksheet.write(row, train_col, "train_" + metric + "_matrix")
        worksheet.write(row, test_col, "test_" + metric + "_matrix")

        for first_col, mat, fold_summary in ((train_col, train_mat, train_cv),
                                             (test_col, test_mat, test_cv)):
            # One row per class: the fold values, then mean and SD across folds.
            for i in range(n_class):
                for j in range(cv):
                    worksheet.write(row + 1 + i, first_col + j, mat[i, j])
                worksheet.write(row + 1 + i, first_col + cv, np.mean(mat[i]))
                worksheet.write(row + 1 + i, first_col + cv + 1, np.std(mat[i]))

            # Summary row: per-fold summary values, then their highlighted mean and SD.
            summary_row = row + 1 + n_class
            for j in range(cv):
                worksheet.write(summary_row, first_col + j, fold_summary[j])
            worksheet.write(summary_row, first_col + cv, np.mean(fold_summary), style=style)
            worksheet.write(summary_row, first_col + cv + 1, np.std(fold_summary), style=style)

        # Title row + class rows + summary row + two blank spacer rows.
        row += n_class + 4

    workbook.save("../res/RF"+str(cols)+".xls")

RF(0, 1, 2)
RandomForestClassifier




RF(0, 1, 3)
RandomForestClassifier




RF(0, 2, 3)
RandomForestClassifier




RF(1, 2, 3)
RandomForestClassifier




In [10]:
# Fresh estimator for the two-feature sweep, seeded so repeated runs give the same metrics.
model = RandomForestClassifier(random_state=random_seed)

In [11]:
# Sweep every 2-feature subset: cross-validate the model on the selected columns and
# export the per-class / per-fold metrics to one .xls workbook per subset.

# Make sure the output directory exists before any (slow) training starts.
os.makedirs("../res", exist_ok=True)

# Highlight style for the summary cells. xlwt only renders the foreground colour when a
# fill pattern is set, so SOLID_PATTERN is required for the yellow to show up.
style = xlwt.XFStyle()
pattern = xlwt.Pattern()
pattern.pattern = xlwt.Pattern.SOLID_PATTERN
pattern.pattern_fore_colour = xlwt.Style.colour_map['yellow']
style.pattern = pattern

for cols in comb_2:
    print("RF"+str(cols))
    X_new = X[:, list(cols)]
    workbook = xlwt.Workbook(encoding = 'utf-8')

    model_name = model.__class__.__name__
    print(model_name)
    (Acc_matrix, Pr_matrix, Rc_matrix, F1_matrix, Support_matrix,
     _Acc_matrix, _Pr_matrix, _Rc_matrix, _F1_matrix, _Support_matrix) = kftrain(X_new, y, model, cv)

    # One summary value per fold. Acc/Pr/Rc are unweighted means over the classes;
    # F1 is weighted by each class's share of the fold.
    Acc_cv = np.mean(Acc_matrix, axis=0)
    Pr_cv = np.mean(Pr_matrix, axis=0)
    Rc_cv = np.mean(Rc_matrix, axis=0)
    F1_weighted_cv = np.sum(F1_matrix*Support_matrix, axis=0)
    _Acc_cv = np.mean(_Acc_matrix, axis=0)
    _Pr_cv = np.mean(_Pr_matrix, axis=0)
    _Rc_cv = np.mean(_Rc_matrix, axis=0)
    _F1_weighted_cv = np.sum(_F1_matrix*_Support_matrix, axis=0)

    worksheet = workbook.add_sheet(model_name, cell_overwrite_ok=True)

    # Take the class count from the data instead of assuming six classes.
    n_class = Acc_matrix.shape[0]

    # Column layout per table:
    #   train folds | train mean | train SD | test folds | test mean | test SD
    train_col, test_col = 0, cv + 2

    # (title stem, train matrix, test matrix, train per-fold summary, test per-fold summary)
    tables = [
        ("Acc", _Acc_matrix, Acc_matrix, _Acc_cv, Acc_cv),
        ("Pr", _Pr_matrix, Pr_matrix, _Pr_cv, Pr_cv),
        ("Rc", _Rc_matrix, Rc_matrix, _Rc_cv, Rc_cv),
        ("F1", _F1_matrix, F1_matrix, _F1_weighted_cv, F1_weighted_cv),
    ]

    row = 0
    for metric, train_mat, test_mat, train_cv, test_cv in tables:
        worksheet.write(row, train_col, "train_" + metric + "_matrix")
        worksheet.write(row, test_col, "test_" + metric + "_matrix")

        for first_col, mat, fold_summary in ((train_col, train_mat, train_cv),
                                             (test_col, test_mat, test_cv)):
            # One row per class: the fold values, then mean and SD across folds.
            for i in range(n_class):
                for j in range(cv):
                    worksheet.write(row + 1 + i, first_col + j, mat[i, j])
                worksheet.write(row + 1 + i, first_col + cv, np.mean(mat[i]))
                worksheet.write(row + 1 + i, first_col + cv + 1, np.std(mat[i]))

            # Summary row: per-fold summary values, then their highlighted mean and SD.
            summary_row = row + 1 + n_class
            for j in range(cv):
                worksheet.write(summary_row, first_col + j, fold_summary[j])
            worksheet.write(summary_row, first_col + cv, np.mean(fold_summary), style=style)
            worksheet.write(summary_row, first_col + cv + 1, np.std(fold_summary), style=style)

        # Title row + class rows + summary row + two blank spacer rows.
        row += n_class + 4

    workbook.save("../res/RF"+str(cols)+".xls")

RF(0, 1)
RandomForestClassifier




RF(0, 2)
RandomForestClassifier




RF(0, 3)
RandomForestClassifier




RF(1, 2)
RandomForestClassifier




RF(1, 3)
RandomForestClassifier




RF(2, 3)
RandomForestClassifier


