In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import skimage as ski
import skimage.io as skio
import skimage.transform as sktransform
from skimage.filters import threshold_otsu, threshold_sauvola
from skimage.color import rgb2gray
from IPython.core.display import Image, display
from skimage import feature

In [2]:
from sklearn import svm, naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler

> **scikit-learn** 是一个功能强大的第三方 Python 机器学习库，涵盖了从数据预处理到模型训练的各个环节。在实战中使用 scikit-learn 可以极大地节省编写代码的时间、减少代码量，使我们有更多精力去分析数据分布、调整模型和修改超参数。sklearn 同时提供了监督学习和无监督学习的方法，一般来说监督学习使用得更多。[关于scikit-learn的介绍](https://zhuanlan.zhihu.com/p/33420189 "知乎")

In [3]:
from sklearn.preprocessing import StandardScaler

> **导入预处理模块中的 StandardScaler（后文 PCA 降维之后实际调用的是同一模块中的 scale 函数），用于训练数据的标准化和归一化处理**  
>  在机器学习领域中，不同评价指标（即特征向量中的不同特征就是所述的不同评价指标）往往具有不同的量纲和量纲单位，这样的情况会影响到数据分析的结果，为了消除指标之间的量纲影响，需要进行数据标准化处理，以解决数据指标之间的可比性。原始数据经过数据标准化处理后，各指标处于同一数量级，适合进行综合对比评价。其中，最典型的就是数据的归一化处理。[引用博客链接](https://blog.csdn.net/zenghaitao0128/article/details/78361038)，具体关于标准化和归一化请参考[特征工程中的「归一化」有什么作用？ - 知乎](https://www.zhihu.com/question/20455227)

### 批量提取 GLCM 特征（HOG、LBP 特征提取目前已停用）

In [4]:
def get_features_list(path):
    """
    Extract the GLCM texture descriptor of a single image.

    The image is read from disk, rescaled by a factor of 0.01, converted to
    8-bit grayscale, and a gray-level co-occurrence matrix (GLCM) is computed
    at pixel distance 3 along four directions (0, 45, 90 and 135 degrees)
    with 256 gray levels.

    Args:
        path : string : path of the image to extract features from
    Return:
        feature_list : dict : ``{'glcm': array}`` where the array is the
            flattened co-occurrence matrix. HOG and LBP extraction is
            currently disabled, so ``'glcm'`` is the only key.
    """
    origin_image = skio.imread(path)
    origin_image = sktransform.rescale(origin_image, 0.01)
    # The co-occurrence matrix is built with levels=256, so the float
    # grayscale image is converted to unsigned 8-bit first.
    gray_image = ski.img_as_ubyte(rgb2gray(origin_image))

    # scikit-image renamed `greycomatrix` to `graycomatrix`; prefer the new
    # spelling when it exists and fall back to the old one otherwise.
    compute_glcm = getattr(feature, 'graycomatrix', None) or feature.greycomatrix

    # Horizontal, 45 degree, vertical and 135 degree offsets.
    angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]
    feature_glcm = compute_glcm(gray_image, [3], angles, levels=256)

    return {'glcm': feature_glcm.flatten()}

### 找出指定文件夹下的所有png图片

In [5]:
def list_files(root_path, extensions=('.png',)):
    """
    List the image files located directly inside a folder.

    Args:
        root_path : string : path of the folder that contains the images
        extensions : string or tuple of string : accepted file-name suffixes,
            compared case-insensitively. Defaults to PNG files only.
    Return:
        file_list : list : names of the matching entries (bare names, not
            full paths; join them with ``root_path`` to open them), sorted
            so the result does not depend on the order ``os.listdir``
            happens to return.
    """
    # Accept a single suffix given as a plain string.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)
    return sorted(
        name for name in os.listdir(root_path)
        if name.lower().endswith(suffixes)
    )

### 数据预处理
> 数据降维
> 数据标准化，归一化

In [6]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
def apply_pca(features_list_dict, components_num=2):
    """
    Reduce the GLCM descriptors with PCA and standardise the result.

    Every row of ``features_list_dict['glcm']`` is treated as a descriptor
    followed by one trailing label value. A PCA with ``components_num``
    components is fitted on the descriptor columns, the descriptors are
    multiplied by the component matrix, the projected columns are passed
    through ``sklearn.preprocessing.scale`` and the label column is appended
    again.

    Args:
        features_list_dict : dict : must hold a ``'glcm'`` entry convertible
            to a 2-D float array whose last column is the label
        components_num : int : number of principal components to keep
    Return:
        result : dict : ``{'glcm': array}`` with ``components_num`` scaled
            feature columns followed by the label column
    """
    labelled = np.array(features_list_dict['glcm']).astype(float)
    descriptors = labelled[:, 0:-1]
    labels = labelled[:, -1]

    fitted = PCA(n_components=components_num, copy=True, whiten=False).fit(descriptors)

    # Component (loading) matrix, one row per retained component.
    projection = np.array(
        [tuple(fitted.components_[k]) for k in range(components_num)]
    )

    # Plain matrix product without mean-centering; the scale() call below
    # removes the per-column mean afterwards.
    reduced = np.dot(descriptors, projection.T)

    return {'glcm': np.column_stack([scale(reduced), labels])}

### 构造训练集和测试集
> 将带标签的各特征矩阵 打乱顺序之后进行划分   
> 使用sklearn自带的数据集划分函数进行划分

In [16]:
from sklearn.model_selection import train_test_split


def split_train_test_dataset(features_list_dict):
    """
    Split the labelled GLCM matrix into training and test subsets.

    The last column of ``features_list_dict['glcm']`` is used as the label
    and all preceding columns as features. 30% of the rows go to the test
    subset; the split is seeded with ``random_state=0``.

    Args:
        features_list_dict : dict : holds a 2-D array under ``'glcm'``
    Return:
        x_train, x_test, y_train, y_test : feature and label arrays of the
            training and test subsets, in that order
    """
    labelled = features_list_dict['glcm']
    # column_stack is kept so further descriptor groups can be appended
    # side by side here.
    samples = np.column_stack([labelled[:, 0:-1]])
    targets = labelled[:, -1]
    train_x, test_x, train_y, test_y = train_test_split(
        samples, targets, test_size=0.3, random_state=0
    )
    return train_x, test_x, train_y, test_y

### 选择分类器进行分类
> - SVM
> - LogisticRegression（已导入，但当前未加入下方的 clfs 字典）
> - RandomForestClassifier
> - AdaBoostClassifier

In [8]:
    # Registry of the classifiers to compare, keyed by a short display name.
    # The two ensemble models are seeded so that repeated runs of the notebook
    # report the same scores (the train/test split uses random_state=0 too).
    clfs = {'svm': svm.SVC(gamma='scale'),
            'random_forest': RandomForestClassifier(n_estimators=50, random_state=0),
            'adaboost': AdaBoostClassifier(n_estimators=50, random_state=0),
            }

In [9]:
def classify(x_train, y_train, x_test, y_test):
    """
    Fit each classifier in the module-level ``clfs`` dict and print its score.

    Args:
        x_train, y_train : training features and labels
        x_test, y_test : held-out features and labels used for scoring
    Return:
        None; one line per classifier is printed with the value returned by
        the classifier's ``score`` method on the test data.
    """
    for name, model in clfs.items():
        model.fit(x_train, y_train.ravel())
        test_score = model.score(x_test, y_test.ravel())
        print('the classifier is\t :', name, '\t the score is :', test_score)


### 使用交叉验证的方式来训练

In [10]:
from sklearn.model_selection import cross_validate


def by_cross_validate(x_train, x_test, y_train, y_test):
    """
    Evaluate each classifier in the module-level ``clfs`` dict with 6-fold
    cross-validation.

    The training and test subsets are pooled back into a single data set
    before cross-validating, so the earlier split only affects row order.

    Args:
        x_train, x_test : feature arrays with the same number of columns
        y_train, y_test : label arrays matching ``x_train`` / ``x_test``
    Return:
        None; the per-fold accuracy of every classifier is printed.
    """
    # np.vstack replaces the deprecated alias np.row_stack.
    features_data = np.vstack([x_train, x_test])
    # Flatten before concatenating so 1-D and column-shaped labels both work.
    label_data = np.concatenate([np.ravel(y_train), np.ravel(y_test)])
    scoring = ['accuracy']
    for clf_key, clf in clfs.items():
        scores = cross_validate(clf, features_data, label_data, cv=6, scoring=scoring)
        print('classifier:\t', clf_key)
        print('by_cross_validate test_accuracy score  :', scores['test_accuracy'])

In [11]:
from tqdm import *

### 主程序

In [13]:
# Root folder of the data set: one sub-folder per class, named after the
# numeric class label.
path = r'D:\project\image_classification_demo\swedish-dataset-square-256'
features_list_dict = {}
features_list_dict['glcm'] = []
# Placeholders only: HOG and LBP extraction is disabled, so these stay empty.
features_list_dict['hog'] = []
features_list_dict['lbp'] = []
# Keep directories only (a stray file would make list_files fail) and sort
# them so the sample order is the same on every run.
dirs = sorted(d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d)))
print(dirs)
for d in tqdm(dirs):
    class_dir = os.path.join(path, d)
    file_list = list_files(class_dir)
    if not file_list:
        # Nothing to extract; also avoids parsing the name of an unrelated folder.
        continue
    # Append the label as a number. Appending the folder name as a string
    # would turn the whole GLCM vector into a string array, which costs far
    # more memory and has to be parsed back to float later.
    label = float(d)
    for file in file_list:
        file_path = os.path.join(class_dir, file)
        result = get_features_list(file_path)
        features_list_dict['glcm'].append(np.hstack([result['glcm'], label]))
print('ok')


['0', '1', '10', '11', '12', '13', '14', '2', '3', '4', '5', '6', '7', '8', '9']


100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [03:38<00:00, 14.54s/it]


ok


KeyError: 'hog'

In [None]:
# Reduce the GLCM descriptors to 10 principal components; the label column
# is carried along as the last column of the result.
pca_result = apply_pca(features_list_dict, components_num=10)
# Seeded split: 70% of the rows for training, 30% for testing.
x_train, x_test, y_train, y_test = split_train_test_dataset(pca_result)
# 6-fold cross-validation over the pooled (train + test) samples.
by_cross_validate(x_train, x_test, y_train, y_test)
# Hold-out evaluation: fit on the training rows, score on the test rows.
classify(x_train, y_train, x_test, y_test)