# 前言
本质上python和sklearn对待模型的原理是创建一个xx器（预测器、预处理器……），再使用内置的方法去调用处理里面的东西。  
为保证结果可复现，所有涉及随机性的代码都应设置随机种子，对应参数为 `random_state = ...`；为节省篇幅，下文不再逐一写出。

# 预处理
预处理的基本步骤分为三步：第一步创建一个预处理器；第二步使用fit方法让预处理器拟合/适配数据（计算数据特有的参数，例如均值、方差等）；第三步使用transform方法转换数据。第二步和第三步也可以用fit_transform方法合并完成。基本语法如下：  
```
tool = function()
tool.fit(df)  
df = tool.transform(df)
```
第二步和第三步也可以改写成
```
df = tool.fit_transform(df)
```


In [None]:
# 一些假数据，防报错
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scikitplot as skplt
# Placeholder data so that the snippets below can execute: three feature columns plus one target column.
# An empty DataFrame makes `df.iloc[:, 3]` raise IndexError, so a small seeded random frame is used instead.
df = pd.DataFrame(np.random.RandomState(0).rand(100, 4), columns = ['x1', 'x2', 'x3', 'y'])
x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values  # features = every column but the last, target = last column

In [None]:
# Missing-value handling
from sklearn.impute import SimpleImputer  # `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22
# strategy can be 'mean', 'median', 'most_frequent' or 'constant'; 'mean' is only an example choice.
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
df = imputer.fit_transform(df)  # fit and impute in one step; the result is a NumPy array, not a DataFrame


# Re-encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder  # target labels / dummy variables / ordered discrete features
# LabelEncoder only accepts a 1-D array (it is meant for the target), so the 2-D feature table
# is integer-encoded with OrdinalEncoder instead. The variable name is kept for compatibility.
labelencoder = OrdinalEncoder()
df = labelencoder.fit_transform(df)

# `OneHotEncoder(categorical_features = ...)` was removed in scikit-learn 0.22;
# selecting the columns to encode is now the job of ColumnTransformer.
onehotencoder = ColumnTransformer(
    transformers = [('onehot', OneHotEncoder(), [0])],  # one-hot encode column 0
    remainder = 'passthrough',                          # keep the remaining columns unchanged
    sparse_threshold = 0,                               # always return a dense array (replaces `.toarray()`)
)
df = onehotencoder.fit_transform(df)


# Standardization / normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer
Scaler = StandardScaler()    # scaler fitted on the features
y_scaler = StandardScaler()  # separate scaler, so fitting on y does not overwrite the statistics learned from x
x = Scaler.fit_transform(x)
# Scalers require 2-D input: reshape y to a single column, then flatten the result back to 1-D.
y = y_scaler.fit_transform(np.reshape(y, (-1, 1))).ravel()


# Data splitting
train_rate = 0.8  # share of the whole dataset used for training
val_rate = 0.1    # share of the whole dataset used for validation; must be smaller than 1 - train_rate (the rest is the test set)

## Shuffle the data (optional: train_test_split already shuffles by default). TODO: test with real data
shuffle_index = np.random.RandomState(0).permutation(len(y.ravel()))  # seeded so the order is reproducible
x = x[shuffle_index]
y = y[shuffle_index]

## Split into training / test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0) # random_state drives the sampling and must stay the same to reproduce results. shuffle, stratify, etc. control the sampling method.

## Split into training / validation / test sets (this overrides the two-way split above)
X_train, X_val_test, y_train, y_val_test = train_test_split(x, y, train_size = train_rate, random_state = 0) # carve out the training set
# val_rate is expressed relative to the whole dataset, so convert it to a share of the hold-out part.
# The previous val_rate of 0.5 produced 2.5 here, which train_test_split rejects.
val_share = val_rate / (1 - train_rate)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, train_size = val_share, random_state = 0) # split the hold-out part into validation and test sets

# 有监督方法：传统统计方法

In [None]:
# OLS regression. The OLS regressor does no preprocessing of its own, so remember to preprocess the data first, standardization in particular.
from sklearn.linear_model import LinearRegression # OLS regression (ordinary least squares fitting only)
lm = LinearRegression()
lm.fit(X = X_train, y = y_train) # Fit the model; lm is now a fitted least-squares linear regression.
y_pred = lm.predict(X = X_val) # Predicted y for the validation set
plt.scatter(X_train, y_train, color = 'red') # Scatter of the training data. NOTE(review): presumably assumes a single feature column - confirm
plt.plot(X_train, lm.predict(X_train), color = 'blue') # Fitted line
plt.title('Linear Regression')
plt.show()

# 模型进入方式的参数：机器学习用语
## All-in(Enter), Backward Elimination, Forward Selection, Bidirectional Elimination（本质都是stepwise）, Score Comparison（信息量比较，例如AIC/BIC，穷尽所有可能的组合情况）
## sklearn不提供stepwise回归；如有需要，可用statsmodels拟合OLS（statsmodels.api.OLS 或 statsmodels.formula.api.ols），再依据p值或AIC/BIC自行实现逐步筛选

# Polynomial regression with a single predictor
from sklearn.preprocessing import PolynomialFeatures
poly_reg_pre = PolynomialFeatures(degree = 2) # degree sets the highest polynomial degree that is generated
X_poly = poly_reg_pre.fit_transform(X_train) # X itself has to be expanded, so a preprocessing class is used: the single column becomes several columns (multiple variables)
poly_reg = LinearRegression().fit(X_poly, y_train)

# Logistic regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X = X_train, y = y_train)  # NOTE(review): presumably expects y_train to hold class labels rather than a continuous target - confirm
y_pred = lr.predict(X = X_val)    # predicted class labels for the validation set

# 具有惩罚项的回归
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# 其他GLM回归
from sklearn.linear_model import PoissonRegressor, GammaRegressor

# 模型指标

In [None]:
# 回归任务：R^2等
## https://blog.csdn.net/qq_34160248/article/details/127740194
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
def adj_r_squared(x_test, y_test, y_predict):
    """Return the adjusted R^2 of predictions against observed values.

    Args:
        x_test: 2-D array-like of shape (n_samples, n_features). Only its
            column count is used, as the number of predictors p.
        y_test: Array-like of observed target values.
        y_predict: Array-like of predicted values, same length as y_test.

    Returns:
        float: 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is len(y_test).
    """
    # Converting up front lets plain lists be passed as well as arrays.
    observed = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_predict, dtype=float)
    n_samples = len(observed)
    n_features = np.asarray(x_test).shape[1]

    ss_residual = np.sum((observed - predicted) ** 2)
    ss_total = np.sum((observed - np.mean(observed)) ** 2)
    r_squared = 1 - ss_residual / ss_total
    # Penalise R^2 for the number of predictors relative to the sample size.
    return float(1 - (1 - r_squared) * (n_samples - 1) / (n_samples - n_features - 1))

r2 = r2_score(y_true = y_val, y_pred = y_pred)                                 # Plain R^2 is generally not used. A linear regressor's R^2 is also available through its .score method
rmse = sqrt(np.sum((y_val - y_pred) ** 2) / len(y_val))                        # Root mean squared error computed by hand (author's note: RMSE is affected by sample size)
rmse = sqrt(mean_squared_error(y_true = y_val, y_pred = y_pred))               # The same RMSE via sklearn's mean_squared_error; overwrites the value above
rse = np.sum((y_val - y_pred) ** 2) / np.sum((y_val - np.mean(y_val)) ** 2)    # Relative squared error
mae = np.sum(np.abs(y_val - y_pred) / len(y_val))                              # Mean absolute error computed by hand
mae = mean_absolute_error(y_true = y_val, y_pred = y_pred)                     # The same MAE via sklearn; overwrites the value above
adj_r2 = adj_r_squared(X_val, y_val, y_pred)                                   # Prefer the adjusted R^2 where possible


# Regression: information criteria
## Author's note: sklearn does not provide AIC/BIC/AICc/Mallows' Cp for non-linear models. Fit the linear model through statsmodels.formula.api instead and read the fitted model's .aic/.bic. Not expanded here.
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
lasso = LassoLarsIC(criterion = 'aic') # or 'bic'
lasso.fit(X = X_train, y = y_train)
lasso.alpha_  # regularisation strength selected by the criterion (shown only as notebook cell output)


# Classification: confusion matrix and related metrics (including type I/II errors). Note that classifier performance is affected by class balance. One-class data needs other methods.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score, roc_curve, multilabel_confusion_matrix
cm = confusion_matrix(y_true = y_val, y_pred = y_pred)         # Two-class confusion matrix; use multilabel_confusion_matrix for more classes
# For binary classification the matrix is laid out as follows (the author notes this differs from the usual textbook layout):
#               predicted
#               neg    pos
# actual  neg   tn     fp
#         pos   fn     tp
accuracy = accuracy_score(y_true = y_val, y_pred = y_pred)     # Accuracy
precision = precision_score(y_true = y_val, y_pred = y_pred)   # Precision
recall = recall_score(y_true = y_val, y_pred = y_pred)         # Recall
f1 = f1_score(y_true = y_val, y_pred = y_pred)                 # F1 score
# The indexing below only applies to binary data. It does not apply to one-class or multi-class settings (fully imbalanced data / outlier detection / multi-class tasks).
true_negative = cm[0][0]                                # Number of true negatives
false_negative = cm[1][0]                               # Number of false negatives
true_positive = cm[1][1]                                # Number of true positives
false_positive = cm[0][1]                               # Number of false positives
fpr = false_positive / (false_positive + true_negative) # False positive rate, alpha, type I error
fnr = false_negative / (false_negative + true_positive) # False negative rate, beta, type II error

# 可视化：分类任务

In [None]:
# 分类任务：绘图：混淆矩阵，ROC曲线，auc，CAP（累计准确）曲线，增益曲线，提升曲线
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, precision_recall_curve, PrecisionRecallDisplay
from sklearn.inspection import DecisionBoundaryDisplay
from scikitplot.metrics import plot_confusion_matrix, plot_roc, plot_precision_recall, plot_cumulative_gain, plot_lift_curve

# Confusion matrix
ConfusionMatrixDisplay(confusion_matrix = cm).plot()      # the constructor only stores a precomputed matrix; .plot() draws it
ConfusionMatrixDisplay.from_predictions(y_val, y_pred)    # build and draw straight from labels (the constructor does not take y_true / y_pred)
plot_confusion_matrix(y_true = y_val, y_pred = y_pred)    # scikit-plot version

# Receiver operating characteristic curve and area under it (ROC + AUC)
## Author's note: trapezoidal AUC / ROC AUC is not suitable for imbalanced data; use the precision-recall curve in that case.
y_pred_p1 = lr.predict_proba(X_val)[:,1]                                  # positive-class probability: ROC needs scores, not hard predictions
y_pred_p2 = lr.predict_proba(X_val)                                       # full probability matrix (one column per class), as expected by scikit-plot
fpr, tpr, threshold = roc_curve(y_true = y_val, y_score = y_pred_p1)      # ROC curve; the keyword is y_score (there is no y_pred)
Auc = auc(x = fpr, y = tpr)                                               # trapezoidal AUC
roc_auc = roc_auc_score(y_true = y_val, y_score = y_pred_p1)              # ROC AUC straight from the scores; uses y_pred_p1
## ROC plots from sklearn
## Other options: name (curve label), color, etc. The drawing is done by matplotlib's plot function, so extra keyword arguments are passed on to it.
RocCurveDisplay.from_estimator(lr, X_test, y_test)
RocCurveDisplay.from_predictions(y_val, y_pred_p1)                        # scores passed positionally, so it does not depend on the keyword name
## ROC plot from scikit-plot
plot_roc(y_true = y_val,  y_probas = y_pred_p2)                           # uses y_pred_p2

# Precision-recall curve
# Scores are passed positionally because the `probas_pred` keyword was deprecated in favour of `y_score`
# in newer scikit-learn releases; probabilities are used instead of hard labels so the curve has more than one threshold.
prec, recall, threshold = precision_recall_curve(y_val, y_pred_p1)
PrecisionRecallDisplay(precision = prec, recall = recall).plot()
plot_precision_recall(y_true = y_val, y_probas = y_pred_p2)               # uses y_pred_p2

# Cumulative Accuracy Profile (CAP) curve. Untested; only valid for 0/1-coded binary targets.
def CAP(y_true = y_val, y_pred = y_pred):
    """Plot the CAP curve of a binary classifier next to the random and perfect models.

    Args:
        y_true: Sequence of 0/1 ground-truth labels.
        y_pred: Sequence of predictions or scores; higher means "more likely positive".

    Returns:
        The return value of plt.show().
    """
    # Rank by score only. Without the key, ties would be broken by the true label,
    # which puts positives first and draws the curve too optimistically.
    ranked = sorted(zip(y_pred, y_true), key = lambda pair: pair[0], reverse = True)
    hits = [label for _, label in ranked]
    n_total = len(y_true)
    n_positive = np.sum(y_true)
    x = np.arange(0, n_total + 1)
    y = np.append([0], np.cumsum(hits))  # cumulative positives captured after each ranked sample
    plt.figure(figsize = (20, 12))
    plt.plot([0, n_total], [0, n_positive], c = 'b', linestyle = '--', label = 'Random Model')
    # NOTE(review): the legend label is hard-coded regardless of which model produced y_pred.
    plt.plot(x, y, c='r', label = 'Random Forest Classifier')
    plt.plot([0, n_positive, n_total], [0, n_positive, n_positive],
            c = 'grey', linewidth = 2, label = 'Perfect Model')
    plt.legend()
    return plt.show()
CAP(y_true = y_val, y_pred = y_pred)

# Gain curve: not for model evaluation, only for business decisions
plot_cumulative_gain(y_true = y_val, y_probas = y_pred_p2)                # uses y_pred_p2

# Lift curve: not for model evaluation, only for business decisions
plot_lift_curve(y_true = y_val, y_probas = y_pred_p2)                     # uses y_pred_p2

# Decision boundary (placeholder)
# TODO: `DecisionBoundaryDisplay()` cannot be constructed without arguments (it needs xx0, xx1 and response).
# Build it from a fitted estimator trained on exactly two features with DecisionBoundaryDisplay.from_estimator.




# 所有作图函数都应使用：
plt.show()
# 来展示图表

# 可视化：回归任务

In [None]:
from sklearn.metrics import PredictionErrorDisplay
# Scatter plot visualising the prediction error (built from the validation targets and the current y_pred)
PredictionErrorDisplay(y_true = y_val, y_pred = y_pred).plot()

# 所有作图函数都应使用：
plt.show()
# 来展示图表

# 交叉验证与超参搜索

In [None]:
# k-fold cross-validation in a single call. The author recommends this approach for hyper-parameter tuning.
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, GroupKFold # The *KFold classes only split the data; scoring is done by cross_val_score
CrossValidator = KFold(n_splits = 5)
scores = cross_val_score(estimator = lr, X = X_train, y = y_train, cv = CrossValidator)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) # Print the mean and standard deviation of the per-fold scores.

# A more detailed cross-validation loop; assumes x and y are already preprocessed
from sklearn.svm import LinearSVC
model = LinearSVC()
def Kfold(n_splits = 5, X_train = X_train, y_train = y_train, model = model):
    """Run k-fold cross-validation by hand and return the per-fold precision.

    Args:
        n_splits: Number of folds.
        X_train: Feature array, indexable by an integer index array.
        y_train: Target array, indexable by an integer index array.
        model: Estimator with fit and predict; it is refitted on every fold.

    Returns:
        list: One precision score per fold, in fold order.
    """
    splitter = KFold(n_splits = n_splits)
    val_score = []
    for fold, (train_index, val_index) in enumerate(splitter.split(X = X_train, y = y_train), start = 1):
        # Index with [] - the arrays are not callable, so X_train(train_index) raised TypeError.
        train_X, val_X = X_train[train_index], X_train[val_index]
        train_y, val_y = y_train[train_index], y_train[val_index]
        model.fit(train_X, train_y)
        pred = model.predict(val_X)
        score = precision_score(val_y, pred)
        print("Fold " + str(fold) + "============> Precision:" + str(round(score, 4)))
        val_score.append(score)
    return val_score

# Cross-validation LeaveOneOut
from sklearn.model_selection import LeaveOneOut

# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  # the grid tunes `kernel` and `gamma`, which exist on SVC but not on LinearSVC
parameters = [{'C':[1, 10, 100, 1000], 'kernel':['linear']},
              {'C':[1, 10, 100, 1000], 'kernel':['rbf'], 'gamma':[0.5, 0.1, 0.01, 0.001, 0.0001]}
              ]
grid_search = GridSearchCV(estimator = SVC(), param_grid = parameters, scoring = 'accuracy', cv = 10, n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_      # mean cross-validated accuracy of the best parameter set
best_parameters = grid_search.best_params_   # the parameter combination that achieved it
best_index = grid_search.best_index_         # position of that combination in cv_results_
best_score = grid_search.best_score_         # same value as best_accuracy, kept under its original name

# 有监督方法：机器学习方法
只列出部分参数用于自己调参，不代表调参只有这些参数

In [None]:
# Placeholder value standing in for parameters you tune yourself.
# Replace it before running: 0 is not a valid value for leaf_size, n_neighbors, p or n_jobs.
k = 0

# KNN
from sklearn.neighbors import KNeighborsClassifier
# The keyword is `weights` (plural); `weight` raises TypeError.
knn = KNeighborsClassifier(algorithm = "auto", leaf_size = k, metric = "minkowski", metric_params = None, n_jobs = k, n_neighbors= k, p = k, weights = 'uniform')
knn.fit(X = X_train, y = y_train)
y_pred = knn.predict(X_val)


# LDA TODO
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


# Tree methods: CART
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
CARTclassifier = DecisionTreeClassifier(criterion = 'gini',  # split-quality criterion
                                        max_depth = k,       # maximum depth (k is the placeholder defined earlier in this cell)
                                        max_features = k     # features considered per split; commonly sqrt(number of features)
                                        )
CARTclassifier.fit(X = X_train, y = y_train)
y_pred = CARTclassifier.predict(X_val)


# Ensemble: plain random forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
forest = RandomForestClassifier(n_estimators = 500,           # number of trees
                                criterion = 'gini',           # split-quality criterion
                                max_depth = 7,                # maximum depth
                                max_features = 2              # features considered per split; commonly sqrt(number of features)
                                )
forest.fit(X = X_train, y = y_train)
y_pred = forest.predict(X_val)


# SVM classifier
from sklearn.svm import LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM, SVC, SVR # linear SVM, one-class SVM (outlier detection), Nu/C SVMs (penalty-parameterised), support vector regression
svm = SVC(C = 1.0,           # penalty coefficient (author's note: slack variable / nu = proportion of margin errors)
          kernel = 'linear') # linear kernel; other kernels: Gaussian / radial basis function (rbf), sigmoid, polynomial (poly)
svm.fit(X = X_train, y = y_train)
y_pred = svm.predict(X_val)


# Naive Bayes
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, GaussianNB, MultinomialNB
## In import order these target Bernoulli (binary), categorical, Gaussian (normal) and multinomial (count) features.
## Author's note: features are normal in most scenarios, so the example uses Gaussian naive Bayes; in NLP the multinomial or Bernoulli variants are more common.
BayesClassifier = GaussianNB()
BayesClassifier.fit(X = X_train, y = y_train)
y_pred = BayesClassifier.predict(X = X_val)


# 无监督方法：聚类

In [1]:
# KMeans. sklearn's KMeans class uses k-means++ initialisation by default, so no extra tuning is needed for that.
## First step: how to choose k?
### 1. Rule of thumb: k = sqrt(n / 2), where n is the number of samples
from math import sqrt
# len(x) is the sample count (len(x[0]) would be the feature count). The result is truncated to an
# int because it is later used as a range bound and as a cluster count.
k = max(1, int(sqrt(len(x) / 2)))
### 2. Elbow method, suited to small samples. The idea is comparable to the scree plot in EFA/PCA.
def Elbow(k = k, x = x):
    """Plot the within-cluster sum of squares (WCSS) for 1 .. k-1 clusters.

    Args:
        k: Exclusive upper bound on the number of clusters tried; truncated to int.
        x: Data to cluster, passed to KMeans.fit.

    Returns:
        The return value of plt.show().
    """
    from sklearn.cluster import KMeans  # local import: the module-level import only appears further down the file
    k = int(k)  # range() needs an integer bound
    cluster_counts = range(1, k)
    wcss = []
    for i in cluster_counts:
        # Fit with i clusters; a fixed cluster count would make every point of the curve identical.
        kmeans = KMeans(n_clusters = i, max_iter = 300, n_init = 10, init = 'k-means++', random_state = 0)
        kmeans.fit(x)
        wcss.append(kmeans.inertia_)  # inertia_ is the WCSS used to locate the elbow
    plt.plot(cluster_counts, wcss)
    plt.title('The Elbow Method')
    plt.xlabel('Number of Cluster')
    plt.ylabel('WCSS')
    return plt.show()
Elbow(k, x)
### 3. Canopy algorithm: run Canopy clustering (a coarse clustering method) for a first estimate, then use that result for the more precise KMeans clustering
from ML_utilities import Canopy, showCanopy # project-local helper module; its API is not visible in this file
t1, t2 = 0.6, 0.4  # the two thresholds handed to setThreshold (NOTE(review): presumably the loose and tight distances - confirm)
gc = Canopy(x)
gc.setThreshold(t1, t2)
canopies = gc.clustering()
print('Get %s initial centers.' % len(canopies)) # Author's note: stopping here is enough; the count is the candidate k
showCanopy(canopies, x, t1, t2)

## Second step: run KMeans. k-means++ is used to pick the initial centroids. label_kmeans holds the cluster label of every sample.
k = 3 # Enter the k chosen in the previous step
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = k, init = 'k-means++')
label_kmeans = kmeans.fit_predict(x)

# Learning vector quantization (LVQ)
from sklearn_lvq import GlvqModel
lvq = GlvqModel()  # created with default settings and not fitted in this file

def train_weighting_vectors(learning_rate, n_epochs, initial_weighting_vectors, training_df, y_name):
    """Train LVQ codebook vectors and return the updated copy.

    Args:
        learning_rate: Initial step size; it decays linearly to zero over the epochs.
        n_epochs: Number of passes over training_df.
        initial_weighting_vectors: DataFrame of starting codebook vectors. It must have the
            same column layout as training_df, because features are selected by position.
        training_df: DataFrame holding the feature columns plus the label column.
        y_name: Name of the label column.

    Returns:
        DataFrame: A copy of initial_weighting_vectors with adjusted feature values.
    """
    label_pos = training_df.columns.get_loc(y_name)
    feature_cols = training_df.columns[training_df.columns != y_name]
    feature_pos = [pos for pos in range(len(training_df.columns)) if pos != label_pos]

    # Work on a copy so the caller's initial vectors stay untouched.
    codebook = initial_weighting_vectors.copy()

    for epoch in range(n_epochs):
        # The step shrinks every epoch.
        step = learning_rate * (1 - (epoch / float(n_epochs)))

        for i in range(len(training_df)):
            sample = training_df.iloc[[i], feature_pos]

            # Euclidean distance from the sample to every codebook vector; the closest one wins.
            distance_table = sample.apply(
                lambda r: np.sqrt(((codebook.iloc[:, feature_pos] - r) ** 2).sum(axis=1)),
                axis=1,
            )
            winner = distance_table.idxmin(axis='columns').iloc[0]

            # Scaled offset between the sample and the winning vector.
            winner_vec = codebook.loc[[winner], feature_cols]
            offset = sample.reset_index(drop=True) - winner_vec.reset_index(drop=True)
            shift = offset.values * step

            current = codebook.loc[winner, feature_cols].values
            if codebook.loc[winner, y_name] == training_df.iloc[i, label_pos]:
                # Same class: pull the winner towards the sample.
                codebook.loc[[winner], feature_cols] = current + shift
            else:
                # Different class: push the winner away from the sample.
                codebook.loc[[winner], feature_cols] = current - shift

    return codebook

def predict_lqr(test_df, trained_vectors_df, y_name):
    """Predict a label for every row of test_df from the nearest codebook vector.

    Args:
        test_df: DataFrame with the feature columns and the label column y_name.
        trained_vectors_df: Codebook DataFrame with the same column layout as test_df.
        y_name: Name of the label column.

    Returns:
        Array with the label of the closest codebook vector for each test row.
    """
    label_pos = test_df.columns.get_loc(y_name)
    feature_pos = [pos for pos in range(len(test_df.columns)) if pos != label_pos]
    prototypes = trained_vectors_df.iloc[:, feature_pos]

    def distances_to_prototypes(sample):
        # Euclidean distance from one test row to every codebook vector.
        return np.sqrt(((prototypes - sample) ** 2).sum(axis=1))

    distance_table = test_df.iloc[:, feature_pos].apply(distances_to_prototypes, axis=1)
    nearest = distance_table.idxmin(axis=1)  # index label of the closest vector per row
    return trained_vectors_df.loc[nearest, y_name].values

# Gaussian mixture clustering
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture()  # default settings
gmm.fit(X = X_train)

# DBSCAN
from sklearn.cluster import DBSCAN
dbscan = DBSCAN()  # default settings
clusters = dbscan.fit_predict(X = X_train)  # cluster label per training sample

# Agglomerative clustering
from sklearn.cluster import AgglomerativeClustering
agg = AgglomerativeClustering(n_clusters = k)  # the keyword is `n_clusters`; `n_cluster` raises TypeError
assignment = agg.fit_predict(X = X_train)      # cluster label per training sample

# 无监督方法：降维

In [None]:
# Non-negative matrix factorization
from sklearn.decomposition import NMF, MiniBatchNMF, non_negative_factorization
nmf = NMF(n_components = k)
x_score = nmf.fit_transform(X = X_train)  # NOTE(review): NMF presumably requires non-negative input, which standardized data would violate - confirm

# PCA，详细代码见EFA+CFA的训练文件。KernelPCA等方法的写作方式类似
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, SparsePCA

# 因子分析不使用sklearn包。

# ICA
from sklearn.decomposition import FastICA, fastica
from mne.preprocessing import ICA

# 流形学习 TODO
from sklearn.manifold import TSNE
## t-SNE，用于高维数据可视化
from matplotlib import ticker

def add_2d_scatter(ax, points, points_color, title=None):
    """Draw a 2-D scatter of points on ax and hide the tick labels on both axes.

    Args:
        ax: Axes object to draw on.
        points: Array whose transpose unpacks into the x and y coordinates.
        points_color: Passed to scatter as the colour argument c.
        title: Optional axes title.
    """
    xs, ys = points.T
    ax.scatter(xs, ys, c=points_color, s=50, alpha=0.8)
    ax.set_title(title)
    # Use a null formatter on the x axis first, then on the y axis.
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter(ticker.NullFormatter())

def plot_2d(points, points_color, title):
    """Show a small single-panel figure with a 2-D scatter of points.

    Args:
        points: 2-D coordinates handed to add_2d_scatter.
        points_color: Colour values handed to add_2d_scatter.
        title: Figure-level title (set as the suptitle, not the axes title).
    """
    figure, axes = plt.subplots(
        figsize=(3, 3),
        facecolor="white",
        constrained_layout=True,
    )
    figure.suptitle(title, size=16)
    add_2d_scatter(axes, points, points_color)
    plt.show()
# Embed the training features into two dimensions with t-SNE, then plot the embedding.
X_embedded = TSNE(n_components = 2, learning_rate = 'auto', init = 'pca', perplexity = 20).fit_transform(X = X_train)
plot_2d(points = X_embedded, points_color = y_train, title = "T-distributed Stochastic \n Neighbor Embedding") # Remember to change points_color to suit your data

# 关联规则学习

In [None]:
# Association rule learning: Apriori
### Captures associations between items, computed non-parametrically. For discrete data; the raw data usually arrives as a sparse matrix.
### Preprocessing
path = ''  # placeholder: set this to the transactions CSV (one transaction per row, no header)
dataset = pd.read_csv(filepath_or_buffer = path, header = None)
# Build one item list per row. Iterating over each row's own cells removes the hard-coded width of
# 20 columns, and skipping empty (NaN) cells keeps the literal string 'nan' from being counted as an item.
transaction = [[str(item) for item in row if pd.notna(item)] for row in dataset.values]
from apyori import apriori # https://zhuanlan.zhihu.com/p/71538840 - the three measures are not explained here; see that column if needed.
rules = apriori(transactions = transaction, min_support = 0.003, min_confidence = 0.2, min_lift = 3, min_length = 2) # Set minimum support, confidence and lift to suit the business case
result = [list(rule) for rule in rules] # Each entry carries the three measures between items; they can later be arranged into an association matrix.