# Predicting Republican and Democratic donations

## Part 2


下面以“预测共和党和民主党的捐款”为例，进行说明，数据下载[地址](https://www.dataquest.io/blog/large_files/input.csv)。

在Part1中已经对多个模型进行了平均化处理，为了更好的区别模型之间的性能差别，下面采用平均加权的方法进行数据的处理，即在对每个模型赋予初始权值，再进行平均处理。

In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# set seed to reproducibility
SEED = 222
np.random.seed(SEED)

#读取数据
df = pd.read_csv("/tmp/data_input/kaggle/Predicting_donations/input.csv")

#处理数据
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def get_train_test(test_size = 0.95):
    # 将数据分为训练集和测试集
    # 获取共和党的标签
    y = 1 * (df.cand_pty_affiliation == "REP")
    #去掉除共和党和民主党其他党派的捐款
    X = df.drop(["cand_pty_affiliation"], axis=1)
    X = pd.get_dummies(X, sparse=True)
    X.drop(X.columns[X.std() == 0], axis=1, inplace=True)
    
    return train_test_split(X, y, test_size=test_size)

#将原始数据处理为训练数据集和测试数据集
xtrain, xtest, ytrain, ytest = get_train_test()


In [78]:
# 集成更多的模型来对数据进行训练
from sklearn.svm import SVC, LinearSVC  # 支持向量机
from sklearn.naive_bayes import GaussianNB # 高斯朴素贝叶斯
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # 随机森林和Boosting
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier # K近邻
from sklearn.neural_network import MLPClassifier # 神经网络
from sklearn.kernel_approximation import Nystroem, RBFSampler
from sklearn.pipeline import make_pipeline


#生成基本的学习器
def get_models():
    nb = GaussianNB();
    svc = SVC(C=100, probability=True)
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=SEED)
    nn = MLPClassifier((80, 10), early_stopping=False,  random_state=SEED)
    gb = GradientBoostingClassifier(n_estimators=10, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)
    
    models = {"svm": svc,
             "knn":knn,
             "native bayes": nb,
             "lr": lr,
             "nn": nn,
             "boosting": gb,
             "random forest": rf,
             }
    
    return models


# 开始进行模型的训练
def train_predict(models_list):
    # fit多个学习模型，并返回预测结果
    store = np.zeros((ytest.shape[0], len(models_list)))
    store = pd.DataFrame(store)
    
    print("Starting to fit\n")
    cols = list()
    for i, (name, model) in enumerate(models_list.items()):
        print("%s..." % name, end=" ", flush=False)
        model.fit(xtrain, ytrain)
        store.iloc[:, i] = model.predict_proba(xtest)[:, 1]
        cols.append(name)
        print("model done\n")
    store.columns = cols
    print("Done\n")
    
    return store


# 构建预测模型
def score_model(y, store):
    print("Scoring model\n")
    for m in store.columns:
        score = roc_auc_score(y, store.loc[:, m])
        print ("%-26s: %.3f" % (m, score))
    print("Done\n")

**（一）定义基学习模型的库**

In [79]:
base_learners = get_models()

**（二）定义元学习器**

目前远学习器的选择有多种，包括logistics regression，SVM，KNN，Decision Tree等，也可以采用另外一个ensemble的集成学习器，下面采用Gradient Boosting Machine（GBM）。

为了确保 GBM 能够探索局部特征，我们需要限定每 1000 个决策树在 4 个基学习器的随机子集和 50% 的输入数据上进行训练。这样，GBM 就会表达每个基学习器在不同近邻输入空间上的预测内容。

In [80]:
meta_learner = GradientBoostingClassifier(n_estimators=1000,
                                         loss="exponential",
                                         max_features = 4,
                                         max_depth = 3,
                                         subsample=0.5,
                                         learning_rate=0.005,
                                         random_state=SEED)



**（三）生成训练集合测试集**

为基学习器准备训练集


In [81]:
xtrain_base, xpre_base, ytrain_base, ypre_base = train_test_split(xtrain, 
                                                                  ytrain, 
                                                                  test_size=0.5, 
                                                                  random_state=SEED)

**（四）在训练集上训练基学习器**


In [82]:
def train_base_learners(base_learner, inp, out, verbose=True):
    """train all base learners in the lib"""
    if verbose:
        print("Fitting models")
        
    for i , (name, m) in enumerate(base_learner.items()):
        if verbose:
            print("%s ..." % name, end=" ", flush=False)
            m.fit(inp, out)
        if verbose:
            print("Done\n")

In [83]:
# 训练学习器
train_base_learners(base_learners, xtrain_base, ytrain_base)

Fitting models
svm ... Done

knn ... Done

native bayes ... Done

lr ... Done

nn ... Done

boosting ... Done

random forest ... Done



**(五) 对数据进行预测**

In [84]:
def predict_base_learners(pre_base_learners, inp, verbose=True):
    """generate a prediction matrix"""
    P = np.zeros((inp.shape[0], len(pre_base_learners)))
    if verbose:
        print("Generate models predictions\n")
    for i, (name, m) in enumerate(pre_base_learners.items()):
        if verbose:
            print("%s ... " % name, end=" ", flush=False)
            pre = m.predict_proba(inp)
            P[:, i] = pre[:,1]
        if verbose:
            print("Done\n")
    return P

In [85]:
#对学习器进行预测
P_base = predict_base_learners(base_learners, xpre_base)

Generate models predictions

svm ...  Done

knn ...  Done

native bayes ...  Done

lr ...  Done

nn ...  Done

boosting ...  Done

random forest ...  Done



**（六）对元学习器进行训练**

In [86]:
meta_learner.fit(P_base, ypre_base)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.005, loss='exponential', max_depth=3,
              max_features=4, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=222, subsample=0.5, verbose=0,
              warm_start=False)

In [87]:
def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Generate prediction from ensemble"""
    P_pred = predict_base_learners(base_learner,inp,verbose=verbose)
    return P_pred, meta_learner.predict_proba(P_pred)[:, 1]

In [88]:
P_prd, p = ensemble_predict(base_learners, meta_learner, xtest, verbose=True)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))

Generate models predictions

svm ...  Done

knn ...  Done

native bayes ...  Done

lr ...  Done

nn ...  Done

boosting ...  Done

random forest ...  Done


Ensemble ROC-AUC score: 0.880


从上述的结果可以看出，这次集成学习模型优于之前的集成学习，但是仍然低于简单的决策树估计，这主要是只对一半的数据进行基学习器和元学习器的训练，所以大量的信息丢失了。为了防止这点，下面使用交叉验证策略进行进一步的优化。

### 交叉验证训练


在交叉验证训练基学习器时，每个基学习器的备份都进行了 K-1 fold 的拟合，并进行了剩余 fold 的预测。这一过程不断重复，直到每个 fold 都被预测。我们指定的 fold 越多，每次训练过程中的数据就越少。这使得交叉验证的预测在测试期间噪声更小，性能更好。但这显著增加了训练时间。通过交叉验证拟合一个集成经常被称为堆叠（stacking），而集成本身会被称为超级学习器（Super Learner）。

In [89]:
from sklearn.base import clone

def stacking(base_learners, meta_learner, X, y, generator):
    """Simple training routine for stacking."""
    print("Fitting final base learners...", end=" ")
    train_base_learners(base_learners, X, y, verbose=False)
    print("done")
    
    # Generate predictions for training meta learners
    # Outer loop:
    
    print("Generating cross-validated predictions...")
    cv_pre, cv_y = [],[]
    
    for i, (train_idx, test_idx) in enumerate(generator.split(X)):
        
        flod_xtrain, flod_ytrain = X[train_idx,:], y[train_idx]
        flod_xtest, flod_ytest = X[test_idx,:], y[test_idx]
        
        flod_base_learners = {name: clone(learner) 
                              for name, learner in base_learners.items()}
        train_base_learners(flod_base_learners, flod_xtrain, flod_ytrain, verbose=False)
        flod_P_base = predict_base_learners(flod_base_learners, flod_xtest, verbose=False)
        
        cv_pre.append(flod_P_base)
        cv_y.append(flod_ytest)
        
        print("Flod %i is done" % (i + 1))
    
    print("CV-predictions done")
    
    cv_pre = np.vstack(cv_pre)
    cv_y = np.hstack(cv_y)
    
    print("Fitting meta learning...", end=" ")
    meta_learner.fit(cv_pre, cv_y)
    print("Done\n")
    
    return base_learners, meta_learner






In [90]:
from sklearn.model_selection import KFold

cv_base_learners, cv_meta_learner = stacking(get_models(), 
                                             clone(meta_learner), 
                                             xtrain.values, 
                                             ytrain.values, 
                                             KFold(2))

P_pre, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest, verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))


Fitting final base learners... done
Generating cross-validated predictions...
Flod 1 is done
Flod 2 is done
CV-predictions done
Fitting meta learning... Done


Ensemble ROC-AUC score: 0.500


使用工具包

In [91]:

from mlens.ensemble import SuperLearner
# Instantiate the ensemble with 10 folds
sl = SuperLearner(folds=10, random_state=SEED, verbose=2, backend="multiprocessing")

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)

# Train the ensemble

sl.fit(xtrain, ytrain)

# Predict the test set
p_sl = sl.predict_proba(xtest)
print("\nSuper Learner ROC_AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))


Fitting 2 layers
Processing layer-1             done | 00:03:10
Processing layer-2             done | 00:00:02
Fit complete                        | 00:03:14

Predicting 2 layers
Processing layer-1             done | 00:00:52
Processing layer-2             done | 00:00:01
Predict complete                    | 00:00:54

Super Learner ROC_AUC score: 0.888
