In [1]:
# 代码采用了多个模型进行预测，并进行了简单的预处理
# 这些模型有：梯度下降树，随机森林，决策树，朴素贝叶斯，SVM，以及通过bagging串联起各个模型进行预测

In [2]:
# 导入科学计算库
import pandas as pd
import numpy as np

In [3]:
# 导入机器学习库中的评价函数
from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

In [4]:
# 导入预处理及相关工具函数
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [5]:
# 导入各种分类器
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [6]:
def get_score(y_pred, y_true):
    """
    获取模型的评价指标(准确率，精确率，回召率，F-score)，用于衡量模型的好坏
    """
    acc_ = accuracy_score(y_true=y_true,y_pred=y_pred)
    TP = np.sum(((y_pred == 1) & (y_true == 1))) 
    precision = TP / np.sum(y_pred)
    recall = TP / np.sum(y_true)
    print('TP: ',TP,'/', np.sum(y_true), 'all ',np.sum(y_pred), ' accuracy: ',acc_, ' precision: ',precision, ' recall: ',recall, ' F_score: ', 2 * precision * recall / (precision + recall),fbeta_score(y_true=y_true,y_pred=y_pred,beta=1) )

In [7]:
def get_features_middle(data):
    model_sample_strong_feature = data.copy()
    # 将身份信息以及财产信息进行编码
    model_sample_strong_feature['x_022/x_020'] = data['x_022'] / (data['x_020'] + 1e-10)
    model_sample_strong_feature['x_023/x_020'] = data['x_023'] / (data['x_020'] + 1e-10)
    model_sample_strong_feature['x_024/x_020'] = data['x_024'] /  (data['x_020'] + 1e-10)
    model_sample_strong_feature['x_025/x_020'] = data['x_025'] /  (data['x_020']+ 1e-10)
    model_sample_strong_feature['x_026/x_020'] = data['x_026'] /  (data['x_020'] + 1e-10)
    
    # 贷记卡的比例特征
    model_sample_strong_feature['x_028/x_021'] = data['x_028'] / (data['x_021'] + 1e-10)
    model_sample_strong_feature['x_029/x_021'] = data['x_029'] / (data['x_021'] + 1e-10)
    model_sample_strong_feature['x_030/x_021'] = data['x_030'] /  (data['x_021'] + 1e-10)
    model_sample_strong_feature['x_031/x_021'] = data['x_031'] /  (data['x_021'] + 1e-10)
    model_sample_strong_feature['x_032/x_021'] = data['x_032'] /  (data['x_021'] + 1e-10)
    
    # 银行卡的比例特征
    model_sample_strong_feature['all_cards'] = (data['x_034'] +  data['x_035'] + data['x_036'] + data['x_037'] + data['x_038'] + data['x_039'] + data['x_040']  ).values

    model_sample_strong_feature['x_034/all_cards'] = data['x_034'] / (model_sample_strong_feature['all_cards'] + 1e-10)
    model_sample_strong_feature['x_035/all_cards'] = data['x_035'] / (model_sample_strong_feature['all_cards'] + 1e-10)
    model_sample_strong_feature['x_036/all_cards'] = data['x_036'] /  (model_sample_strong_feature['all_cards'] + 1e-10)
    model_sample_strong_feature['x_037/all_cards'] = data['x_037'] /  (model_sample_strong_feature['all_cards'] + 1e-10)
    model_sample_strong_feature['x_038/all_cards'] = data['x_038'] /  (model_sample_strong_feature['all_cards'] + 1e-10)
    model_sample_strong_feature['x_039/all_cards'] = data['x_039'] /  (model_sample_strong_feature['all_cards'] + 1e-10)
    model_sample_strong_feature['x_040/all_cards'] = data['x_040'] /  (model_sample_strong_feature['all_cards'] + 1e-10)
   
    # 标准差还原
    model_sample_strong_feature['x_043/x_044'] = data['x_043'] / (data['x_044'] + 1e-10)
    model_sample_strong_feature['x_046/x_047'] = data['x_046'] / (data['x_047'] + 1e-10)
    model_sample_strong_feature['x_050/x_051'] = data['x_050'] / (data['x_051'] + 1e-10)
    model_sample_strong_feature['x_053/x_054'] = data['x_053'] / (data['x_054'] + 1e-10)
    model_sample_strong_feature['x_057/x_058'] = data['x_057'] / (data['x_058'] + 1e-10)
    model_sample_strong_feature['x_060/x_061'] = data['x_060'] / (data['x_061'] + 1e-10)
    model_sample_strong_feature['x_076/x_077'] = data['x_076'] / (data['x_077'] + 1e-10)
    model_sample_strong_feature['x_079/x_080'] = data['x_079'] / (data['x_080'] + 1e-10)
    model_sample_strong_feature['x_083/x_084'] = data['x_083'] / (data['x_084'] + 1e-10)
    model_sample_strong_feature['x_086/x_087'] = data['x_086'] / (data['x_087'] + 1e-10)
    model_sample_strong_feature['x_090/x_091'] = data['x_090'] / (data['x_091'] + 1e-10)
    model_sample_strong_feature['x_094/x_095'] = data['x_094'] / (data['x_095'] + 1e-10)
    model_sample_strong_feature['x_098/x_099'] = data['x_098'] / (data['x_099'] + 1e-10)
    model_sample_strong_feature['x_123/x_124'] = data['x_123'] / (data['x_124'] + 1e-10)
    model_sample_strong_feature['x_126/x_127'] = data['x_126'] / (data['x_127'] + 1e-10)

    
    # 每张卡（信用or其他）交易金额等；每笔（异地每笔）交易金额等；每笔还款金额等；每笔商旅，保险，家装，金融等的均值特征；每个月的平均交易笔数；其他有意义的均值特征
    
    model_sample_strong_feature['x_045/x_41'] = data['x_045'] / (data['x_041'] + 1e-10)
    model_sample_strong_feature['x_052/x_48'] = data['x_052'] / (data['x_048'] + 1e-10)
    model_sample_strong_feature['x_059/x_55'] = data['x_059'] / (data['x_055'] + 1e-10) 
    model_sample_strong_feature['x_064/x_062'] = data['x_064'] / (data['x_062'] + 1e-10)
    model_sample_strong_feature['x_067/x_065'] = data['x_067'] / (data['x_065'] + 1e-10)  
    model_sample_strong_feature['x_070/x_068'] = data['x_070'] / (data['x_068'] + 1e-10)
    model_sample_strong_feature['x_073/x_071'] = data['x_073'] / (data['x_071'] + 1e-10) 
    model_sample_strong_feature['x_078/x_074'] = data['x_078'] / (data['x_074'] + 1e-10)  
    model_sample_strong_feature['x_085/x_081'] = data['x_085'] / (data['x_081'] + 1e-10) 
    model_sample_strong_feature['x_100/x_101'] = data['x_100'] / (data['x_101'] + 1e-10)
    model_sample_strong_feature['x_102/x_103'] = data['x_102'] / (data['x_103'] + 1e-10) 
    model_sample_strong_feature['x_108/x_105'] = data['x_108'] / (data['x_105'] + 1e-10)
    model_sample_strong_feature['x_104/x_102'] = data['x_104'] / (data['x_102'] + 1e-10) 
    model_sample_strong_feature['x_109/x_110'] = data['x_109'] / (data['x_110'] + 1e-10)
    model_sample_strong_feature['x_111/x_109'] = data['x_111'] / (data['x_109'] + 1e-10) 
    model_sample_strong_feature['x_112/x_113'] = data['x_112'] / (data['x_113'] + 1e-10)
    model_sample_strong_feature['x_114/x_112'] = data['x_114'] / (data['x_112'] + 1e-10) 
    model_sample_strong_feature['x_115/x_116'] = data['x_115'] / (data['x_116'] + 1e-10)
    model_sample_strong_feature['x_117/x_115'] = data['x_117'] / (data['x_115'] + 1e-10)  
    model_sample_strong_feature['x_118/x_119'] = data['x_118'] / (data['x_119'] + 1e-10)
    model_sample_strong_feature['x_120/x_118'] = data['x_120'] / (data['x_118'] + 1e-10) 
    model_sample_strong_feature['x_125/x_121'] = data['x_125'] / (data['x_121'] + 1e-10) 
    model_sample_strong_feature['x_128/x_129'] = data['x_128'] / (data['x_129'] + 1e-10)
    model_sample_strong_feature['x_130/x_128'] = data['x_130'] / (data['x_128'] + 1e-10)

    # 每笔放款金额，每个机构的放款笔数，每个机构的放款金额
    model_sample_strong_feature['x_133/x_134'] = data['x_133'] / (data['x_134'] + 1e-10)
    model_sample_strong_feature['x_133/x_132'] = data['x_133'] / (data['x_132'] + 1e-10)
    model_sample_strong_feature['x_134/x_132'] = data['x_134'] / (data['x_132'] + 1e-10) 
    model_sample_strong_feature['x_138/x_139'] = data['x_138'] / (data['x_139'] + 1e-10)
    model_sample_strong_feature['x_138/x_137'] = data['x_138'] / (data['x_137'] + 1e-10)
    model_sample_strong_feature['x_139/x_137'] = data['x_139'] / (data['x_137'] + 1e-10) 
    model_sample_strong_feature['x_143/x_142'] = data['x_143'] / (data['x_142'] + 1e-10)
    model_sample_strong_feature['x_143/x_144'] = data['x_143'] / (data['x_144'] + 1e-10)
    model_sample_strong_feature['x_144/x_142'] = data['x_144'] / (data['x_142'] + 1e-10)

    # 每个机构的放款均值,失败还款笔数占比
    model_sample_strong_feature['x_151/x_149'] = data['x_151'] / (data['x_149'] + 1e-10)
    model_sample_strong_feature['x_152/x_149'] = data['x_152'] / (data['x_149'] + 1e-10)
    model_sample_strong_feature['x_152/x_151'] = data['x_152'] / (data['x_151'] + 1e-10)
    model_sample_strong_feature['x_154/x_153'] = data['x_154'] / (data['x_153'] + 1e-10)
    model_sample_strong_feature['x_156/x_153'] = data['x_156'] / (data['x_153'] + 1e-10)
    model_sample_strong_feature['x_157/x_153'] = data['x_157'] / (data['x_153'] + 1e-10)
    model_sample_strong_feature['x_158/x_153'] = data['x_158'] / (data['x_153'] + 1e-10)
    model_sample_strong_feature['x_159/x_153'] = data['x_159'] / (data['x_153'] + 1e-10)  
    model_sample_strong_feature['x_154/x_155'] = data['x_154'] / (data['x_155'] + 1e-10)  

    model_sample_strong_feature['x_164/x_162'] = data['x_164'] / (data['x_162'] + 1e-10)
    model_sample_strong_feature['x_165/x_162'] = data['x_165'] / (data['x_162'] + 1e-10)
    model_sample_strong_feature['x_165/x_164'] = data['x_165'] / (data['x_164'] + 1e-10)
    model_sample_strong_feature['x_167/x_166'] = data['x_167'] / (data['x_166'] + 1e-10)
    model_sample_strong_feature['x_169/x_166'] = data['x_169'] / (data['x_166'] + 1e-10)
    model_sample_strong_feature['x_170/x_166'] = data['x_170'] / (data['x_166'] + 1e-10)
    model_sample_strong_feature['x_171/x_166'] = data['x_171'] / (data['x_166'] + 1e-10)
    model_sample_strong_feature['x_180/x_181'] = data['x_180'] / (data['x_181'] + 1e-10) 
    model_sample_strong_feature['x_167/x_168'] = data['x_167'] / (data['x_168'] + 1e-10) 
    model_sample_strong_feature['x_172/x_167'] = data['x_172'] / (data['x_167'] + 1e-10)  

    model_sample_strong_feature['x_177/x_175'] = data['x_177'] / (data['x_175'] + 1e-10)
    model_sample_strong_feature['x_178/x_175'] = data['x_178'] / (data['x_175'] + 1e-10)
    model_sample_strong_feature['x_178/x_177'] = data['x_178'] / (data['x_177'] + 1e-10)
    model_sample_strong_feature['x_180/x_179'] = data['x_180'] / (data['x_179'] + 1e-10)
    model_sample_strong_feature['x_182/x_179'] = data['x_182'] / (data['x_179'] + 1e-10)
    model_sample_strong_feature['x_183/x_179'] = data['x_183'] / (data['x_179'] + 1e-10)
    model_sample_strong_feature['x_184/x_179'] = data['x_184'] / (data['x_179'] + 1e-10)
    model_sample_strong_feature['x_180/x_181'] = data['x_180'] / (data['x_181'] + 1e-10) 
    model_sample_strong_feature['x_185/x_180'] = data['x_185'] / (data['x_180'] + 1e-10)
 
    # 90天与30天的申请贷款机构的趋势，180天与90天的申请贷款机构的趋势，180天与30天的申请贷款机构的趋势；90天与30天的成功申请贷款机构的趋势，180天与90天的成功申请贷款机构的趋势，180天；
    # 30天的成功申请贷款机构的趋势；90天与30天的申请贷款笔数的趋势，180天与90天的申请贷款笔数的趋势，180天与30天的申请贷款笔数的趋势90天的申请贷款笔数的趋势
    model_sample_strong_feature['x_189/x_188'] = data['x_189'] / (data['x_188'] + 1e-10)
    model_sample_strong_feature['x_191/x_190'] = data['x_191'] / (data['x_190'] + 1e-10)
    model_sample_strong_feature['x_193/x_192'] = data['x_193'] / (data['x_192'] + 1e-10)
    model_sample_strong_feature['x_195/x_194'] = data['x_195'] / (data['x_194'] + 1e-10)
    model_sample_strong_feature['x_197/x_196'] = data['x_197'] / (data['x_196'] + 1e-10)
    model_sample_strong_feature['x_199/x_198'] = data['x_199'] / (data['x_198'] + 1e-10)
    model_sample_strong_feature['x_196/x_188'] = data['x_196'] / (data['x_188'] + 1e-10)
    model_sample_strong_feature['x_192/x_188'] = data['x_192'] / (data['x_188'] + 1e-10)
                                                        
    return model_sample_strong_feature 

In [8]:
# 导入训练文件
data = pd.read_csv("model.csv")

In [9]:
# 以下进行数据预处理
# 去重 + 特征属性处理 + 数据规范化
data = data.dropna(subset=['x_001'])
data[data.iloc[:, 2: ] < 0] = np.nan
X = data.iloc[:, 2:]
training_mean = X.mean()
X = X.fillna(training_mean)
Y = data['y']
X = get_features_middle(X)
X = preprocessing.scale(X)




In [10]:
# 声明PCA，用来将数据降低到50个维度
pca = PCA(50)

In [11]:
# 划分数据集
training_X, test_X, training_Y, test_Y = train_test_split(X, Y, test_size=0.3,shuffle = True)

In [12]:
# PCA对训练数据集进行降维
training_X = pca.fit_transform(training_X)

In [13]:
# 创建SVM分类器
clf = svm.SVC(gamma='auto', probability=True)
# 用SVM分类器进行训练，其中training_X，training_Y 分别代表属性列和类标签列
clf.fit(training_X, training_Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [14]:
regressor = DecisionTreeClassifier(random_state=0)

In [15]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=3, random_state=0).fit(training_X, training_Y)

In [16]:
regressor.fit(training_X, training_Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')

In [17]:
abc = AdaBoostClassifier(n_estimators=100)

In [18]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=None,
    min_samples_split=2, random_state=0)

In [19]:
etc = ExtraTreesClassifier(n_estimators=100, max_depth=None,
    min_samples_split=2, random_state=0)

In [20]:
nb = GaussianNB()

In [21]:
nb.fit(training_X, training_Y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
eclf = VotingClassifier(estimators=[('gbc', gbc), ('nb', nb), ('clf', clf)], voting='soft', weights=[3,5,1])  # 无权重投票


In [23]:
eclf.fit(training_X, training_Y)

VotingClassifier(estimators=[('gbc',
                              GradientBoostingClassifier(criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=1.0,
                                                         loss='deviance',
                                                         max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                              

In [25]:
rfc.fit(training_X, training_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [26]:
abc.fit(training_X, training_Y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=None)

In [27]:
etc.fit(training_X, training_Y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=0, verbose=0,
                     warm_start=False)

In [28]:
# 对测试数据进行降维处理
test_X = pca.transform(test_X)

In [29]:
# 下边**.predict是用各种分类器进行预测

In [30]:
# 用SVM预测，得到一个分类值的序列
preY = clf.predict(test_X)

In [31]:
# 用决策树预测，得到一个分类值的序列
preY1 = regressor.predict(test_X)

In [32]:
# 用梯度提升预测，得到一个分类值的序列
preY2 = gbc.predict(test_X)

In [33]:
# 用随机森林进行预测，得到一个分类值的序列
preY3 = rfc.predict(test_X)

In [34]:
preY4 = etc.predict(test_X)

In [35]:
# 使用集成学习的方法进行预测
preY5 = abc.predict(test_X)

In [36]:
# 用朴素贝叶斯分类器进行预测
preY6 = nb.predict(test_X)

In [37]:
# 使用了投票器，综合多个分类器的结果
preY7 = eclf.predict(test_X)

In [38]:
pd.Series(preY1).value_counts()

0    2359
1     638
dtype: int64

In [39]:
pd.Series(preY).value_counts()

0    2883
1     114
dtype: int64

In [40]:
get_score(preY, test_Y) # SVM评价结果

TP:  59 / 592 all  114  accuracy:  0.8038038038038038  precision:  0.5175438596491229  recall:  0.09966216216216216  F_score:  0.1671388101983003 0.1671388101983003


In [41]:
get_score(preY1, test_Y) # 决策树评价结果

TP:  198 / 592 all  638  accuracy:  0.7217217217217218  precision:  0.3103448275862069  recall:  0.3344594594594595  F_score:  0.32195121951219513 0.32195121951219513


In [42]:
get_score(preY2, test_Y) # 梯度下降树评价结果

TP:  152 / 592 all  454  accuracy:  0.7524190857524191  precision:  0.33480176211453744  recall:  0.25675675675675674  F_score:  0.2906309751434034 0.2906309751434034


In [43]:
get_score(preY3, test_Y) # 随机森林评价结果

TP:  42 / 592 all  79  accuracy:  0.8041374708041374  precision:  0.5316455696202531  recall:  0.07094594594594594  F_score:  0.12518628912071533 0.12518628912071533


In [44]:
get_score(preY4, test_Y)

TP:  22 / 592 all  50  accuracy:  0.8004671338004671  precision:  0.44  recall:  0.037162162162162164  F_score:  0.06853582554517133 0.06853582554517133


In [45]:
get_score(preY5, test_Y) # 运用集成学习综合多个决策树的评价结果

TP:  135 / 592 all  288  accuracy:  0.7964631297964632  precision:  0.46875  recall:  0.22804054054054054  F_score:  0.3068181818181818 0.3068181818181818


In [46]:
get_score(preY6, test_Y) # 朴素贝叶斯

TP:  481 / 592 all  2230  accuracy:  0.3793793793793794  precision:  0.215695067264574  recall:  0.8125  F_score:  0.3408929836995039 0.3408929836995039


In [47]:
get_score(preY7, test_Y) # bagging（朴素贝叶斯+决策树+SVM）

TP:  348 / 592 all  1077  accuracy:  0.675342008675342  precision:  0.3231197771587744  recall:  0.5878378378378378  F_score:  0.4170161773517076 0.4170161773517076


In [48]:
# 读取测试文件
test_data = pd.read_csv("test.csv")

In [49]:
test_data = test_data.iloc[:, 1:]
test_data[test_data.iloc[:, 1: ] < 0] = np.nan
test_data = test_data.fillna(training_mean)
test_data = get_features_middle(test_data)
test_data = preprocessing.scale(test_data)



In [50]:
test_data = pca.transform(test_data)

In [51]:
# 对测试数据进行预测，这里采用的SVM分类器进行预测，将clf, 改成regressor， rfc等便可以改用其他模型进行预测
res = clf.predict(test_data)

In [52]:
# 生成测试结果，存放在result.csv中
np.savetxt('result.csv', res,fmt='%d',delimiter=',')