In [31]:
import pandas as pd
loans_2007 = pd.read_csv('/Users/charley/百度云同步盘/chen/cc/data/LoanStats3a.csv', skiprows=1)

In [32]:
## 自动 删除缺失数据太多的列
half_count = len(loans_2007) / 2  #设置阈值
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
loans_2007 = loans_2007.drop(['desc', 'url'],axis=1)

In [33]:
## 自动 去掉单调重复列
orig_columns = loans_2007.columns
drop_columns = []
for col in orig_columns:
    col_series = loans_2007[col].dropna().unique()  #不为nan的值
    if len(col_series) == 1:
        drop_columns.append(col) #将单调的列添加到删除的列中
loans_2007 = loans_2007.drop(drop_columns, axis=1)

In [34]:
## 手动 部分特征剔除
loans_2007 = loans_2007.drop(["id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d","title","addr_state"], axis=1) #无关特征剔除
loans_2007 = loans_2007.drop(["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"], axis=1) #结果特征剔除

In [35]:
# 删除
loans_2007 = loans_2007.dropna(axis=0) #删除掉缺失的行
loans_2007 = loans_2007.drop("pub_rec_bankruptcies", axis=1) #删除掉缺失值依然太多的列

In [36]:
## 半自动 对于cat类别的标签替换
loans_2007 = loans_2007[(loans_2007['loan_status'] == "Fully Paid") | (loans_2007['loan_status'] == "Charged Off")] #有些异常的标签不显示了
status_replace = {     #设置替换字典
    "loan_status" : {
        "Fully Paid": 1,
        "Charged Off": 0,
    }
}
loans_2007 = loans_2007.replace(status_replace)  #进行cat类数据替换

In [37]:
## 半自动 处理数据格式
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
loans_2007 = loans_2007.replace(mapping_dict)  #复合格式字典替换

loans_2007["int_rate"] = loans_2007["int_rate"].str.rstrip("%").astype("float")  #字符转化
loans_2007["revol_util"] = loans_2007["revol_util"].str.rstrip("%").astype("float")

In [38]:
## 半自动 哑变量处理
cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
dummy_df = pd.get_dummies(loans_2007[cat_columns]).astype("int64") #生成哑变量
loans_2007 = pd.concat([loans_2007, dummy_df], axis=1) #列向拼接哑变量
loans_2007 = loans_2007.drop(cat_columns, axis=1)
loans_2007 = loans_2007.drop("pymnt_plan", axis=1)

In [39]:
## 半自动 时间格式处理
loans_2007=loans_2007.drop(["earliest_cr_line","last_pymnt_d","last_credit_pull_d"],axis=1)

In [40]:
## 最后的结果
print loans_2007.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38745 entries, 0 to 39749
Data columns (total 44 columns):
loan_amnt                              38745 non-null float64
int_rate                               38745 non-null float64
installment                            38745 non-null float64
annual_inc                             38745 non-null float64
loan_status                            38745 non-null int64
dti                                    38745 non-null float64
delinq_2yrs                            38745 non-null float64
inq_last_6mths                         38745 non-null float64
open_acc                               38745 non-null float64
pub_rec                                38745 non-null float64
revol_bal                              38745 non-null float64
revol_util                             38745 non-null float64
total_acc                              38745 non-null float64
total_rec_int                          38745 non-null float64
total_rec_late_fee       

In [68]:
## 数据集处理
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split 

cols = loans_2007.columns
features= cols.drop("loan_status")
x= loans_2007[features]
y= loans_2007["loan_status"]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.8)
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)


In [71]:
## 模型拟合
def log_reg(y_hat, y):
    p = 1.0 / (1.0 + np.exp(-y_hat))
    g = p - y.get_label()
    h = p * (1.0-p)
    return g, h

def error_rate(y_hat, y):  #定义误差率
    return 'error', float(sum(y.get_label() != (y_hat > 0.5))) / len(y_hat)


params = {
    #'booster': 'gbtree',
    'objective': 'binary:logistic',
    'subsample': 0.8,
    'colsample_bytree': 0.85,
    'eta': 0.3,
    'max_depth': 3,
    'seed': 2016,
    'silent': 1,
    #'eval_metric': 'rmse'
}
watchlist = [(data_test, 'eval'), (data_train, 'train')]
n_round = 5  # 多少轮
bst = xgb.train(param, data_train, num_boost_round=n_round, evals=watchlist, obj=log_reg, feval=error_rate) #添加了损失函数的梯度,和错误的评估方法


[0]	eval-error:0.000645	train-error:0.001355	eval-error:0.000645	train-error:0.001355
[1]	eval-error:0.000645	train-error:0.001355	eval-error:0.000645	train-error:0.001355
[2]	eval-error:0.000645	train-error:0.001355	eval-error:0.000645	train-error:0.001355
[3]	eval-error:0.000645	train-error:0.001355	eval-error:0.000645	train-error:0.001355
[4]	eval-error:0.000645	train-error:0.001355	eval-error:0.000645	train-error:0.001355


In [72]:
##预测
y_hat = bst.predict(data_test)
y_hat[y_hat>0.5]=1
y_hat[y_hat<=0.5]=0
y = data_test.get_label()

In [73]:
## 评估
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix #Confusion Matrix 混淆矩阵

confusion=confusion_matrix(y, y_hat)
print confusion_matrix(y, y_hat)
TP=confusion[1,1]
TN=confusion[0,0]
FP=confusion[0,1]
FN=confusion[1,1]

print metrics.precision_score(y, y_hat) # TP/(TP+FP)
print metrics.recall_score(y, y_hat)  # TP/(TP+FN)
print metrics.f1_score


[[1127    5]
 [   0 6617]]
0.999244941105
1.0
<function f1_score at 0x10af78320>
