In [21]:
# 一、导入库和模块
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestRegressor as rfr
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [8]:
# 二、导入数据并探索
data = pd.read_csv('./datas/rankingcard.csv',index_col=0)
print(data.info())
print()
print(data.shape)
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
SeriousDlqin2yrs                        150000 non-null int64
RevolvingUtilizationOfUnsecuredLines    150000 non-null float64
age                                     150000 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    150000 non-null int64
DebtRatio                               150000 non-null float64
MonthlyIncome                           120269 non-null float64
NumberOfOpenCreditLinesAndLoans         150000 non-null int64
NumberOfTimes90DaysLate                 150000 non-null int64
NumberRealEstateLoansOrLines            150000 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    150000 non-null int64
NumberOfDependents                      146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 13.7 MB
None

(150000, 11)


Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [9]:
# 三、数据预处理（重复值、缺失值处理）

#1.删除重复值并更新索引
data.drop_duplicates(inplace=True)
data.index = range(data.shape[0])

#2.使用均值填补家庭中不包括自身的家属人数（配偶，子女等）
data["NumberOfDependents"].fillna(int(data["NumberOfDependents"].mean()),inplace=True)

#3.使用随机森林填补收入
df = data.loc[:,data.columns != "MonthlyIncome"]
fill = data.loc[:,"MonthlyIncome"]

Ytrain = fill[fill.notnull()]
Ytest = fill[fill.isnull()]
Xtrain = df.iloc[Ytrain.index,:]
Xtest = df.iloc[Ytest.index,:]

rfr = rfr(n_estimators=100)
rfr = rfr.fit(Xtrain,Ytrain)
Ypredict = rfr.predict(Xtest)

data.loc[data["MonthlyIncome"].isnull(),"MonthlyIncome"] = Ypredict

#4.查看预处理完以后的数据
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149391 entries, 0 to 149390
Data columns (total 11 columns):
SeriousDlqin2yrs                        149391 non-null int64
RevolvingUtilizationOfUnsecuredLines    149391 non-null float64
age                                     149391 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    149391 non-null int64
DebtRatio                               149391 non-null float64
MonthlyIncome                           149391 non-null float64
NumberOfOpenCreditLinesAndLoans         149391 non-null int64
NumberOfTimes90DaysLate                 149391 non-null int64
NumberRealEstateLoansOrLines            149391 non-null int64
NumberOfTime60-89DaysPastDueNotWorse    149391 non-null int64
NumberOfDependents                      149391 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.5 MB


In [11]:
# 四、数据预处理（描述性统计处理异常值）
data.describe([0.01,0.1,0.25,.5,.75,.9,.99])

#1.年龄的最小值居然有0，这不符合银行的业务需求，即便是儿童账户也要至少8岁
#查看一下年龄为0的人有多少
print((data["age"] == 0).sum())
print()
#发现只有一个人年龄为0，可以判断这肯定是录入失误造成的，可以当成是缺失值来处理，直接删除掉这个样本
data = data[data["age"] != 0]

#2.NumberOfTime30-59DaysPastDueNotWorse列有人在过去两年内逾期35~59天98次，一年6个60天，两年内逾期98次这是怎么算出来的？
#有225个样本存在这样的情况，并且这些样本，标签并不都是1，他们并不都是坏客户。因此，我们基本可以判断，这些样本是某种异常，应该把它们删除。
print(data[data.loc[:,"NumberOfTimes90DaysLate"] > 90].count())
print()
data = data[data.loc[:,"NumberOfTimes90DaysLate"] < 90]

#3.恢复索引
data.index = range(data.shape[0])

#4.查看预处理完以后的数据
data.info()

1

SeriousDlqin2yrs                        225
RevolvingUtilizationOfUnsecuredLines    225
age                                     225
NumberOfTime30-59DaysPastDueNotWorse    225
DebtRatio                               225
MonthlyIncome                           225
NumberOfOpenCreditLinesAndLoans         225
NumberOfTimes90DaysLate                 225
NumberRealEstateLoansOrLines            225
NumberOfTime60-89DaysPastDueNotWorse    225
NumberOfDependents                      225
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149165 entries, 0 to 149164
Data columns (total 11 columns):
SeriousDlqin2yrs                        149165 non-null int64
RevolvingUtilizationOfUnsecuredLines    149165 non-null float64
age                                     149165 non-null int64
NumberOfTime30-59DaysPastDueNotWorse    149165 non-null int64
DebtRatio                               149165 non-null float64
MonthlyIncome                           149165 non-null float64
NumberOfOpe

In [19]:
# 五、数据预处理（ 上采样处理样本不均衡问题）

#1.预处理前
print('预处理前')
X = data.iloc[:,1:]
y = data.iloc[:,0]
print(y.value_counts())
n_sample = X.shape[0]
n_1_sample = y.value_counts()[1]
n_0_sample = y.value_counts()[0]
print('样本个数：{}; 1占{:.2%}; 0占{:.2%}'.format(n_sample,n_1_sample/n_sample,n_0_sample/n_sample))
print()

#2.预处理后
print('预处理后')
sm = SMOTE(random_state=42) #实例化
X,y = sm.fit_sample(X,y)
n_sample_ = X.shape[0]
print(pd.Series(y).value_counts())
n_1_sample = pd.Series(y).value_counts()[1]
n_0_sample = pd.Series(y).value_counts()[0]
print('样本个数：{}; 1占{:.2%}; 0占{:.2%}'.format(n_sample_,n_1_sample/n_sample_,n_0_sample/n_sample_))

预处理前
0    139292
1      9873
Name: SeriousDlqin2yrs, dtype: int64
样本个数：149165; 1占6.62%; 0占93.38%

预处理后
1    139292
0    139292
Name: SeriousDlqin2yrs, dtype: int64
样本个数：278584; 1占50.00%; 0占50.00%


In [22]:
# 六、分训练集和测试集
X = pd.DataFrame(X)
y = pd.DataFrame(y)
X_train, X_vali, Y_train, Y_vali = train_test_split(X,y,test_size=0.3,random_state=420)

#1.划分训练集
model_data = pd.concat([Y_train, X_train], axis=1)
model_data.index = range(model_data.shape[0])
model_data.columns = data.columns

#2.划分测试集
vali_data = pd.concat([Y_vali, X_vali], axis=1)
vali_data.index = range(vali_data.shape[0])
vali_data.columns = data.columns

#3.导出文件
model_data.to_csv("./datas/model_data.csv")
vali_data.to_csv("./datas/vali_data.csv")