In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### 1.读取数据和数据的基本信息

In [3]:
df = pd.read_csv('/home/mw/input/credit_card8849/card_transdata.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
distance_from_home                1000000 non-null float64
distance_from_last_transaction    1000000 non-null float64
ratio_to_median_purchase_price    1000000 non-null float64
repeat_retailer                   1000000 non-null float64
used_chip                         1000000 non-null float64
used_pin_number                   1000000 non-null float64
online_order                      1000000 non-null float64
fraud                             1000000 non-null float64
dtypes: float64(8)
memory usage: 61.0 MB


distance_from_home :离发生交易的位置的距离  
distance_from_last_transaction:距离上一次交易发生的距离  
ratio_to_median_purchase_price:购买价格与中位购买价格的比值  
repeat_retailer:交易是否来自同一交易商  
used_chip :是否通过信用卡交易  
used_pin_number:交易是否使用密码进行  
online_order :交易是否为在线订单  
fraud:交易是否具有欺诈性

In [4]:
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


### 2.重复值与缺失值处理

In [5]:
df.isnull().sum()

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
repeat_retailer                   0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64

In [6]:
df.duplicated().sum()

0

不存在重复值与缺失值

In [7]:
df.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


### 3.探索性分析

In [8]:
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

In [32]:
n_sample = df.shape[0] # 1000000
n_0_sample = df.fraud.value_counts()[0]
n_1_sample = df.fraud.value_counts()[1]
print('0:{}; 1:{};'.format(n_0_sample/n_sample,n_1_sample/n_sample))

0:0.912597; 1:0.087403;


In [20]:
df_t = df.fraud.value_counts().reset_index()
df_t.replace({0:'非欺诈',1:'欺诈'},inplace = True)
fig = plt.figure(figsize = (10,4))
ax1 = fig.add_subplot(1,2,1)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
sns.barplot(x = 'index',y = 'fraud',data = df_t)
plt.xlabel('')
plt.xlabel('count',fontdict = {'fontsize':12})
ax2 = fig.add_subplot(1,2,2)
plt.pie(x = df_t['fraud'],labels = df_t['index'],autopct = '%1.1f%%',explode = [0.1,0],
startangle = 90,counterclock = False,wedgeprops = {'linewidth':1,'edgecolor':'black'})
plt.axis('square')
plt.show()

In [26]:
correlation_matrix = df.corr()
import seaborn as sns
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix,annot = True,cmap = 'coolwarm',linewidths = 0.5)
plt.title("热力图")
plt.show()

可以观察到ratio_to_median_purchase_price:购买价格与中位购买价格的比值与欺诈率有明显的联系

In [29]:
X = df.iloc[:,:-1]# 特征列
y = df.iloc[:,-1]# 目标列
from imblearn.over_sampling import SMOTE
smote = SMOTE() #过采样
X_resampled,y_resampled = smote.fit_resample(X,y)# 生成新的合成样本，使之更平衡

In [33]:
n_sample_new = X_resampled.shape[0] # 1000000
n_0_sample_new = y_resampled.value_counts()[0]
n_1_sample_new = y_resampled.value_counts()[1]
print('0:{}; 1:{};'.format(n_0_sample_new/n_sample_new,n_1_sample_new/n_sample_new))

0:0.5; 1:0.5;


### 4.划分训练集和测试集

In [34]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,random_state = 888,test_size = 0.2)

### 5.分别用逻辑回归和Xboost做分类

In [35]:
from sklearn.linear_model import LogisticRegression as LR
model_LR = LR(penalty = 'l2') # 利用l2正则化创建逻辑回归模型
model_LR.fit(X_train,y_train) 

LogisticRegression()

In [36]:
model_LR.score(X_test,y_test)

0.9436991663904405

In [37]:
# 考虑到偶然性的存在，进行五折交叉验证，取五次循环的平均值
from sklearn.model_selection import cross_val_score as CVS
CVS(model_LR,X_train,y_train,cv = 5).mean()

0.9419897202694234

In [38]:
from xgboost import XGBClassifier as XGBC
model_xgb = XGBC(n_estimators = 100,max_depth = 3)# 共100棵树，树的最大深度为3，效果不一定最好，注意调参
model_xgb.fit(X_train,y_train)

XGBClassifier()

In [39]:
model_xgb.score(X_test,y_test)

0.999830155134109

In [40]:
# 五折交叉验证
CVS(model_xgb,X_train,y_train,cv = 5).mean()

0.9995274474285264

#### 分类模型评价标准

In [46]:
# 导入评价指标
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
y_test_pred = model_xgb.predict(X_test)
auc = accuracy_score(y_test,y_test_pred)
print('准确率:{:.2f}%'.format(auc))
rc = recall_score(y_test,y_test_pred)
print('召回率:{:.2f}%'.format(rc))
pc = precision_score(y_test,y_test_pred)
print('精确率:{:.2f}%'.format(pc))
f1 = f1_score(y_test,y_test_pred)
print('f1_score:{:.2f}%'.format(f1))

准确率:1.00%
召回率:1.00%
精确率:1.00%
f1_score:1.00%


In [52]:
model_xgb.feature_importances_ #注意features_importances_后面下划线

array([0.2452287 , 0.12080529, 0.43155187, 0.04221135, 0.05669358,
       0.03601225, 0.06749702], dtype=float32)