# Kaggle 신용카드 사기 검출
https://www.kaggle.com/mlg-ulb/creditcardfraud
## Credit Card Fraud Detection
* creditcard.csv (284,807 * 31)
* Class : <font color='blue'> '0' (정상결제) </font>, <font color='red'> '1' (부정결제) </font>
* 사기 검출(Fraud Detection), 이상 탐지(Anomaly Detection)

# I. From Github
* 'creditCardFraud.zip' 파일 다운로드

### 코랩 환경
!wget https://raw.githubusercontent.com/rusita-ai/pyData/master/creditCardFraud.zip

### 압축 해제
!unzip /content/creditCardFraud.zip

In [2]:
# Windows (local) environment: download the dataset archive and unpack it
import requests
import zipfile
import os

url = 'https://raw.githubusercontent.com/rusita-ai/pyData/master/creditCardFraud.zip'
zip_path = 'creditCardFraud.zip'

# Download the archive.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error (e.g. 404) instead of saving
# the error page to disk as if it were the zip file.
print('파일 다운로드 중......')
response = requests.get(url, timeout=60)
response.raise_for_status()
with open(zip_path, 'wb') as f:
    f.write(response.content)
print('다운로드 완료')

# Directory the archive is extracted into
extract_dir = './creditCardFraud'

# Create the directory if it does not exist yet
os.makedirs(extract_dir, exist_ok=True)

# Extract the archive
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print('압축 해제 완료!')

# (Optional) delete the downloaded zip file
os.remove(zip_path)

파일 다운로드 중......
다운로드 완료
압축 해제 완료!


# II. Data Preprocessing

> ## 1) 데이터 읽어오기
* pandas DataFrame

In [3]:
import pandas as pd

# Read the extracted CSV into a DataFrame, then print its schema summary
# (row count, column names, non-null counts and dtypes).
csv_path = './creditCardFraud/creditcard.csv'
DF = pd.read_csv(csv_path)
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
# Preview the first rows of the freshly loaded frame ('Time' still holds raw seconds)
DF.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


* 0(정상) Class와 1(사기) Class 개수

In [6]:
# Number of rows per class label
DF['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

* 0(정상) Class와 1(사기) Class 비율

In [7]:
# Share of rows per class label (normalize=True returns fractions instead of counts)
DF['Class'].value_counts(normalize=True)

Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

> ## 2) 'Time' -> 'hours'
* 'Time' : 각 거래와 첫 번째 거래 사이에 경과된 초('Seconds')

In [8]:
# How many transactions share each raw 'Time' value
DF['Time'].value_counts()

Time
163152.0    36
64947.0     26
68780.0     25
3767.0      21
3770.0      20
            ..
172760.0     1
172758.0     1
172757.0     1
172756.0     1
172754.0     1
Name: count, Length: 124592, dtype: int64

> ### (1) 시간('hours') 정보 생성

In [9]:
# Replace the elapsed-seconds 'Time' column with the hours component (0-23,
# whole days dropped) of that elapsed time.
#
# The column is overwritten in place, so running this cell a second time would
# interpret the hour values as seconds and turn every row into 0. 'Time' is
# loaded as float64 and becomes an integer column after the conversion, so the
# dtype check below lets the cell be re-run without corrupting the data.
if pd.api.types.is_float_dtype(DF['Time']):
    timedelta = pd.to_timedelta(DF['Time'], unit='s')
    DF['Time'] = (timedelta.dt.components.hours).astype(int)

In [10]:
# Preview the first rows again to check the 'Time' column after the hour conversion
DF.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [11]:
# Number of transactions per value of the converted 'Time' column
DF['Time'].value_counts()

Time
21    17703
18    17039
11    16856
20    16756
10    16598
14    16570
15    16461
16    16453
17    16166
9     15838
19    15649
22    15441
12    15420
13    15365
23    10938
8     10276
0      7695
7      7243
1      4220
6      4101
3      3492
2      3328
5      2990
4      2209
Name: count, dtype: int64

> ## 3) Train_test_split()

* X (Input), y(Output) 지정

In [12]:
# Positional split: the last column is the label, every column before it is an input.
X, y = DF.iloc[:, :-1], DF.iloc[:, -1]
(X.shape, y.shape)

((284807, 30), (284807,))

* Without 'Stratify' vs. With 'Stratify'

In [14]:
from sklearn.model_selection import train_test_split

# Plain 70/30 hold-out split with a fixed seed and no stratification.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=2045,
)

tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((199364, 30), (199364,), (85443, 30), (85443,))

In [16]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split with a fixed seed; stratify=y keeps the class
# proportions of y the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2045, stratify=y
)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((199364, 30), (199364,), (85443, 30), (85443,))

* Train_Data와 Test_Data의 1(부정) 비율이 균일

In [17]:
# Class proportions of the train labels, a blank separator line, then the
# same for the test labels.
print('Train_Data : ', '\n', y_train.value_counts(normalize=True), end='\n\n')
print('Test_Data : ', '\n', y_test.value_counts(normalize=True))

Train_Data :  
 Class
0    0.998275
1    0.001725
Name: proportion, dtype: float64

Test_Data :  
 Class
0    0.998268
1    0.001732
Name: proportion, dtype: float64


# III. MLP Modeling

> ## 1) MLPClassifier Modeling
* hidden_layer_sizes : 은닉층 노드의 개수
* activation : 활성화 함수
* solver : 최적화 함수
* max_iter : 학습 반복 횟수

In [21]:
from sklearn.neural_network import MLPClassifier

# Single-hidden-layer perceptron trained with stochastic gradient descent.
# hidden_layer_sizes is spelled as the one-element tuple (12,): a bare (12)
# is just the integer 12 in parentheses, not a tuple.
# NOTE(review): no feature-scaling step is visible before this cell, and
# scikit-learn recommends scaling inputs for MLPs — consider standardizing.
Model_NM = MLPClassifier(
    hidden_layer_sizes=(12,),  # one hidden layer with 12 units
    activation='logistic',     # logistic sigmoid in the hidden layer
    solver='sgd',              # stochastic gradient descent
    max_iter=5000,             # upper bound on training iterations
    random_state=2045          # fixed seed for reproducible training
)

Model_NM.fit(X_train, y_train)

In [34]:
from sklearn.neural_network import MLPClassifier

# Same single-hidden-layer perceptron, now optimized with Adam; rebinding
# Model_NM replaces any model previously stored under that name.
# hidden_layer_sizes is spelled as the one-element tuple (12,): a bare (12)
# is just the integer 12 in parentheses, not a tuple.
# NOTE(review): no feature-scaling step is visible before this cell, and
# scikit-learn recommends scaling inputs for MLPs — consider standardizing.
Model_NM = MLPClassifier(
    hidden_layer_sizes=(12,),  # one hidden layer with 12 units
    activation='logistic',     # logistic sigmoid in the hidden layer
    solver='adam',             # Adam optimizer
    max_iter=5000,             # upper bound on training iterations
    random_state=2045          # fixed seed for reproducible training
)

Model_NM.fit(X_train, y_train)

> ## 2) Test_Data에 Model 적용

In [35]:
# Predict class labels for the held-out test features with the fitted model
y_hat = Model_NM.predict(X_test)

> ## 3) Confusion Matrix

In [36]:
from sklearn.metrics import confusion_matrix

# labels=[1, 0] puts the fraud class first, so the matrix reads
# [[TP, FN],
#  [FP, TN]] with class 1 treated as the positive class.
confusion_matrix(
    y_true=y_test,
    y_pred=y_hat,
    labels=[1, 0],
)

array([[  123,    25],
       [   26, 85269]])

> ## 4) Classification Report

In [37]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 printed with five decimals. Classes are
# reported in sorted label order (0, then 1), so '정상' names class 0 and
# '부정' names class 1.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_hat,
        target_names=['정상', '부정'],
        digits=5,
    )
)

              precision    recall  f1-score   support

          정상    0.99971   0.99970   0.99970     85295
          부정    0.82550   0.83108   0.82828       148

    accuracy                        0.99940     85443
   macro avg    0.91261   0.91539   0.91399     85443
weighted avg    0.99941   0.99940   0.99940     85443

