# [과제 3] 로지스틱 회귀분석
### - sklearn 패키지를 사용해 로지스틱 회귀분석을 진행해주세요.
### - 성능지표를 계산하고 이에 대해 해석해주세요.
### - 성능 개선을 시도해주세요. (어떠한 성능지표를 기준으로 개선을 시도했는지, 그 이유도 함께 적어주세요.)
### - 주석으로 설명 및 근거 자세하게 달아주시면 감사하겠습니다. :)

## Data 

출처 : https://www.kaggle.com/mlg-ulb/creditcardfraud


* V1 ~ V28 : 비식별화 된 개인정보 
* **Class** : Target 변수  
  - 1 : fraudulent transactions (사기)
  - 0 : otherwise 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = pd.read_csv("assignment3_creditcard.csv")

In [3]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-1.848212,2.3849,0.379573,1.048381,-0.84507,2.537837,-4.542983,-10.201458,-1.504967,-2.234167,...,2.585817,-5.29169,0.859364,0.423231,-0.506985,1.020052,-0.627751,-0.017753,0.280982,0
1,2.071805,-0.477943,-1.444444,-0.548657,0.010036,-0.582242,-0.042878,-0.24716,1.171923,-0.342382,...,-0.077306,0.042858,0.390125,0.041569,0.598427,0.098803,0.979686,-0.093244,-0.065615,0
2,-2.985294,-2.747472,1.194068,-0.003036,-1.151041,-0.263559,0.5535,0.6356,0.438545,-1.806488,...,1.345776,0.37376,-0.385777,1.197596,0.407229,0.008013,0.762362,-0.299024,-0.303929,0
3,-1.479452,1.542874,0.290895,0.838142,-0.52929,-0.717661,0.484516,0.545092,-0.780767,0.324804,...,0.038397,0.116771,0.40556,-0.116453,0.541275,-0.216665,-0.415578,0.027126,-0.150347,0
4,-0.281976,-0.309699,-2.162299,-0.851514,0.106167,-1.483888,1.930994,-0.843049,-1.249272,1.079608,...,-0.875516,-0.004199,1.015108,-0.026748,0.077115,-1.468822,0.7517,0.496732,0.331001,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28678 entries, 0 to 28677
Data columns (total 29 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      28678 non-null  float64
 1   V2      28678 non-null  float64
 2   V3      28678 non-null  float64
 3   V4      28678 non-null  float64
 4   V5      28678 non-null  float64
 5   V6      28678 non-null  float64
 6   V7      28678 non-null  float64
 7   V8      28678 non-null  float64
 8   V9      28678 non-null  float64
 9   V10     28678 non-null  float64
 10  V11     28678 non-null  float64
 11  V12     28678 non-null  float64
 12  V13     28678 non-null  float64
 13  V14     28678 non-null  float64
 14  V15     28678 non-null  float64
 15  V16     28678 non-null  float64
 16  V17     28678 non-null  float64
 17  V18     28678 non-null  float64
 18  V19     28678 non-null  float64
 19  V20     28678 non-null  float64
 20  V21     28678 non-null  float64
 21  V22     28678 non-null  float64
 22

결측치는 없음

# 데이터 분리

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [8]:
X = data.drop(["Class"], axis=1)
y = data["Class"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [10]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(21508, 28) (7170, 28) (21508,) (7170,)


# 칼럼 단위 정규화

In [None]:
normalizer=StandardScaler()
X_train=normalizer.fit_transform(X_train)
X_test=normalizer.transform(X_test)

# 모델

In [11]:
# Logistic Regression 
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression()

In [12]:
y_pred = classifier.predict(X_test) 
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
#mean score
classifier.score(X_test, y_test)

0.997907949790795

# **성능 평가하기**

In [18]:
from sklearn.metrics import *

In [19]:
# f1 score Precision과 Recall의 조화 평균 0.84가 나타났다.
f1_score(y_pred, y_test) 

0.845360824742268

In [20]:
confusion_matrix(y_pred, y_test) #이를 이용하여 정밀도와 민감도를 계산가능

array([[7114,   11],
       [   4,   41]], dtype=int64)

In [40]:
#칼럼에 따른 beta, exp(beta) 확인
column_name=["const"]+X.columns.tolist()
beta=np.concatenate([classifier.intercept_,classifier.coef_.reshape(-1)]).round(2)
odds=np.exp(beta).round(2)
interpret=np.where(beta>0,"risky","protective")

#beta 해석
beta_analysis=pd.DataFrame(np.c_[beta,odds,interpret],index=column_name,columns=["beta","exp(beta)","interprt"])
beta_analysis

Unnamed: 0,beta,exp(beta),interprt
const,-7.38,0.0,protective
V1,-0.0,1.0,protective
V2,-0.05,0.95,protective
V3,0.03,1.03,risky
V4,0.84,2.32,risky
V5,0.01,1.01,risky
V6,-0.23,0.79,protective
V7,0.04,1.04,risky
V8,-0.2,0.82,protective
V9,0.06,1.06,risky


### **test data 예측**

In [42]:
classifier.predict_proba(X_test)

array([[9.99150341e-01, 8.49658679e-04],
       [9.99678842e-01, 3.21158487e-04],
       [9.80151409e-01, 1.98485909e-02],
       ...,
       [9.99926536e-01, 7.34644299e-05],
       [9.96021189e-01, 3.97881060e-03],
       [9.99271417e-01, 7.28582810e-04]])

In [43]:
#패키지 이용하지 않고 직접 계산하는 경우

#beta 곱하기 X
Xbeta=np.matmul(np.c_[np.ones(X_test.shape[0]),X_test],beta.reshape(-1,1))

#P(Y=1) 계산
P_1=1/(1+np.exp(-Xbeta))
P_1

array([[8.57972949e-04],
       [3.19293682e-04],
       [1.97539127e-02],
       ...,
       [7.50431405e-05],
       [4.06482677e-03],
       [7.43555254e-04]])

## **성능 평가하기**

### **cutoff 조절**

In [45]:
Cut_off=np.linspace(0.01,0.99,10) #cut off 값 만들기
for cutoff in Cut_off:
  y_pred=np.where(P_1.reshape(-1)>=cutoff,1,0)
  #cutoff 보다 크면 1 (유방암), 아니면 0 (유방암 X)
  acc=accuracy_score(y_true=y_test,y_pred=y_pred) #정확도
  recall=recall_score(y_true=y_test,y_pred=y_pred) #민감도
  precision=precision_score(y_true=y_test,y_pred=y_pred) #정밀도
  f1score = f1_score(y_true=y_test,y_pred=y_pred) #f1score

  print(f"정확도 : {acc : 0.2f}",
        f"민감도 : {recall : 0.2f}",
        f"정밀도 : {precision : 0.2f}",
        f"f1score : {f1score : 0.2f}",
        f"cut off : {cutoff : 0.2f}")

정확도 :  0.97 민감도 :  0.98 정밀도 :  0.19 f1score :  0.32 cut off :  0.01
정확도 :  1.00 민감도 :  0.90 정밀도 :  0.85 f1score :  0.88 cut off :  0.12
정확도 :  1.00 민감도 :  0.87 정밀도 :  0.88 f1score :  0.87 cut off :  0.23
정확도 :  1.00 민감도 :  0.83 정밀도 :  0.90 f1score :  0.86 cut off :  0.34
정확도 :  1.00 민감도 :  0.81 정밀도 :  0.91 f1score :  0.86 cut off :  0.45
정확도 :  1.00 민감도 :  0.79 정밀도 :  0.91 f1score :  0.85 cut off :  0.55
정확도 :  1.00 민감도 :  0.71 정밀도 :  0.95 f1score :  0.81 cut off :  0.66
정확도 :  1.00 민감도 :  0.65 정밀도 :  0.94 f1score :  0.77 cut off :  0.77
정확도 :  1.00 민감도 :  0.62 정밀도 :  0.94 f1score :  0.74 cut off :  0.88
정확도 :  1.00 민감도 :  0.54 정밀도 :  0.97 f1score :  0.69 cut off :  0.99


컷오프 0.45를 채택하여 f1 스코어를  0.84에서 0.86으로 끌어올려 성능을 올리도록함.  
f1 스코어를 사용한 이유는 단순 정확도를 사용하는 것은 y의 분포가 정확히 반반이 아니기에  
적합하지 못하고 민감도와 정밀도를 둘다 고려할 수 있는 방법이 적합하다고 생각했기 때문이다.