### 지도학습

#### 분류 - 이진분류
- 목표변수가 2개 항목인 것
- 목표변수 : Survived

In [1]:
import pandas as pd

In [2]:
df_TFD = pd.read_csv('../../../datasets/TitanicFromDisaster_train.csv')
df_TFD[:2]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### 전처리

In [102]:
df_TFD.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [103]:
df_TFD_extract = df_TFD[['Survived', 'Pclass', 'Age']]
df_TFD_extract.isnull().sum()

Survived      0
Pclass        0
Age         177
dtype: int64

In [104]:
# age null값 존재 -> regression으로 null값을 채울 수 있음. 여기선 drop함
df_TFD_extract_preprocess = df_TFD_extract.dropna()
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age
0,0,3,22.0
1,1,1,38.0


#### Scaling & Encoding

##### Encoding with OneHoyEncoding

In [105]:
df_TFD_extract_preprocess['Pclass'].value_counts()

3    355
1    186
2    173
Name: Pclass, dtype: int64

In [106]:
from sklearn.preprocessing import OneHotEncoder

In [107]:
oneHotEncoder = OneHotEncoder()
oneHotEncoder.fit(df_TFD_extract_preprocess[['Pclass']]) # 해당 항목 학습한 것임.

In [108]:
columns_name = oneHotEncoder.categories_

In [109]:
# oneHotEncoder.transform(df_TFD_extract_preprocess[['Pclass']]).toarray() # 실제값 확인용
encoded_data = oneHotEncoder.transform(df_TFD_extract_preprocess[['Pclass']]).toarray()

In [110]:
df_encoded_data = pd.DataFrame(data=encoded_data, columns=columns_name)
df_encoded_data[:2]

Unnamed: 0,1,2,3
0,0.0,0.0,1.0
1,1.0,0.0,0.0


In [119]:
df_encoded_data.index, df_encoded_data.shape

(Int64Index([  0,   1,   2,   3,   4,   6,   7,   8,   9,  10,
             ...
             880, 881, 882, 883, 884, 885, 886, 887, 889, 890],
            dtype='int64', length=714),
 (714, 3))

In [120]:
df_TFD_extract_preprocess.index, df_TFD_extract_preprocess.shape

(Int64Index([  0,   1,   2,   3,   4,   6,   7,   8,   9,  10,
             ...
             880, 881, 882, 883, 884, 885, 886, 887, 889, 890],
            dtype='int64', length=714),
 (714, 6))

In [111]:
# pd.concat([df_TFD_extract_preprocess, df_encoded_data], axis=1, ignore_index=True)
# df_encoded_data = pd.get_dummies(df_TFD_extract_preprocess['Pclass'], prefix='Pclass')

In [112]:
df_TFD_extract_preprocess = pd.concat([df_TFD_extract_preprocess, df_encoded_data], axis=1)
df_TFD_extract_preprocess[:2]

Unnamed: 0,Survived,Pclass,Age,Pclass_1,Pclass_2,Pclass_3
0,0,3,22.0,0,0,1
1,1,1,38.0,1,0,0


In [113]:
df_TFD_extract_preprocess[['Pclass']][:3]

Unnamed: 0,Pclass
0,3
1,1
2,3


In [114]:
target = df_TFD_extract_preprocess['Survived']

In [115]:
features = df_TFD_extract_preprocess.drop(columns=['Survived', 'Pclass'])

In [116]:
features.columns

Index(['Age', 'Pclass_1', 'Pclass_2', 'Pclass_3'], dtype='object')

#### MinMaxScaler

In [117]:
from sklearn.preprocessing import MinMaxScaler

#### 정형화 단계 
- 데이터를 머신에 넣기 전 목표변수와 설명변수를 분리
- 데이터 분할
    - 머신러닝 모델을 훈련하고 성능을 측정할 때 훈련 데이터와 테스트 데이터 비율을 8:2로 설정한다.
    - train_test_split() : 8:2 비율로 만들어주는 fuction
    - 데이터가 500개 미만일 경우 split을 하는 것보다 데이터를 더 모으는 게 좋다.

In [77]:
from sklearn.model_selection import train_test_split

In [78]:
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=111)
features_train.shape, target_train.shape, features_test.shape, target_test.shape

((535, 4), (535,), (179, 4), (179,))

In [79]:
target_train = df_TFD_extract_preprocess['Survived']
features_train = df_TFD_extract_preprocess[['Pclass', 'Age']]   # label(=feature)
target_train.shape, features_train.shape

((714,), (714, 2))

#### 모델학습

In [94]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()  # 인스턴스화
model.fit(features_train, target_train)  # 모델 훈련은 fit() fuction을 이용함 -> fit(feature, target)

In [95]:
model.coef_, model.intercept_

(array([[-1.22653571, -0.04149665]]), array([3.532956]))

#### 예측

In [11]:
# 값을 넣어서 확인해보기
df_TFD_extract_preprocess[10:15]   # index가 features_train과 같다.

Unnamed: 0,Survived,Pclass,Age
11,1,1,58.0
12,0,3,20.0
13,0,3,39.0
14,0,3,14.0
15,1,2,55.0


In [12]:
model.predict(features_train[10:15])
# 위에서는 0이 3개 1이 2개
# 결과: array([0, 0, 0, 0, 0]) 60% 맞음

array([0, 0, 0, 0, 0], dtype=int64)

In [13]:
# predict_proba() : 확률을 알 수 있음
model.predict_proba(features_train[10:15])

# [0.52507531, 0.47492469] -> 앞은 0에 대한 열, 뒤는 1에 대한 열
# 0에 대한 확률이 더 높기 때문에 위에서 0으로 결과가 나온 것

array([[0.52507531, 0.47492469],
       [0.72642991, 0.27357009],
       [0.85383733, 0.14616267],
       [0.67427932, 0.32572068],
       [0.768957  , 0.231043  ]])

#### 평가

In [96]:
target_train_predict = model.predict(features_train)
target_train_predict.shape  # target_train.shape과 동일

(714,)

In [97]:

from sklearn.metrics import accuracy_score

In [98]:
# 정확도 평가 (실제값, 예측값을 넣으면 정확도가 평가된다)
accuracy_score(target_train, target_train_predict)  # 교내 시험
# 0.696078431372549 정규화 이전

0.696078431372549

In [99]:
target_test_predict = model.predict(features_test)
target_test_predict.shape   # target_test.shape과 동일



ValueError: X has 4 features, but LogisticRegression is expecting 2 features as input.

In [18]:
accuracy_score(target_test, target_test_predict)   # 교외 시험

# 교내 시험, 교외 시험의 두 개의 차이가 많이 나면 모델 성능에 문제가 있다고 판단
# 0.7, 0.65는 0.05정도의 차이이므로 양호함

0.6703910614525139

##### 성능지표
1. accuracy(정확도)
2. precision(정밀도) : 모델의 예측결과가 positive인 것 중 실제값이 positive인 확률
3. recall(재현율) : 실제 True인 것 중에서 모델이 True라고 예츠한 것의 비율로 민감도와 같은 의미
    - 정밀도와 재현율은 상호보완적, 두 지표가 모두 높을수록 좋은 지표

In [19]:
from sklearn.metrics import classification_report

In [100]:
print(classification_report(target_train, target_train_predict))   # vs code의 파이썬에서 사용되는 코드라서 print로 감싸줘야 모양이 이상하지 않게 나옴

              precision    recall  f1-score   support

           0       0.71      0.82      0.76       424
           1       0.66      0.52      0.58       290

    accuracy                           0.70       714
   macro avg       0.69      0.67      0.67       714
weighted avg       0.69      0.70      0.69       714



In [101]:
print(classification_report(target_test, target_test_predict))

              precision    recall  f1-score   support

           0       0.73      0.76      0.74       112
           1       0.56      0.52      0.54        67

    accuracy                           0.67       179
   macro avg       0.65      0.64      0.64       179
weighted avg       0.67      0.67      0.67       179



#### 오차 행렬

In [22]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [23]:
confusion_matrix(target_train, target_train_predict)

array([[347,  77],
       [140, 150]], dtype=int64)

In [24]:
precision_score(target_train, target_train_predict)

0.6607929515418502

In [25]:
recall_score(target_train, target_train_predict)

0.5172413793103449

In [26]:
f1_score(target_train, target_train_predict)

0.5802707930367504