In [1]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.metrics as metrics

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
titanic = pd.read_excel('../../data/titanic.xlsx')
titanic.head()

Unnamed: 0.1,Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town
0,0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton
1,1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg
2,2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton
3,3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton
4,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton


## 1. 특성데이터, 라벨데이터 구분
- (특성 : 등급, age, who, 탑승금액), (targer : survived)

In [3]:
x_data = titanic[['pclass','age','who','fare']]
y_data = titanic['survived']
# age Nan값 평균값으로 처리
x_data['age'] = x_data['age'].fillna(x_data['age'].mean())

print(x_data.shape, y_data.shape)

(891, 4) (891,)


In [4]:
x_data.head()

Unnamed: 0,pclass,age,who,fare
0,3,22.0,man,7.25
1,1,38.0,woman,71.2833
2,3,26.0,woman,7.925
3,1,35.0,woman,53.1
4,3,35.0,man,8.05


### 1-1 문자열 변환 만들기 
- Make_Column_Transformer

In [5]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import make_column_transformer

In [6]:
myt = make_column_transformer( (OneHotEncoder(), ['who']) )

### 1-2 데이터 셋 구분

In [7]:
# 분류 데이터셋 구분에서 'stratify =' 옵션을 반드시 줘야함
# 분류의 0과 1을 적절히 분배하기 위함

x_train, x_test, y_train, y_test=train_test_split(x_data, y_data, test_size=0.2, random_state=1, stratify = y_data)
print(x_train.shape,x_test.shape)

(712, 4) (179, 4)


## 2. 모델 적용

## GridSearchCV

In [9]:
from sklearn.model_selection import GridSearchCV

In [28]:
model = make_pipeline(myt, StandardScaler(), LogisticRegression())

# C를 조정해 과대적합 혹은 과소적합 문제해결
# C값이 크면 (높은 코스트) -> 훈련을 더 복잡하게 -> 약한 규제
# C값이 작으면(낮은 코스트)-> )
param_value = {'logisticregression__penalty':['l2', 'l1', 'elasticnet'],
                'logisticregression__C': [i for i in np.linspace(0,10,10)], 
                'logisticregression__solver':['newton-cg','liblinear','lbfgs','sag','saga']}

# 최적의 파라미터를 찾기 위해 GridSearch 사용
# 모델__파라미터 : 해당 모델에 파라미터이다. ex) ridge_alpha
# cv=교차검증 
gridS = GridSearchCV(model, param_grid=param_value)
gridS.fit(x_train, y_train)


print(gridS.best_params_)
print(gridS.best_score_)

{'logisticregression__C': 1.1111111111111112, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'newton-cg'}
0.7836403033586132


In [18]:
print('학습데이터 accuracy',gridS.score(x_train, y_train))
print('테스트 데이터 accuracy',gridS.score(x_test, y_test))

학습데이터 accuacy 0.7837078651685393
테스트 데이터 0.8100558659217877


### 데이터에 대한 예측값

In [23]:
a = [3,14,'child',11]
b = pd.DataFrame(a).T

print(x_train.iloc[1:2])
b.columns = x_train.columns
print(b)
gridS.predict(b)

     pclass   age    who    fare
542       3  11.0  child  31.275
  pclass age    who fare
0      3  14  child   11


array([1], dtype=int64)