# 타이타닉 프로세스

In [19]:
import pandas as pd

In [20]:
df_train = pd.read_csv("train.csv")

In [21]:
df_test = pd.read_csv("test.csv")

In [22]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [23]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 1. 수치 특성을 위한 파이프라인부터 시작해서 전처리 파이프라인 만들기

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
imputer1 = SimpleImputer(strategy="median")

In [35]:
num_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy="median")),
    ('scaler',StandardScaler())
])

### 2. 범주형 특성을 위한 파이프라인 만들기

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [None]:
imputer2 = SimpleImputer(strategy="mode")

In [37]:
category_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy="most_frequent")),
     ('cat_encoder',OneHotEncoder(sparse=False))
])

### 3. 두 파이프라인 연결하기

In [38]:
from sklearn.compose import ColumnTransformer

In [39]:
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

In [40]:
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", category_pipeline, cat_attribs)
])

In [42]:
fullpipe = full_pipeline.fit_transform(df_train[num_attribs + cat_attribs])



### 4. 모델 훈련

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
rand = RandomForestClassifier(n_estimators=100, random_state = 42)

In [50]:
rand.fit(fullpipe,df_train['Survived'])

In [57]:
X_test = full_pipeline.transform(df_test[num_attribs + cat_attribs])

In [59]:
y_pred = rand.predict(X_test)

In [60]:
#최종 예측 값 -> tot
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

### 5. 교차 검증을 사용한 평가

In [53]:
from sklearn.model_selection import cross_val_score 

In [61]:
y_train = df_train['Survived']

In [62]:
scores = cross_val_score(rand,fullpipe,y_train,cv = 10)

In [65]:
scores.mean()

0.8092759051186016

### 6. Real Predict
- 모델 저장
  - RandomforestClassifier을 pickle로 저장하시오.

- 파이프라인 저장
    - joblib을 이용해 pipeline을(전체 파이프라인) 저장하시오.

- 실제 예측
    - 파이프라인 로드하시오.
    - 주어진 사람들을 예측하시오.
      - 모델 로드하시오.

#### 모델 저장

In [93]:
import pickle

In [104]:
with open('model.pkl', 'wb') as f:
    pickle.dump(rand,f)

#### 파이프라인 저장

In [82]:
from joblib import dump

In [89]:
dump(full_pipeline, 'fullpipe.joblib')

['fullpipe.joblib']

#### 모델, 파이프라인 로드 및 실제 예측

In [106]:
#모델 로드
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [109]:
person = {"kim": [22.0, 1, 0, 7.25, 3, 'male', 'S'],
"lee" : [38.0, 1, 0, 71.2833, 1, 'female', 'C'],
"park" : [26.0, 0, 0, 7.925, 3, 'female', 'S']}

In [107]:
#파이프라인 로드
from joblib import load
real_pipe = load('fullpipe.joblib')

In [207]:
attr = num_attribs + cat_attribs
for name, data in person.items():
    data = np.asarray(data,dtype='object').reshape(1,-1)
    
    df = pd.DataFrame(data, columns = attr)
    X_test = real_pipe.transform(df)
    
    pred = model.predict(X_test)
    a,b = model.predict_proba(X_test)[0]
    if pred:
        print(f'{name} is Survived || Percent: {a:.2f}:{b:.2f}')
    else:
        print(f'{name} is Died... || Percent {a:.2f}:{b:.2f}')

kim is Died... || Percent 0.84:0.16
lee is Survived || Percent: 0.00:1.00
park is Survived || Percent: 0.12:0.88


### 후기: 파이프라인에 모델을 추가해보는 방안을 생각해보자