# 모듈 임포트 & 설치

In [None]:
!pip install category_encoders



In [None]:
import numpy as np
import pandas as pd

import category_encoders as ce

import seaborn as sns

# 데이터 로드

In [None]:
# Columns kept for this notebook: row id, target, and the raw features.
keep_cols = ['userid', 'survived', 'age', 'fare', 'pclass', 'sex', 'embarked', 'deck']

# Load the seaborn titanic sample and expose the row position as 'userid'.
df = (
    sns.load_dataset('titanic')
    .reset_index()
    .rename(columns={'index': 'userid'})
    .loc[:, keep_cols]
)
df.shape

(891, 8)

In [None]:
# Preview the first five rows of the selected columns.
df.head(n=5)

Unnamed: 0,userid,survived,age,fare,pclass,sex,embarked,deck
0,0,0,22.0,7.25,3,male,S,
1,1,1,38.0,71.2833,1,female,C,C
2,2,1,26.0,7.925,3,female,S,
3,3,1,35.0,53.1,1,female,S,C
4,4,0,35.0,8.05,3,male,S,


## 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

SEED = 42

# Hold out 20% of the rows as the test set; SEED keeps the split reproducible.
train, test = train_test_split(df, random_state=SEED, test_size=0.2)

# Submission template: one row per test user with a placeholder score of 0.5.
# .assign() returns a new frame, so nothing is written into a slice of `test`
# (the original in-place assignment raised SettingWithCopyWarning).
submission = test[["userid", "survived"]].assign(survived=0.5)

# The test features must not carry the target column.
test = test.drop(columns=["survived"])
train.shape, test.shape, submission.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission["survived"] = 0.5


((712, 8), (179, 7), (179, 2))

In [None]:
# First three training rows (the index keeps the original row labels).
train.head(n=3)

Unnamed: 0,userid,survived,age,fare,pclass,sex,embarked,deck
331,331,0,45.5,28.5,1,male,S,C
733,733,0,23.0,13.0,2,male,S,
382,382,0,32.0,7.925,3,male,S,


In [None]:
# First three test rows; 'survived' has been dropped from this frame.
test.head(n=3)

Unnamed: 0,userid,age,fare,pclass,sex,embarked,deck
709,709,,15.2458,3,male,C,
439,439,31.0,10.5,2,male,S,
840,840,20.0,7.925,3,male,S,


In [None]:
# Submission template before prediction: placeholder value in 'survived'.
submission.head(n=5)

Unnamed: 0,userid,survived
709,709,0.5
439,439,0.5
840,840,0.5
720,720,0.5
39,39,0.5


# 데이터 전처리

## EDA

## 결측치 제거

In [None]:
# Drop 'deck' instead of imputing it. errors='ignore' keeps the cell
# re-runnable after the column is already gone.
train = train.drop(columns=['deck'], errors='ignore')

# Imputation statistics: most frequent port and mean age of the training rows.
embarked_mode = train['embarked'].mode().iloc[0]
age_mean = train['age'].mean()

# Fill through DataFrame.fillna with a per-column dict. The original
# `train['age'].fillna(..., inplace=True)` is a chained in-place write that
# pandas deprecates and that has no effect under copy-on-write.
train = train.fillna({'embarked': embarked_mode, 'age': age_mean})

train.isnull().sum()

userid      0
survived    0
age         0
fare        0
pclass      0
sex         0
embarked    0
dtype: int64

In [None]:
# Drop 'deck' from the test rows too; errors='ignore' keeps the cell re-runnable.
test = test.drop(columns=['deck'], errors='ignore')

# Imputation statistics are taken from train only, so nothing about the test
# distribution leaks into preprocessing.
embarked_mode = train['embarked'].mode().iloc[0]
age_mean = train['age'].mean()

# Per-column fill on the frame itself; avoids the chained
# `test['age'].fillna(..., inplace=True)` write.
test = test.fillna({'embarked': embarked_mode, 'age': age_mean})

# Verify the *test* frame (the original cell re-checked train by mistake).
test.isnull().sum()

userid      0
survived    0
age         0
fare        0
pclass      0
sex         0
embarked    0
dtype: int64

## 신규 컬럼 생성

## 인코딩

### 수치형, 범주형 구분

In [None]:
# Numeric columns (plus the target) versus columns treated as categorical.
no_category_cols = ['survived', 'age', 'fare']
category_cols = ['pclass', 'sex', 'embarked']

# .copy() gives independent frames, so later column assignments do not
# raise SettingWithCopyWarning or depend on view/copy semantics.
train_no_category = train[no_category_cols].copy()
train_category = train[category_cols].copy()

train_no_category.shape, train_category.shape

((712, 3), (712, 3))

In [None]:
# Same split for the test rows; there is no 'survived' column here.
no_category_cols = ['age', 'fare']
category_cols = ['pclass', 'sex', 'embarked']

# .copy() gives independent frames, so later column assignments do not
# raise SettingWithCopyWarning or depend on view/copy semantics.
test_no_category = test[no_category_cols].copy()
test_category = test[category_cols].copy()

test_no_category.shape, test_category.shape

((179, 2), (179, 3))

### 데이터 형변환

In [None]:
# Integer codes used for the string-valued categorical columns.
sex_codes = {'male': 1, 'female': 2}
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}

# Rebuild the frame from `train` with .assign() instead of writing into a
# slice: no SettingWithCopyWarning, and re-running the cell gives the same
# result (mapping already-mapped values would otherwise produce NaN).
train_category = train[category_cols].assign(
    sex=lambda frame: frame['sex'].map(sex_codes).astype('category'),
    embarked=lambda frame: frame['embarked'].map(embarked_codes).astype('category'),
    pclass=lambda frame: frame['pclass'].astype('category'),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_category['sex'] = train_category['sex'].map({'male':1, 'female':2}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_category['embarked'] = train_category['embarked'].map({'S':1, 'C':2, 'Q':3}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_cate

In [None]:
# Integer codes used for the string-valued categorical columns
# (same codes as applied to the training rows).
sex_codes = {'male': 1, 'female': 2}
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}

# Rebuild the frame from `test` with .assign() instead of writing into a
# slice: no SettingWithCopyWarning, and re-running the cell gives the same
# result (mapping already-mapped values would otherwise produce NaN).
test_category = test[category_cols].assign(
    sex=lambda frame: frame['sex'].map(sex_codes).astype('category'),
    embarked=lambda frame: frame['embarked'].map(embarked_codes).astype('category'),
    pclass=lambda frame: frame['pclass'].astype('category'),
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_category['sex'] = test_category['sex'].map({'male':1, 'female':2}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_category['embarked'] = test_category['embarked'].map({'S':1, 'C':2, 'Q':3}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_category[

### 범주형 데이터 인코딩

In [None]:
# One-hot encode each categorical column separately and collect the pieces;
# the frames are concatenated once after the loop instead of being grown
# inside it from an empty DataFrame.
train_parts = []
test_parts = []

for col in ['pclass', 'sex', 'embarked']:
    ####### train #########
    # 2-1. Build the encoded frame for this column (the "red" data frame).
    # Create the encoder instance
    _encoder = ce.OneHotEncoder(use_cat_names=True)
    # Fit on the training rows only
    _encoder.fit(train_category[col])
    # Apply to train; the pieces (the "blue" + "red" frames) are merged in 2-2 below
    train_parts.append(_encoder.transform(train_category[col]))

    ####### test #########
    # 2-1. Build the encoded frame for this column (the "red" data frame).
    # Apply the encoder fitted on train; it is never refitted on test
    test_parts.append(_encoder.transform(test_category[col]))

# 2-2. Merge the per-column frames side by side.
_train_encoded = pd.concat(train_parts, axis=1)
_test_encoded = pd.concat(test_parts, axis=1)


### 수치형, 인코딩된 범주형 합치기

In [None]:
# Numeric columns (including the target) next to the one-hot columns;
# rows are matched on the shared index.
train_encoded = pd.concat(
    objs=[train_no_category, _train_encoded],
    axis='columns',
)
train_encoded.shape

(712, 11)

In [None]:
# Numeric columns next to the one-hot columns for the test rows;
# rows are matched on the shared index.
test_encoded = pd.concat(
    objs=[test_no_category, _test_encoded],
    axis='columns',
)
test_encoded.shape

(179, 10)

# 모델링

In [None]:
# Total number of missing cells in each encoded frame: (train, test).
tuple(frame.isna().sum().sum() for frame in (train_encoded, test_encoded))

(0, 0)

In [None]:
# List the columns of the encoded training frame (target, numeric, one-hot).
train_encoded.columns

Index(['survived', 'age', 'fare', 'pclass_1.0', 'pclass_2.0', 'pclass_3.0',
       'sex_1.0', 'sex_2.0', 'embarked_1.0', 'embarked_2.0', 'embarked_3.0'],
      dtype='object')

In [None]:
# Model inputs: everything except the label column.
train_features = train_encoded.drop(labels='survived', axis='columns')
# Label the classifier is fitted against.
train_target = train_encoded.loc[:, 'survived']

train_features.shape, train_target.shape

((712, 10), (712,))

## 모델

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Reuse the notebook-wide SEED (defined with the train/test split) instead of
# repeating the literal 42, so one constant controls all randomness.
model = DecisionTreeClassifier(random_state=SEED)
model

## 학습

In [None]:
# Fit the tree on the encoded training features and the 'survived' label.
model.fit(X=train_features, y=train_target)

In [None]:
# Accuracy on the same rows the model was fitted on, so this is a
# training score, not an estimate of performance on unseen data.
model.score(X=train_features, y=train_target)

0.9789325842696629

## 예측

In [None]:
# Feed the test columns in exactly the order used for fitting; a missing
# column fails here with a clear KeyError.
test_features = test_encoded[train_features.columns]

# Look up the probability column of class 1 ('survived') from the fitted
# model instead of assuming it sits at position 1.
positive_idx = list(model.classes_).index(1)
target_pred = model.predict_proba(test_features)[:, positive_idx]
test_encoded.shape, target_pred.shape

((179, 10), (179,))

# Submission 파일 생성

In [None]:
# Attach predictions by index label rather than by position, so each score
# lands on the user it was computed for even if row order ever differs.
# .assign() returns a new frame, avoiding a write into a slice.
submission = submission.assign(
    survived=pd.Series(target_pred, index=test_encoded.index)
)
submission.head()

Unnamed: 0,userid,survived
709,709,0.0
439,439,1.0
840,840,1.0
720,720,1.0
39,39,1.0


In [None]:
# Write userid and the predicted score to CSV with a header row and no index column.
submission.to_csv(
    path_or_buf="submission_v1.csv",
    index=False,
    header=True,
)