# 모듈 임포트 & 설치

In [1]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.2-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.8/81.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.2


In [2]:
import numpy as np
import pandas as pd

import category_encoders as ce

import seaborn as sns

# 데이터 로드

In [73]:
df = sns.load_dataset('titanic')
df = df.reset_index()
df.rename(columns={'index':'userid'}, inplace=True)
df = df[['userid','survived', 'age', 'fare','pclass', 'sex', 'embarked','deck']]
df.shape

(891, 8)

In [74]:
df.head()

Unnamed: 0,userid,survived,age,fare,pclass,sex,embarked,deck
0,0,0,22.0,7.25,3,male,S,
1,1,1,38.0,71.2833,1,female,C,C
2,2,1,26.0,7.925,3,female,S,
3,3,1,35.0,53.1,1,female,S,C
4,4,0,35.0,8.05,3,male,S,


## 데이터 분리

In [75]:
from sklearn.model_selection import train_test_split
SEED=42

train, test = train_test_split(df, random_state=SEED, test_size=0.2)
submission = test[["userid","survived"]]
submission["survived"] = 0.5
test = test.drop(columns=["survived"])
train.shape, test.shape, submission.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission["survived"] = 0.5


((712, 8), (179, 7), (179, 2))

In [76]:
train.head(3)

Unnamed: 0,userid,survived,age,fare,pclass,sex,embarked,deck
331,331,0,45.5,28.5,1,male,S,C
733,733,0,23.0,13.0,2,male,S,
382,382,0,32.0,7.925,3,male,S,


In [77]:
test.head(3)

Unnamed: 0,userid,age,fare,pclass,sex,embarked,deck
709,709,,15.2458,3,male,C,
439,439,31.0,10.5,2,male,S,
840,840,20.0,7.925,3,male,S,


In [78]:
submission.head()

Unnamed: 0,userid,survived
709,709,0.5
439,439,0.5
840,840,0.5
720,720,0.5
39,39,0.5


# 데이터 전처리

## EDA

## 결측치 제거

In [79]:
train.drop(columns=['deck'], inplace=True)
embarked_mode = train['embarked'].mode().values[0]
train['embarked'] = train['embarked'].fillna(embarked_mode)
train['age'].fillna(train['age'].mean(), inplace=True)

train.isnull().sum()

userid      0
survived    0
age         0
fare        0
pclass      0
sex         0
embarked    0
dtype: int64

In [80]:
test.drop(columns=['deck'], inplace=True)
embarked_mode = train['embarked'].mode().values[0] # train 데이터 사용

test['embarked'] = test['embarked'].fillna(embarked_mode)
test['age'].fillna(train['age'].mean(), inplace=True)

train.isnull().sum()

userid      0
survived    0
age         0
fare        0
pclass      0
sex         0
embarked    0
dtype: int64

## 신규 컬럼 생성

## 인코딩

### 수치형, 범주형 구분

In [81]:
no_category_cols = ['survived', 'age', 'fare']
category_cols = ['pclass', 'sex', 'embarked']

train_no_category = train[no_category_cols]
train_category = train[category_cols]

train_no_category.shape, train_category.shape

((712, 3), (712, 3))

In [82]:
no_category_cols = ['age', 'fare']
category_cols = ['pclass', 'sex', 'embarked']

test_no_category = test[no_category_cols]
test_category = test[category_cols]

test_no_category.shape, test_category.shape

((179, 2), (179, 3))

### 데이터 형변환

In [83]:
train_category['sex'] = train_category['sex'].map({'male':1, 'female':2}).astype('category')
train_category['embarked'] = train_category['embarked'].map({'S':1, 'C':2, 'Q':3}).astype('category')
train_category['pclass'] = train_category['pclass'].astype('category')

# train_category.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_category['sex'] = train_category['sex'].map({'male':1, 'female':2}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_category['embarked'] = train_category['embarked'].map({'S':1, 'C':2, 'Q':3}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_cate

In [84]:
test_category['sex'] = test_category['sex'].map({'male':1, 'female':2}).astype('category')
test_category['embarked'] = test_category['embarked'].map({'S':1, 'C':2, 'Q':3}).astype('category')
test_category['pclass'] = test_category['pclass'].astype('category')

# test_category.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_category['sex'] = test_category['sex'].map({'male':1, 'female':2}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_category['embarked'] = test_category['embarked'].map({'S':1, 'C':2, 'Q':3}).astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_category[

### 범주형 데이터 인코딩

In [85]:
_train_encoded = pd.DataFrame()
_test_encoded = pd.DataFrame()

for col in ['pclass', 'sex', 'embarked']:
####### train #########
# 2-1. 빨간색 데이터 프레임을 생성한다.
  # 인스턴스
  _encoder = ce.OneHotEncoder(use_cat_names=True)
  # 학습
  _encoder.fit(train_category[col])
  # 반영
  _encoded = _encoder.transform(train_category[col])
# 2-2. 파란색 데이터 프레임과 빨간색 데이터 프레임을 합친다.
  _train_encoded = pd.concat([_train_encoded, _encoded], axis=1)

####### test #########
# 2-1. 빨간색 데이터 프레임을 생성한다.
  # 반영
  _encoded = _encoder.transform(test_category[col])
# 2-2. 파란색 데이터 프레임과 빨간색 데이터 프레임을 합친다.
  _test_encoded = pd.concat([_test_encoded, _encoded], axis=1)


In [86]:
_train_encoded = _train_encoded.reset_index(drop=True)
_test_encoded = _test_encoded.reset_index(drop=True)

## 스케일링

In [87]:
scaling_cols = ['age', 'fare']

train_scaling = train_no_category[scaling_cols]
test_scaling = test_no_category[scaling_cols]

print(f'{train_scaling.shape} / {test_scaling.shape}')
train_scaling.head(3)

(712, 2) / (179, 2)


Unnamed: 0,age,fare
331,45.5,28.5
733,23.0,13.0
382,32.0,7.925


In [88]:
from sklearn.preprocessing import StandardScaler

In [89]:
# 인스턴스 생성
std = StandardScaler()

In [90]:
# 학습
std.fit(train_scaling)

In [91]:
# 반영
X_train_scaled = std.transform(train_scaling)
X_test_scaled = std.transform(test_scaling)

In [92]:
X_train_scaled.shape, X_test_scaled.shape

((712, 2), (179, 2))

In [93]:
train_scaling.shape, test_scaling.shape

((712, 2), (179, 2))

In [94]:
train_scaling = train_scaling.reset_index(drop=True)
test_scaling = test_scaling.reset_index(drop=True)

## 수치형(스케일링), 범주형(인코딩) 합치기

In [95]:
X_train_scaled.shape, _train_encoded.shape

((712, 2), (712, 8))

In [96]:
_X_train_scaled = pd.DataFrame(X_train_scaled, columns=scaling_cols)
_X_train_scaled.head(2)

Unnamed: 0,age,fare
0,1.232263,-0.078684
1,-0.500482,-0.377145


In [97]:
_X_test_scaled = pd.DataFrame(X_test_scaled, columns=scaling_cols)
_X_test_scaled.head(2)

Unnamed: 0,age,fare
0,0.0,-0.333901
1,0.115605,-0.425284


In [98]:
train_concat = pd.concat([_X_train_scaled, _train_encoded], axis=1)
train_concat.shape

(712, 10)

In [99]:
test_concat = pd.concat([_X_test_scaled, _test_encoded], axis=1)
test_concat.shape

(179, 10)

In [100]:
train_concat.head()

Unnamed: 0,age,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_2.0,embarked_1.0,embarked_2.0,embarked_3.0
0,1.232263,-0.078684,1,0,0,1,0,1,0,0
1,-0.500482,-0.377145,0,1,0,1,0,1,0,0
2,0.192616,-0.474867,0,0,1,1,0,1,0,0
3,-0.269449,-0.47623,0,0,1,1,0,1,0,0
4,-1.809667,-0.025249,0,0,1,0,1,1,0,0


# 모델링

In [101]:
train_concat.isnull().sum().sum(), test_concat.isnull().sum().sum()

(0, 0)

In [102]:
train_concat.columns

Index(['age', 'fare', 'pclass_1.0', 'pclass_2.0', 'pclass_3.0', 'sex_1.0',
       'sex_2.0', 'embarked_1.0', 'embarked_2.0', 'embarked_3.0'],
      dtype='object')

In [103]:
train_target = train['survived']
train_features = train_concat

train_features.shape, train_target.shape

((712, 10), (712,))

## 모델

In [104]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)
model

## 학습

In [105]:
model.fit(train_features, train_target)

In [106]:
model.score(train_features, train_target)

0.9789325842696629

## 예측

In [107]:
target_pred = model.predict_proba(test_concat)[:,1]
test_concat.shape, target_pred.shape

((179, 10), (179,))

# Submission 파일 생성

In [108]:
submission['survived'] = target_pred
submission.head()

Unnamed: 0,userid,survived
709,709,0.0
439,439,1.0
840,840,1.0
720,720,1.0
39,39,1.0


In [109]:
submission.to_csv("submission_v1.csv", header=True, index=False)