# Connect to google account

In [None]:
# Mount Google Drive at /content/data so the CSV paths configured below resolve.
from google.colab import drive
drive.mount('/content/data')

Drive already mounted at /content/data; to attempt to forcibly remount, call drive.mount("/content/data", force_remount=True).


In [None]:
# Prevent broken Korean glyphs in Colab: install the Nanum fonts, refresh the
# fontconfig cache and clear matplotlib's cache.
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# Libraries used for data visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Render figures directly in the notebook
%matplotlib inline

# Use retina-resolution figure output
%config InlineBackend.figure_format = 'retina'

# Korean font setting for Colab
plt.rc('font', family='NanumBarunGothic')

# Minus-sign setting for Unicode fonts
mpl.rc('axes', unicode_minus=False)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/

# Global Variables

In [None]:
import os
import numpy as np
import random
import torch

def reset_seeds(seed=500):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators (default 500).
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the
    # running kernel was fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    torch.cuda.manual_seed_all(seed)  # every GPU; silently ignored without CUDA
    torch.backends.cudnn.deterministic = True  # use deterministic cuDNN algorithms
    # Autotuning can select different kernels between runs, defeating determinism.
    torch.backends.cudnn.benchmark = False

In [None]:
import easydict
# Notebook-wide configuration container with attribute-style access.
args = easydict.EasyDict()

# Input locations on the mounted drive
args.default_path = '/content/data/MyDrive/AI_study/1. Machine Learning/3. Supervised Learning/original data/'
args.train_csv = f'{args.default_path}train.csv'
args.test_csv = f'{args.default_path}test.csv'
args.default_submission_csv = f'{args.default_path}submission.csv'

# Output locations
args.submission_csv = f'{args.default_path}result/submission_0826.csv'
args.save_results = f'{args.default_path}result/model_results.json'

# Variables used during the analysis
# args.random_state = 500 # fixed by the seed cell above
# One metrics dict is appended here per trained model.
args.results = []

# Load Titanic

- Survived: 0=사망, 1=생존
- Pclass: 1=1등석, 2=2등석, 3=3등석
- gender: male=남성, female=여성
- Age: 나이
- SibSp: 타이타닉 호에 동승한 형제자매/배우자의 수
- Parch: 타이타닉 호에 동승한 부모/자식의 수
- Ticket: 티켓 번호
- Fare: 승객 요금
- Cabin: 방 호수
- Embarked: 탑승지; C=셰르부르, Q=퀸스타운, S=사우샘프턴

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Apply the FiveThirtyEight plot style and switch matplotlib to interactive mode.
plt.style.use('fivethirtyeight')
plt.ion()

# Silence all Python warnings from here on.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the labelled training file and the unlabelled test file.
ori_train, ori_test = (
    pd.read_csv(csv_path) for csv_path in (args.train_csv, args.test_csv)
)

ori_train.shape, ori_test.shape

((916, 12), (393, 11))

In [None]:
# Shape of the sample submission file.
pd.read_csv(args.default_submission_csv).shape

(393, 2)

In [None]:
# Column names of the training data.
ori_train.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [None]:
# Preview the first rows of the training data.
ori_train.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [None]:
# Compare the number of distinct passenger ids with the number of rows.
ori_train['passengerid'].nunique(), len(ori_train)

(916, 916)

In [None]:
# Remove the per-row identifier from the training frame.
# Re-assigning with errors='ignore' keeps the cell safe to re-run; the in-place
# drop raised KeyError on a second execution.
ori_train = ori_train.drop(columns=['passengerid'], errors='ignore')
ori_train.head()

Unnamed: 0,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [None]:
# Use passengerid as the index of the test frame.
# The guard keeps the cell re-runnable: once the column has become the index,
# calling set_index again would raise KeyError.
if 'passengerid' in ori_test.columns:
    ori_test = ori_test.set_index(['passengerid'])
print(f'{ori_test.shape}')
ori_test.head()

(393, 10)


Unnamed: 0_level_0,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
916,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
917,2,"Pinsky, Mrs. (Rosa)",female,32.0,0,0,234604,13.0,,S
918,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q
919,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S
920,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S


# train_test_split

In [None]:
# Labelled categorical copy of the target, used to inspect the class balance.
new_survived = pd.Categorical(ori_train["survived"]).rename_categories(["Died", "Survived"])

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,570,0.622271
Survived,346,0.377729


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the feature columns from the target column.
X = ori_train.drop(columns=['survived'])
y = ori_train['survived']

In [None]:
# Re-seed right before splitting: no random_state is passed to the splitter.
reset_seeds()
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,  # keep the survived/died ratio equal in both splits
)

X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((641, 10), (275, 10), (641,), (275,))

# Base ModelV0

In [None]:
# Work on copies so the split frames stay untouched for later experiments.
train, test, ori_te = X_tr.copy(), X_te.copy(), ori_test.copy()

train.shape, test.shape, ori_te.shape

((641, 10), (275, 10), (393, 10))

## Data Preprocessing

In [None]:
# Column dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 641 entries, 582 to 286
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   name      641 non-null    object 
 2   gender    641 non-null    object 
 3   age       520 non-null    float64
 4   sibsp     641 non-null    int64  
 5   parch     641 non-null    int64  
 6   ticket    641 non-null    object 
 7   fare      641 non-null    float64
 8   cabin     153 non-null    object 
 9   embarked  640 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 55.1+ KB


drop columns

In [None]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

# Re-assignment with errors='ignore' makes the cell idempotent; the in-place
# variant raises KeyError when the cell is executed a second time.
train = train.drop(columns=drop_cols, errors='ignore')    # data the model learns from
test = test.drop(columns=drop_cols, errors='ignore')      # hold-out data used to evaluate the model
ori_te = ori_te.drop(columns=drop_cols, errors='ignore')  # unlabelled data the trained model predicts

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (641, 10) / (275, 10)
after: (641, 7) / (275, 7)
<class 'pandas.core.frame.DataFrame'>
Index: 641 entries, 582 to 286
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   gender    641 non-null    object 
 2   age       520 non-null    float64
 3   sibsp     641 non-null    int64  
 4   parch     641 non-null    int64  
 5   fare      641 non-null    float64
 6   embarked  640 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 40.1+ KB


missing value

In [None]:
train.isna().sum()  # missing values per column

Unnamed: 0,0
pclass,0
gender,0
age,121
sibsp,0
parch,0
fare,0
embarked,1


In [None]:
test.isna().sum()  # missing values per column

Unnamed: 0,0
pclass,0
gender,0
age,59
sibsp,0
parch,0
fare,0
embarked,0


In [None]:
ori_te.isna().sum()  # missing values per column

Unnamed: 0,0
pclass,0
gender,0
age,83
sibsp,0
parch,0
fare,1
embarked,1


In [None]:
# Imputation statistics are computed from the training split only.
age_median, fare_median = train['age'].median(), train['fare'].median()
embarked_mode = train['embarked'].mode().iloc[0]

age_median, fare_median, embarked_mode

(27.0, 14.5, 'S')

In [None]:
# Fill missing values in every frame with the training-split statistics.
# A per-column mapping passed to DataFrame.fillna replaces the chained
# `df[col].fillna(..., inplace=True)` calls: those operate on a temporary
# object and no longer update the frame once pandas Copy-on-Write is active.
fill_values = {'age': age_median, 'fare': fare_median, 'embarked': embarked_mode}

train = train.fillna(fill_values)
test = test.fillna(fill_values)
ori_te = ori_te.fillna(fill_values)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

(0, 0, 0)

data encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Columns remaining after dropping and imputing.
train.columns

Index(['pclass', 'gender', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')

In [None]:
enc_cols = ['gender', 'embarked']
# Keep the frame's own column order. A set difference yields an arbitrary
# order that can differ between kernel sessions, changing the feature order
# the model is trained on.
normal_cols = [col for col in train.columns if col not in enc_cols]
normal_cols

['parch', 'pclass', 'fare', 'age', 'sibsp']

In [None]:
def _one_hot_frame(frame, encoder, cat_cols, passthrough_cols):
    """Return `passthrough_cols` of `frame` followed by the one-hot columns of `cat_cols`.

    `encoder` must already be fitted. The result carries a fresh 0..n-1 index.
    """
    onehot = pd.DataFrame(
        encoder.transform(frame[cat_cols]).toarray(),
        columns=encoder.get_feature_names_out(),
    )
    return pd.concat([frame[passthrough_cols].reset_index(drop=True), onehot], axis=1)


print(f'before: {train.shape} / {test.shape}')

# Fit the category vocabulary on the training split only, then reuse it for
# the hold-out split and the unlabelled test data.
enc = OneHotEncoder()
enc.fit(train[enc_cols])

enc_tr = _one_hot_frame(train, enc, enc_cols, normal_cols)
enc_te = _one_hot_frame(test, enc, enc_cols, normal_cols)
enc_ori_te = _one_hot_frame(ori_te, enc, enc_cols, normal_cols)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (641, 7) / (275, 7)
after: (641, 10) / (275, 10)


Unnamed: 0,parch,pclass,fare,age,sibsp,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,0,3,7.925,32.0,0,0.0,1.0,0.0,0.0,1.0
1,0,2,11.5,21.0,0,0.0,1.0,0.0,0.0,1.0
2,0,3,7.775,27.0,0,0.0,1.0,0.0,0.0,1.0
3,0,3,7.8958,19.0,0,0.0,1.0,0.0,0.0,1.0
4,0,3,7.775,74.0,0,0.0,1.0,0.0,0.0,1.0


## Training

In [None]:
# Total number of missing cells in each encoded frame.
enc_tr.isna().sum().sum(), enc_te.isna().sum().sum(), enc_ori_te.isna().sum().sum()

(0, 0, 0)

In [None]:
# Shapes of the encoded frames.
enc_tr.shape, enc_te.shape, enc_ori_te.shape

((641, 10), (275, 10), (393, 10))

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
reset_seeds()  # re-seed before fitting; the tree is created without a random_state
print(f'{enc_tr.shape} / {y_tr.shape}')

modelV0 = DecisionTreeClassifier()
modelV0.fit(enc_tr, y_tr)

(641, 10) / (641,)


## Evaluation

In [None]:
# Score on the training split and on the hold-out split.
score_tr, score_te = modelV0.score(enc_tr, y_tr), modelV0.score(enc_te, y_te)

score_tr, score_te

(0.984399375975039, 0.8145454545454546)

In [None]:
from sklearn.metrics import roc_curve, auc

# Express the prediction as a probability (second predict_proba column)
y_pred = modelV0.predict_proba(enc_te)[:,1]
fpr, tpr, thresholds = roc_curve(y_te,y_pred)

# Area under the ROC curve on the hold-out split
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.7960807467386415


In [None]:
# Predicted probabilities for the unlabelled test data, from the model fitted
# in this section. The cell previously used `modelV3`, which is only created
# in a later section and is therefore undefined on a fresh kernel.
ori_te_pred = modelV0.predict_proba(enc_ori_te)[:,1]
ori_te_pred.shape

(393,)

In [None]:
# Feature importances of the model fitted in this section (was `modelV3`,
# which is not defined until a later section).
modelV0.feature_importances_

array([0.02581964, 0.03925754, 0.20688132, 0.19178675, 0.02999791,
       0.46547571, 0.        , 0.01893159, 0.00509144, 0.01675809])

In [None]:
# Importance the fitted tree assigns to each feature, highest first.
# Built from modelV0, the model fitted in this section; `modelV3` is not
# defined until a later section.
df_feature_importances = (
    pd.DataFrame(modelV0.feature_importances_, enc_tr.columns)
    .sort_values(by=[0], ascending=False)
    .reset_index()
)

print(f'{df_feature_importances.shape}')
df_feature_importances

(10, 2)


Unnamed: 0,index,0
0,gender_female,0.465476
1,fare,0.206881
2,age,0.191787
3,pclass,0.039258
4,sibsp,0.029998
5,parch,0.02582
6,embarked_C,0.018932
7,embarked_S,0.016758
8,embarked_Q,0.005091
9,gender_male,0.0


In [None]:
# Record this experiment. The label is 'modelV0' because that is the model
# evaluated in this section, and the feature count comes from the encoded
# frame the model was trained on rather than from the raw split.
n_features = enc_tr.shape[1]
args.results.append(
    {
        'model': 'modelV0',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': n_features,
        # Key spelling kept as-is: it becomes a column name in the results table.
        'feaute_importances': list(df_feature_importances['index'].values[:n_features]),
        'create_dt': '0217'
    }
)

args.results

[{'model': 'modelV0',
  'score_tr': 0.984399375975039,
  'score_te': 0.8145454545454546,
  'auc_te': 0.7960807467386415,
  'ori_te_pred': array([0.        , 1.        , 0.875     , 0.        , 1.        ,
         1.        , 0.        , 0.14285714, 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.14285714,
         0.        , 0.        , 0.        , 0.        , 0.14285714,
         0.33333333, 0.        , 1.        , 0.        , 1.        ,
         1.        , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 1.        , 0. 

# Base ModelV0 -> age -> median에서 mean으로 통합 = 정확도 떨어짐

In [None]:
# Start this experiment from fresh copies of the split data.
train, test, ori_te = X_tr.copy(), X_te.copy(), ori_test.copy()

train.shape, test.shape, ori_te.shape

((641, 10), (275, 10), (393, 10))

## Data Preprocessing

In [None]:
# Column dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 641 entries, 582 to 286
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   name      641 non-null    object 
 2   gender    641 non-null    object 
 3   age       520 non-null    float64
 4   sibsp     641 non-null    int64  
 5   parch     641 non-null    int64  
 6   ticket    641 non-null    object 
 7   fare      641 non-null    float64
 8   cabin     153 non-null    object 
 9   embarked  640 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 55.1+ KB


drop columns

In [None]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

# Re-assignment with errors='ignore' makes the cell idempotent; the in-place
# variant raises KeyError when the cell is executed a second time.
train = train.drop(columns=drop_cols, errors='ignore')    # data the model learns from
test = test.drop(columns=drop_cols, errors='ignore')      # hold-out data used to evaluate the model
ori_te = ori_te.drop(columns=drop_cols, errors='ignore')  # unlabelled data the trained model predicts

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (641, 10) / (275, 10)
after: (641, 7) / (275, 7)
<class 'pandas.core.frame.DataFrame'>
Index: 641 entries, 582 to 286
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   gender    641 non-null    object 
 2   age       520 non-null    float64
 3   sibsp     641 non-null    int64  
 4   parch     641 non-null    int64  
 5   fare      641 non-null    float64
 6   embarked  640 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 40.1+ KB


missing value

In [None]:
train.isna().sum()  # missing values per column

Unnamed: 0,0
pclass,0
gender,0
age,121
sibsp,0
parch,0
fare,0
embarked,1


In [None]:
test.isna().sum()  # missing values per column

Unnamed: 0,0
pclass,0
gender,0
age,59
sibsp,0
parch,0
fare,0
embarked,0


In [None]:
ori_te.isna().sum()  # missing values per column

Unnamed: 0,0
pclass,0
gender,0
age,83
sibsp,0
parch,0
fare,1
embarked,1


In [None]:
# Imputation statistics from the training split; age uses the mean in this variant.
age_mean, fare_median = train['age'].mean(), train['fare'].median()
embarked_mode = train['embarked'].mode().iloc[0]

age_mean, fare_median, embarked_mode

(29.335423076923078, 14.5, 'S')

In [None]:
# Fill missing values in every frame with the training-split statistics.
# A per-column mapping passed to DataFrame.fillna replaces the chained
# `df[col].fillna(..., inplace=True)` calls: those operate on a temporary
# object and no longer update the frame once pandas Copy-on-Write is active.
fill_values = {'age': age_mean, 'fare': fare_median, 'embarked': embarked_mode}

train = train.fillna(fill_values)
test = test.fillna(fill_values)
ori_te = ori_te.fillna(fill_values)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

(0, 0, 0)

data encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Columns remaining after dropping and imputing.
train.columns

Index(['pclass', 'gender', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')

In [None]:
enc_cols = ['gender', 'embarked']
# Keep the frame's own column order. A set difference yields an arbitrary
# order that can differ between kernel sessions, changing the feature order
# the model is trained on.
normal_cols = [col for col in train.columns if col not in enc_cols]
normal_cols

['parch', 'pclass', 'fare', 'age', 'sibsp']

In [None]:
def _one_hot_frame(frame, encoder, cat_cols, passthrough_cols):
    """Return `passthrough_cols` of `frame` followed by the one-hot columns of `cat_cols`.

    `encoder` must already be fitted. The result carries a fresh 0..n-1 index.
    """
    onehot = pd.DataFrame(
        encoder.transform(frame[cat_cols]).toarray(),
        columns=encoder.get_feature_names_out(),
    )
    return pd.concat([frame[passthrough_cols].reset_index(drop=True), onehot], axis=1)


print(f'before: {train.shape} / {test.shape}')

# Fit the category vocabulary on the training split only, then reuse it for
# the hold-out split and the unlabelled test data.
enc = OneHotEncoder()
enc.fit(train[enc_cols])

enc_tr = _one_hot_frame(train, enc, enc_cols, normal_cols)
enc_te = _one_hot_frame(test, enc, enc_cols, normal_cols)
enc_ori_te = _one_hot_frame(ori_te, enc, enc_cols, normal_cols)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (641, 7) / (275, 7)
after: (641, 10) / (275, 10)


Unnamed: 0,parch,pclass,fare,age,sibsp,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,0,3,7.925,32.0,0,0.0,1.0,0.0,0.0,1.0
1,0,2,11.5,21.0,0,0.0,1.0,0.0,0.0,1.0
2,0,3,7.775,29.335423,0,0.0,1.0,0.0,0.0,1.0
3,0,3,7.8958,19.0,0,0.0,1.0,0.0,0.0,1.0
4,0,3,7.775,74.0,0,0.0,1.0,0.0,0.0,1.0


## Training

In [None]:
# Total number of missing cells in each encoded frame.
enc_tr.isna().sum().sum(), enc_te.isna().sum().sum(), enc_ori_te.isna().sum().sum()

(0, 0, 0)

In [None]:
# Shapes of the encoded frames.
enc_tr.shape, enc_te.shape, enc_ori_te.shape

((641, 10), (275, 10), (393, 10))

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
reset_seeds()  # re-seed before fitting; the tree is created without a random_state
print(f'{enc_tr.shape} / {y_tr.shape}')

modelV0 = DecisionTreeClassifier()
modelV0.fit(enc_tr, y_tr)

(641, 10) / (641,)


## Evaluation

In [None]:
# Score on the training split and on the hold-out split.
score_tr, score_te = modelV0.score(enc_tr, y_tr), modelV0.score(enc_te, y_te)

score_tr, score_te

(0.984399375975039, 0.7963636363636364)

In [None]:
from sklearn.metrics import roc_curve, auc

# Express the prediction as a probability (second predict_proba column)
y_pred = modelV0.predict_proba(enc_te)[:,1]
fpr, tpr, thresholds = roc_curve(y_te,y_pred)

# Area under the ROC curve on the hold-out split
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.7743477282950967


In [None]:
# Predicted probabilities (second predict_proba column) for the unlabelled test data.
ori_te_pred = modelV0.predict_proba(enc_ori_te)[:,1]
ori_te_pred.shape

(393,)

In [None]:
# Per-feature importances of the fitted tree, in enc_tr column order.
modelV0.feature_importances_

array([0.02943662, 0.04480394, 0.1941452 , 0.20570807, 0.03080676,
       0.46547571, 0.        , 0.01907506, 0.00131681, 0.00923183])

In [None]:
# Weights showing which features the model considers important, highest first.
df_feature_importances = (
    pd.DataFrame(modelV0.feature_importances_, enc_tr.columns)
    .sort_values(by=[0], ascending=False)
    .reset_index()
)

print(f'{df_feature_importances.shape}')
df_feature_importances

(10, 2)


Unnamed: 0,index,0
0,gender_female,0.465476
1,age,0.205708
2,fare,0.194145
3,pclass,0.044804
4,sibsp,0.030807
5,parch,0.029437
6,embarked_C,0.019075
7,embarked_S,0.009232
8,embarked_Q,0.001317
9,gender_male,0.0


In [None]:
# Record this experiment. The feature count comes from the encoded frame the
# model was trained on; the raw split (X_tr) only happens to have the same
# number of columns.
n_features = enc_tr.shape[1]
args.results.append(
    {
        'model': 'modelV0',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': n_features,
        # Key spelling kept as-is: it becomes a column name in the results table.
        'feaute_importances': list(df_feature_importances['index'].values[:n_features]),
        'create_dt': '0217'
    }
)

args.results

[{'model': 'modelV0',
  'score_tr': 0.984399375975039,
  'score_te': 0.8145454545454546,
  'auc_te': 0.7960807467386415,
  'ori_te_pred': array([0.        , 1.        , 0.875     , 0.        , 1.        ,
         1.        , 0.        , 0.14285714, 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.14285714,
         0.        , 0.        , 0.        , 0.        , 0.14285714,
         0.33333333, 0.        , 1.        , 0.        , 1.        ,
         1.        , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 1.        , 0. 

# Base ModelV1 - age mean으로 통합 -> embarked를 one-hot-encoding

## Data Preprocessing

In [None]:
# Start this experiment from fresh copies of the split data. Without this the
# cell works on frames already transformed by the previous section, where
# 'name', 'ticket' and 'cabin' no longer exist, and the drop raises KeyError.
train = X_tr.copy()
test = X_te.copy()
ori_te = ori_test.copy()

print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)
ori_te = ori_te.drop(columns=drop_cols)

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (641, 8) / (275, 8)


KeyError: "['name', 'ticket', 'cabin'] not found in axis"

In [None]:
# # age -> age_random으로 변경

# reset_seeds()

# # train
# train['age_random'] = train['age']

# # random sampling
# random_sampling = (train['age'].dropna().sample(train['age'].isnull().sum()))
# random_sampling.index = train[lambda x: x['age'].isnull()].index # index 부여

# # NA imputation
# train.loc[train['age'].isnull(), 'age_random'] = random_sampling

# # test

# test['age_random'] = test['age']

# # random sampling
# random_sampling = (X_tr['age'].dropna().sample(test['age'].isnull().sum()))
# random_sampling.index = test[lambda x: x['age'].isnull()].index # index 부여

# # NA imputation
# test.loc[test['age'].isnull(), 'age_random'] = random_sampling

# # 확인
# print(test[['age', 'age_random']].isnull().sum())


# # ori_te

# ori_te['age_random'] = ori_te['age']

# # random sampling
# random_sampling = (X_tr['age'].dropna().sample(ori_te['age'].isnull().sum()))
# random_sampling.index = ori_te[lambda x: x['age'].isnull()].index # index 부여

# # NA imputation
# ori_te.loc[ori_te['age'].isnull(), 'age_random'] = random_sampling

# # 확인
# print(ori_te[['age', 'age_random']].isnull().sum())


# # 확인
# train[['age', 'age_random']].isnull().sum()
# print(test.info())
# print(ori_te.info())

In [None]:
# Imputation statistics are computed from the training split only.
age_median, fare_median = train['age'].median(), train['fare'].median()
embarked_mode = train['embarked'].mode().iloc[0]

fare_median, embarked_mode

KeyError: 'age'

In [None]:
# Fill missing values in every frame with the training-split statistics.
# A per-column mapping passed to DataFrame.fillna replaces the chained
# `df[col].fillna(..., inplace=True)` calls: those operate on a temporary
# object and no longer update the frame once pandas Copy-on-Write is active.
fill_values = {'age': age_median, 'fare': fare_median, 'embarked': embarked_mode}

train = train.fillna(fill_values)
test = test.fillna(fill_values)
ori_te = ori_te.fillna(fill_values)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

In [None]:
# Remove the 'age' column from all three frames.
# NOTE(review): the cell that would build 'age_random' is commented out, so no
# age feature remains after this drop, while a later cell still lists
# 'age_random' among the columns to scale — confirm which age feature is intended.
train.drop(['age'], axis=1, inplace=True)
test.drop(['age'], axis=1, inplace=True)
ori_te.drop(['age'], axis=1, inplace=True)

train.head()

In [None]:
# Install the category_encoders package imported in the following cell.
!pip install category_encoders

In [None]:
import category_encoders as ce

encoder = ce.OneHotEncoder(use_cat_names=True) # create the encoder object

encoded_embarked = encoder.fit_transform(train['embarked']) # fit on the training split and transform it
_test_encoded_embarked = encoder.transform(test['embarked'])
_ori_te_encoded_embarked = encoder.transform(ori_te['embarked'])

encoded_embarked.head() # preview the transformed result


In [None]:
# Append the one-hot embarked columns to each frame (column-wise concat).
# NOTE(review): running this cell twice appends the same columns again — it
# presumably should run once per fresh copy of the frames; confirm.
train = pd.concat([train, encoded_embarked], axis=1)
test = pd.concat([test, _test_encoded_embarked], axis=1)
ori_te = pd.concat([ori_te, _ori_te_encoded_embarked], axis=1)

train.head()

In [None]:
# The raw 'embarked' column is replaced by its one-hot columns, so remove it.
for _frame in (train, test, ori_te):
    _frame.drop(columns=['embarked'], inplace=True)

train.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
enc_cols = ['gender', 'embarked']
# Keep the frame's own column order; a set difference yields an arbitrary
# order that can differ between kernel sessions.
# NOTE(review): 'embarked' has already been dropped from `train` by the
# previous cell — confirm that it still belongs in enc_cols.
normal_cols = [col for col in train.columns if col not in enc_cols]
normal_cols

In [None]:
# print(f'before: {train.shape} / {test.shape}')

# enc = OneHotEncoder()
# # train
# tmp_tr = pd.DataFrame(
#     enc.fit_transform(train[enc_cols]).toarray(),
#     columns = enc.get_feature_names_out()
# )
# enc_tr = pd.concat(
#     [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
#     , axis=1
# )
# # test
# tmp_te = pd.DataFrame(
#     enc.transform(test[enc_cols]).toarray(),
#     columns = enc.get_feature_names_out()
# )
# enc_te = pd.concat(
#     [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
#     , axis=1
# )
# # ori_test
# tmp_te = pd.DataFrame(
#     enc.transform(ori_te[enc_cols]).toarray(),
#     columns = enc.get_feature_names_out()
# )
# enc_ori_te = pd.concat(
#     [ori_te[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
#     , axis=1
# )

# print(f'after: {enc_tr.shape} / {enc_te.shape}')

Scaler

In [None]:
# Total number of missing cells in each encoded frame.
enc_tr.isna().sum().sum(), enc_te.isna().sum().sum(), enc_ori_te.isna().sum().sum()

In [None]:
# Columns available for scaling.
enc_tr.columns

In [None]:
# NOTE(review): 'age_random' is only created by a cell that is currently
# commented out — confirm the intended age feature before running.
scaling_cols = ['age_random', 'fare']
# Keep enc_tr's own column order; a set difference yields an arbitrary order
# that can differ between kernel sessions.
not_scaling_cols = [col for col in enc_tr.columns if col not in scaling_cols]
not_scaling_cols

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the scaling statistics on the training split only, then apply them to
# every frame.
std = StandardScaler()
std.fit(enc_tr[scaling_cols])

_scaled_tr, _scaled_te, _scaled_ori_te = (
    std.transform(frame[scaling_cols]) for frame in (enc_tr, enc_te, enc_ori_te)
)

In [None]:
def _with_scaled(frame, scaled_values, scaled_cols, passthrough_cols):
    """Return `passthrough_cols` of `frame` followed by the scaled columns.

    `scaled_values` is the array produced by the scaler for `scaled_cols`.
    The result carries a fresh 0..n-1 index.
    """
    scaled = pd.DataFrame(scaled_values, columns=scaled_cols)
    return pd.concat([frame[passthrough_cols].reset_index(drop=True), scaled], axis=1)


print(f'before: {enc_tr.shape} / {enc_te.shape}')

scaled_tr = _with_scaled(enc_tr, _scaled_tr, scaling_cols, not_scaling_cols)
scaled_te = _with_scaled(enc_te, _scaled_te, scaling_cols, not_scaling_cols)
scaled_ori_te = _with_scaled(enc_ori_te, _scaled_ori_te, scaling_cols, not_scaling_cols)

print(f'after: {scaled_tr.shape} / {scaled_te.shape}')
scaled_tr.head()

## Training

In [None]:
# Total number of missing cells in each scaled frame.
scaled_tr.isna().sum().sum(), scaled_te.isna().sum().sum(), scaled_ori_te.isna().sum().sum()

(0, 0, 0)

In [None]:
# Shapes of the scaled frames.
scaled_tr.shape, scaled_te.shape, scaled_ori_te.shape

((641, 10), (275, 10), (393, 10))

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Re-seed, then fit a decision tree on the scaled training features.
# NOTE(review): this section is titled "ModelV1" but the variable is named
# modelV3 — confirm the intended name.
reset_seeds()
modelV3 = DecisionTreeClassifier()

modelV3.fit(scaled_tr, y_tr)

## Evaluation

In [None]:
# Score on the training split and on the hold-out split.
score_tr, score_te = modelV3.score(scaled_tr, y_tr), modelV3.score(scaled_te, y_te)

score_tr, score_te

(0.9921996879875195, 0.8072727272727273)

In [None]:
from sklearn.metrics import roc_curve, auc

# Hold-out probabilities (second predict_proba column) and the area under their ROC curve.
y_pred = modelV3.predict_proba(scaled_te)[:,1]
fpr, tpr, thresholds = roc_curve(y_te,y_pred)
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.793325461088619


In [None]:
# Predicted probabilities (second predict_proba column) for the unlabelled test data.
ori_te_pred = modelV3.predict_proba(scaled_ori_te)[:,1]
ori_te_pred.shape

(393,)

In [None]:
# Feature importances must come from the model that was fitted on scaled_tr
# in this section (modelV3). `modelV1` is not defined anywhere in the
# notebook, so the cell failed on a fresh kernel.
df_feature_importances = (
    pd.DataFrame(modelV3.feature_importances_, scaled_tr.columns)
    .sort_values(by=[0], ascending=False)
    .reset_index()
)
print(f'{df_feature_importances.shape}')

(10, 2)


In [None]:
# Record this experiment. The feature count comes from the scaled frame the
# model was trained on rather than from the raw split (X_tr).
n_features = scaled_tr.shape[1]
args.results.append(
    {
        'model': 'modelV3',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': n_features,
        # Key spelling kept as-is: it becomes a column name in the results table.
        'feaute_importances': list(df_feature_importances['index'].values[:n_features]),
        'create_dt': '0217'
    }
)

len(args.results)

5

# Submission

In [None]:
# Rank every recorded experiment by hold-out AUC, best first.
df_results = pd.DataFrame(args.results).sort_values('auc_te', ascending=False)
df_results

Unnamed: 0,model,score_tr,score_te,auc_te,ori_te_pred,len_features,feaute_importances,create_dt
0,modelV0,0.984399,0.814545,0.796081,"[0.0, 1.0, 0.875, 0.0, 1.0, 1.0, 0.0, 0.142857...",10,"[gender_female, fare, age, pclass, sibsp, parc...",217
3,modelV1,0.9922,0.807273,0.793325,"[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",10,"[gender_male, fare, age_random, pclass, sibsp,...",217
4,modelV1,0.9922,0.807273,0.793325,"[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",10,"[gender_male, fare, age_random, pclass, sibsp,...",217
1,modelV1,0.984399,0.803636,0.78312,"[0.0, 1.0, 0.875, 0.0, 1.0, 1.0, 0.0, 0.142857...",10,"[gender_male, age, fare, pclass, parch, sibsp,...",217
2,modelV2,0.984399,0.803636,0.78312,"[0.0, 1.0, 0.875, 0.0, 1.0, 1.0, 0.0, 0.142857...",10,"[gender_male, age, fare, pclass, parch, sibsp,...",217
5,modelV0,0.984399,0.796364,0.774348,"[0.0, 1.0, 0.875, 0.0, 1.0, 1.0, 0.0, 0.142857...",10,"[gender_female, age, fare, pclass, sibsp, parc...",217


In [None]:
# Load the sample submission file as the template for the output.
submission = pd.read_csv(args.default_submission_csv)
submission.head()

Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [None]:
# df_results is sorted by hold-out AUC (best first) but keeps its original
# index labels, so the best run must be taken by position. `.loc[0]` selects
# the first run ever appended, whatever its score.
submission['survived'] = df_results.iloc[0]['ori_te_pred']

print(f'{submission.isnull().sum().sum()}')
submission.head(10)

0


Unnamed: 0,passengerid,survived
0,916,0.0
1,917,1.0
2,918,0.875
3,919,0.0
4,920,1.0
5,921,1.0
6,922,0.0
7,923,0.142857
8,924,0.0
9,925,0.0


In [None]:
# Write the submission file (with header, without the row index).
submission.to_csv(args.submission_csv, header=True, index=False)

# Save Results

In [None]:
# Path of the JSON file the results summary is written to.
args.save_results

'/content/data/MyDrive/AI_study/1. Machine Learning/3. Supervised Learning/original data/result/model_results.json'

In [None]:
# Columns currently held in the results table.
df_results.columns

Index(['model', 'score_tr', 'score_te', 'auc_te', 'ori_te_pred',
       'len_features', 'feaute_importances', 'create_dt'],
      dtype='object')

In [None]:
# Leave the per-row prediction arrays out of the saved summary.
# Re-assignment with errors='ignore' keeps the cell safe to re-run; the
# in-place drop raised KeyError on a second execution.
df_results = df_results.drop(columns=['ori_te_pred'], errors='ignore')

In [None]:
# Save the results table as a JSON list with one object per row.
df_results.to_json(args.save_results, orient="records")

In [None]:
# Lets us check which model performed well
df_results.head()

Unnamed: 0,model,score_tr,score_te,auc_te,len_features,feaute_importances,create_dt
0,modelV0,0.984399,0.814545,0.796081,10,"[gender_female, fare, age, pclass, sibsp, parc...",217
1,modelV1,0.984399,0.803636,0.78312,10,"[gender_male, age, fare, pclass, parch, sibsp,...",217
2,modelV2,0.984399,0.803636,0.78312,10,"[gender_male, age, fare, pclass, parch, sibsp,...",217


In [None]:
import json

# Read the saved JSON back in to inspect what was written.
load_results = None
with open(args.save_results, 'r') as file:
    load_results = json.load(file)

load_results

[{'model': 'modelV0',
  'score_tr': 0.984399376,
  'score_te': 0.8145454545,
  'auc_te': 0.7960807467,
  'len_features': 10,
  'feaute_importances': ['gender_female',
   'fare',
   'age',
   'pclass',
   'sibsp',
   'parch',
   'embarked_C',
   'embarked_S',
   'embarked_Q',
   'gender_male'],
  'create_dt': '0217'},
 {'model': 'modelV1',
  'score_tr': 0.984399376,
  'score_te': 0.8036363636,
  'auc_te': 0.7831196581,
  'len_features': 10,
  'feaute_importances': ['gender_male',
   'age',
   'fare',
   'pclass',
   'parch',
   'sibsp',
   'embarked_C',
   'embarked_S',
   'embarked_Q',
   'gender_female'],
  'create_dt': '0217'},
 {'model': 'modelV2',
  'score_tr': 0.984399376,
  'score_te': 0.8036363636,
  'auc_te': 0.7831196581,
  'len_features': 10,
  'feaute_importances': ['gender_male',
   'age',
   'fare',
   'pclass',
   'parch',
   'sibsp',
   'embarked_C',
   'embarked_S',
   'embarked_Q',
   'gender_female'],
  'create_dt': '0217'}]