## 타이타닉 데이터의 생존자 예측

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Global Variables 글로벌 변수 선언

In [2]:
import easydict
args = easydict.EasyDict()     # Dict --- args 딕셔너리형이다. 들고다니기 편해서 실제로 easydict.EasyDict() 많이 씀

In [3]:
# path 정보
args.default_path = 'C:/titanic_datas/'
args.train_csv = args.default_path+'train.csv'
args.test_csv = args.default_path+'test.csv'
args.default_submission_csv = args.default_path+'submission.csv'

args.submission_csv = args.default_path+'result/submission_0220.csv'
args.save_results = args.default_path+"result/model_results.json"    
                    # 아래 args.results = [] 결과부분을 제이슨 형태로 저장함

# 데이터 분석을 위한 변수들
args.random_state = 21   # random_state = seed값 설정
args.results=[]

### Load Titanic 데이터 로드

- Surived : 0 = 사망, 1 = 생존
- Pclass : 1 = 1등석, 2 = 2등석, 3 = 3등석
- gender : male = 남성, female = 여성
- Age : 나이
- SibSp : 타이타닉 호에 동승한 자매/배우자의 수
- Parch : 타이타닉 호에 동승한 부모/자식의 수
- Ticket : 티켓 번호
- Fare : 승객 요금
- Cabin : 방 호수
- Embarked : 탑승지; C = 셰르부르, Q = 퀴즈타운, S = 사우샘프턴

In [4]:
titanic = pd.read_csv(args.train_csv)
titanic

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.8750,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.7500,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5000,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
911,911,0,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C
912,912,0,3,"Cacic, Mr. Jego Grga",male,18.0,0,0,315091,8.6625,,S
913,913,0,2,"Pengelly, Mr. Frederick William",male,19.0,0,0,28665,10.5000,,S
914,914,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q


In [5]:
titanic_te = pd.read_csv(args.test_csv)

In [6]:
ori_train = pd.read_csv(args.train_csv)       
ori_test = pd.read_csv(args.test_csv)

ori_train.shape, ori_test.shape

((916, 12), (393, 11))

In [7]:
titanic.columns, titanic_te.columns

(Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
        'parch', 'ticket', 'fare', 'cabin', 'embarked'],
       dtype='object'),
 Index(['passengerid', 'pclass', 'name', 'gender', 'age', 'sibsp', 'parch',
        'ticket', 'fare', 'cabin', 'embarked'],
       dtype='object'))

In [8]:
titanic.shape, titanic_te.shape

((916, 12), (393, 11))

In [9]:
# columns를 조회해보니, test data 내부에 target data인 survived가 없다.

In [10]:
ori_train = pd.read_csv(args.train_csv)

In [11]:
ori_train.isnull().sum().sort_values(ascending=False)

cabin          718
age            180
embarked         1
passengerid      0
survived         0
pclass           0
name             0
gender           0
sibsp            0
parch            0
ticket           0
fare             0
dtype: int64

In [12]:
ori_train.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [13]:
ori_train['passengerid'].nunique(), ori_train.shape[0]

(916, 916)

In [14]:
ori_train.drop('passengerid', axis=1, inplace=True)
ori_train.head()

Unnamed: 0,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [15]:
ori_test.set_index(['passengerid'], inplace=True) 
print(f'{ori_test.shape}')
ori_test.head()

(393, 10)


Unnamed: 0_level_0,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
916,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
917,2,"Pinsky, Mrs. (Rosa)",female,32.0,0,0,234604,13.0,,S
918,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q
919,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S
920,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S


In [16]:
ori_train.columns

Index(['survived', 'pclass', 'name', 'gender', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [17]:
ori_train.head()

Unnamed: 0,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


### train_test_split

In [18]:
new_survived = pd.Categorical(ori_train["survived"])
new_survived = new_survived.rename_categories(["Died","Survived"])              

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,570,0.622271
Survived,346,0.377729


In [19]:
from sklearn.model_selection import train_test_split # 사이킷런

In [20]:
y = ori_train['survived']
X = ori_train.drop(['survived'], axis=1)

In [21]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=ori_train['survived'], random_state=args.random_state)
X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((641, 10), (275, 10), (641,), (275,))

### Base ModelV0

In [22]:
train = X_tr.copy()  
test = X_te.copy()
ori_te = ori_test.copy()

train.shape, test.shape, ori_te.shape

((641, 10), (275, 10), (393, 10))

### Data Preprocessing

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 812 to 277
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   name      641 non-null    object 
 2   gender    641 non-null    object 
 3   age       512 non-null    float64
 4   sibsp     641 non-null    int64  
 5   parch     641 non-null    int64  
 6   ticket    641 non-null    object 
 7   fare      641 non-null    float64
 8   cabin     135 non-null    object 
 9   embarked  641 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 55.1+ KB


In [24]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']  

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)
ori_te.drop(drop_cols, axis=1, inplace=True)

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (641, 10) / (275, 10)
after: (641, 7) / (275, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 812 to 277
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   gender    641 non-null    object 
 2   age       512 non-null    float64
 3   sibsp     641 non-null    int64  
 4   parch     641 non-null    int64  
 5   fare      641 non-null    float64
 6   embarked  641 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 40.1+ KB


In [25]:
train.isnull().sum()  

pclass        0
gender        0
age         129
sibsp         0
parch         0
fare          0
embarked      0
dtype: int64

In [26]:
test.isnull().sum()

pclass       0
gender       0
age         51
sibsp        0
parch        0
fare         0
embarked     1
dtype: int64

In [27]:
ori_te.isnull().sum()

pclass       0
gender       0
age         83
sibsp        0
parch        0
fare         1
embarked     1
dtype: int64

In [28]:
age_median = train['age'].median()
fare_median = train['fare'].median()
embarked_mode = train['embarked'].mode().values[0]   # 최빈값?

age_median, fare_median, embarked_mode

(28.0, 14.4, 'S')

In [29]:
train['age'].fillna(age_median, inplace=True)
test['age'].fillna(age_median, inplace=True)
ori_te['age'].fillna(age_median, inplace=True)

train['fare'].fillna(fare_median, inplace=True)
test['fare'].fillna(fare_median, inplace=True)
ori_te['fare'].fillna(fare_median, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)
test['embarked'].fillna(embarked_mode, inplace=True)
ori_te['embarked'].fillna(embarked_mode, inplace=True)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

(0, 0, 0)

### Data Encoding

In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
train.columns

Index(['pclass', 'gender', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')

In [32]:
enc_cols = ['gender', 'embarked']
normal_cols = list(set(train.columns) - set(enc_cols)) 
normal_cols

['fare', 'parch', 'sibsp', 'age', 'pclass']

In [33]:
print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()
# train
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(),  
    columns = enc.get_feature_names_out()     
)
enc_tr = pd.concat(   
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)] 
    , axis=1
)
# test
tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(),    
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)
# ori_test
tmp_te = pd.DataFrame(
    enc.transform(ori_te[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_ori_te = pd.concat(
    [ori_te[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (641, 7) / (275, 7)
after: (641, 10) / (275, 10)


Unnamed: 0,fare,parch,sibsp,age,pclass,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,7.775,0,0,22.0,3,1.0,0.0,0.0,0.0,1.0
1,7.8208,0,0,21.0,3,0.0,1.0,0.0,1.0,0.0
2,7.8542,0,0,32.0,3,0.0,1.0,0.0,0.0,1.0
3,18.7875,0,0,11.0,3,0.0,1.0,1.0,0.0,0.0
4,8.05,0,0,30.0,3,0.0,1.0,0.0,0.0,1.0


In [34]:
enc_tr.isnull().sum().sum(), enc_te.isnull().sum().sum(), enc_ori_te.isnull().sum().sum()

(0, 0, 0)

In [35]:
enc_tr.shape, enc_te.shape, enc_ori_te.shape

((641, 10), (275, 10), (393, 10))

In [36]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
modelV0 = DecisionTreeClassifier(random_state=args.random_state)

print(f'{enc_tr.shape} / {y_tr.shape}')
modelV0.fit(enc_tr, y_tr)

(641, 10) / (641,)


DecisionTreeClassifier(random_state=21)

In [38]:
score_tr = modelV0.score(enc_tr, y_tr)   
score_te = modelV0.score(enc_te, y_te) 

score_tr, score_te  

(0.982839313572543, 0.7745454545454545)

In [39]:
from sklearn.metrics import roc_curve, auc 
                    
y_pred = modelV0.predict_proba(enc_te)[:,1] 
fpr, tpr, thresholds = roc_curve(y_te,y_pred)
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.7635796221322538


In [40]:
ori_te_pred = modelV0.predict_proba(enc_ori_te)[:,1]
ori_te_pred.shape

(393,)

In [41]:
modelV0.feature_importances_

array([0.19704895, 0.00739608, 0.0388822 , 0.17352864, 0.0638137 ,
       0.49297873, 0.        , 0.00766071, 0.        , 0.01869099])

In [42]:
df_feature_importances = pd.DataFrame(modelV0.feature_importances_, enc_tr.columns).sort_values(by=[0], ascending=False).reset_index()

print(f'{df_feature_importances.shape}')
df_feature_importances

(10, 2)


Unnamed: 0,index,0
0,gender_female,0.492979
1,fare,0.197049
2,age,0.173529
3,pclass,0.063814
4,sibsp,0.038882
5,embarked_S,0.018691
6,embarked_C,0.007661
7,parch,0.007396
8,gender_male,0.0
9,embarked_Q,0.0


In [43]:
args.results.append(
    {
        'model': 'modelV0',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': '0217'
    }
)

args.results

[{'model': 'modelV0',
  'score_tr': 0.982839313572543,
  'score_te': 0.7745454545454545,
  'auc_te': 0.7635796221322538,
  'ori_te_pred': array([1.        , 1.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.125     , 1.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 1.        ,
         0.        , 1.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         0.5       , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 1.        , 1. 

In [44]:
args.results.append(
    {
        'model': 'modelV0',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': '0217'
    }
)

args.results

[{'model': 'modelV0',
  'score_tr': 0.982839313572543,
  'score_te': 0.7745454545454545,
  'auc_te': 0.7635796221322538,
  'ori_te_pred': array([1.        , 1.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.125     , 1.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 1.        ,
         0.        , 1.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         0.5       , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 1.        , 1. 

In [45]:
train = X_tr.copy() 
test = X_te.copy()
ori_te = ori_test.copy()

train.shape, test.shape, ori_te.shape

((641, 10), (275, 10), (393, 10))

In [46]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)
ori_te.drop(drop_cols, axis=1, inplace=True)

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (641, 10) / (275, 10)
after: (641, 7) / (275, 7)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 812 to 277
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   gender    641 non-null    object 
 2   age       512 non-null    float64
 3   sibsp     641 non-null    int64  
 4   parch     641 non-null    int64  
 5   fare      641 non-null    float64
 6   embarked  641 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 40.1+ KB


In [47]:
age_median = train['age'].median()
fare_median = train['fare'].median()
embarked_mode = train['embarked'].mode().values[0]

age_median, fare_median, embarked_mode

(28.0, 14.4, 'S')

In [48]:
train['age'].fillna(age_median, inplace=True)
test['age'].fillna(age_median, inplace=True)
ori_te['age'].fillna(age_median, inplace=True)

train['fare'].fillna(fare_median, inplace=True)
test['fare'].fillna(fare_median, inplace=True)
ori_te['fare'].fillna(fare_median, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)
test['embarked'].fillna(embarked_mode, inplace=True)
ori_te['embarked'].fillna(embarked_mode, inplace=True)

train.isnull().sum().sum(), test.isnull().sum().sum(), ori_te.isnull().sum().sum()

(0, 0, 0)

In [49]:
from sklearn.preprocessing import OneHotEncoder

In [50]:
enc_cols = ['gender', 'embarked']
normal_cols = list(set(train.columns) - set(enc_cols))
normal_cols

['fare', 'parch', 'sibsp', 'age', 'pclass']

In [51]:
print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()
# train
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_tr = pd.concat(
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)
# test
tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)
# ori_test
tmp_te = pd.DataFrame(
    enc.transform(ori_te[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_ori_te = pd.concat(
    [ori_te[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')

before: (641, 7) / (275, 7)
after: (641, 10) / (275, 10)


In [52]:
enc_tr.isnull().sum().sum(), enc_te.isnull().sum().sum(), enc_ori_te.isnull().sum().sum()

(0, 0, 0)

In [53]:
enc_tr.columns

Index(['fare', 'parch', 'sibsp', 'age', 'pclass', 'gender_female',
       'gender_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [54]:
scaling_cols = ['age', 'fare']
not_scaling_cols = list(set(enc_tr.columns) - set(scaling_cols))
not_scaling_cols 

['embarked_S',
 'gender_female',
 'embarked_Q',
 'gender_male',
 'parch',
 'embarked_C',
 'sibsp',
 'pclass']

In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
std = StandardScaler()

_scaled_tr = std.fit_transform(enc_tr[scaling_cols])
_scaled_te = std.transform(enc_te[scaling_cols])
_scaled_ori_te = std.transform(enc_ori_te[scaling_cols])

In [57]:
print(f'before: {enc_tr.shape} / {enc_te.shape}')
# train
tmp_tr = pd.DataFrame(
    _scaled_tr, 
    columns = scaling_cols
)
scaled_tr = pd.concat(
    [enc_tr[not_scaling_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
).reset_index(drop=True)
# test
tmp_te = pd.DataFrame(
    _scaled_te, 
    columns = scaling_cols
)
scaled_te = pd.concat(
    [enc_te[not_scaling_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
).reset_index(drop=True)
# ori_test
tmp_te = pd.DataFrame(
    _scaled_ori_te, 
    columns = scaling_cols
)
scaled_ori_te = pd.concat(
    [enc_ori_te[not_scaling_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
).reset_index(drop=True)

print(f'after: {scaled_tr.shape} / {scaled_te.shape}')
scaled_tr.head()

before: (641, 10) / (275, 10)
after: (641, 10) / (275, 10)


Unnamed: 0,embarked_S,gender_female,embarked_Q,gender_male,parch,embarked_C,sibsp,pclass,age,fare
0,1.0,1.0,0.0,0.0,0,0.0,0,3,-0.606781,-0.479616
1,0.0,0.0,1.0,1.0,0,0.0,0,3,-0.683858,-0.478696
2,1.0,0.0,0.0,1.0,0,0.0,0,3,0.163995,-0.478025
3,0.0,0.0,0.0,1.0,0,1.0,0,3,-1.454634,-0.258431
4,1.0,0.0,0.0,1.0,0,0.0,0,3,0.00984,-0.474092


In [58]:
scaled_tr.isnull().sum().sum(), scaled_te.isnull().sum().sum(), scaled_ori_te.isnull().sum().sum()

(0, 0, 0)

In [59]:
scaled_tr.shape, scaled_te.shape, scaled_ori_te.shape

((641, 10), (275, 10), (393, 10))

In [60]:
from sklearn.tree import DecisionTreeClassifier

In [61]:
modelV1 = DecisionTreeClassifier(random_state=args.random_state)
modelV1.fit(scaled_tr, y_tr)

DecisionTreeClassifier(random_state=21)

In [62]:
score_tr = modelV1.score(scaled_tr, y_tr)
score_te = modelV1.score(scaled_te, y_te) 

score_tr, score_te  

(0.982839313572543, 0.7854545454545454)

In [63]:
from sklearn.metrics import roc_curve, auc 

y_pred = modelV0.predict_proba(scaled_te)[:,1]
fpr, tpr, thresholds = roc_curve(y_te,y_pred)
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.5255847953216375


In [64]:
ori_te_pred = modelV0.predict_proba(scaled_ori_te)[:,1]
ori_te_pred.shape

(393,)

In [65]:
df_feature_importances = pd.DataFrame(modelV1.feature_importances_, scaled_tr.columns).sort_values(by=[0], ascending=False).reset_index()
print(f'{df_feature_importances.shape}')

(10, 2)


In [None]:
args.results.append(
    {
        'model': 'modelV1',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        'ori_te_pred': ori_te_pred,
        'len_features': X_tr.shape[1],
        'feaute_importances': list(df_feature_importances['index'].values[:X_tr.shape[1]]),
        'create_dt': '0217'
    }
)

len(args.results)

3

In [67]:
df_results = pd.DataFrame(args.results).sort_values(by=['auc_te'], ascending=False)
df_results

Unnamed: 0,model,score_tr,score_te,auc_te,ori_te_pred,len_features,feaute_importances,create_dt
0,modelV0,0.982839,0.774545,0.76358,"[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.125, 1.0...",10,"[gender_female, fare, age, pclass, sibsp, emba...",217
1,modelV0,0.982839,0.774545,0.76358,"[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.125, 1.0...",10,"[gender_female, fare, age, pclass, sibsp, emba...",217
2,modelV1,0.982839,0.785455,0.525585,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10,"[gender_male, fare, age, pclass, sibsp, embark...",217


In [69]:
submission = pd.read_csv(args.default_submission_csv)
submission.head()

Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [70]:
submission['survived'] = df_results.loc[0, ['ori_te_pred']].values[0] 
  
print(f'{submission.isnull().sum().sum()}')
submission.head(10)

0


Unnamed: 0,passengerid,survived
0,916,1.0
1,917,1.0
2,918,1.0
3,919,0.0
4,920,1.0
5,921,1.0
6,922,0.0
7,923,0.125
8,924,1.0
9,925,0.0


In [71]:
submission.to_csv(args.submission_csv, header=True, index=False)  # to_csv 파일로 변환. 저장

In [72]:
args.save_results

'C:/titanic_datas/result/model_results.json'