In [6]:
import pandas as pd
import numpy as np

In [7]:
tt_train_df = pd.read_csv('../train.csv')
tt_train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [8]:
tt_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
tt_train_df['Age'].fillna(tt_train_df['Age'].mean(),inplace=True)
tt_train_df['Cabin'].fillna('N',inplace=True)
tt_train_df['Embarked'].fillna('S',inplace=True)
print('데이터 세트 Null 값 갯수', tt_train_df.isnull().sum())

데이터 세트 Null 값 갯수 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [10]:
tt_test_df = pd.read_csv('../test.csv')
tt_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [11]:
tt_test_df['Age'].fillna(tt_test_df['Age'].mean(), inplace=True)
tt_train_df['Fare'].fillna('N',inplace=True)
tt_train_df['Cabin'].fillna('N',inplace=True)
print('데이터 세트 Null 값 갯수', tt_train_df.isnull().sum())

데이터 세트 Null 값 갯수 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [12]:
print(f"성별의 값 분포는 {tt_train_df['Sex'].value_counts()}") # female만 모두 생존

성별의 값 분포는 male      577
female    314
Name: Sex, dtype: int64


In [13]:
print(f"Embarked의 값 분포는 {tt_train_df['Embarked'].value_counts()}")

Embarked의 값 분포는 S    646
C    168
Q     77
Name: Embarked, dtype: int64


## 데이터 전처리 모듈화_ 함수생성

In [14]:
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 속성 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Fare'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 Data Preprocessing 함수 호출
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [15]:
import pandas as pd
tt_train_df = pd.read_csv('../train.csv')
tt_test_df = pd.read_csv('../test.csv')

In [16]:
df_list = [tt_train_df, tt_test_df]
tt_train_df_1 = transform_features(tt_train_df)
tt_test_df_1 = transform_features(tt_test_df)

In [17]:
tt_train_df_1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Cabin,Embarked
0,0,3,1,22.0,7,3
1,1,1,0,38.0,2,0
2,1,3,0,26.0,7,3
3,1,1,0,35.0,2,3
4,0,3,1,35.0,7,3


In [18]:
tt_test_df_1.head()

Unnamed: 0,Pclass,Sex,Age,Cabin,Embarked
0,3,1,34.5,7,1
1,3,0,47.0,7,2
2,2,1,62.0,7,1
3,3,1,27.0,7,2
4,3,0,22.0,7,2


### Features 와 label 분리

In [19]:
label = tt_train_df_1['Survived']
features = tt_train_df_1.drop(columns='Survived')

In [20]:
features

Unnamed: 0,Pclass,Sex,Age,Cabin,Embarked
0,3,1,22.000000,7,3
1,1,0,38.000000,2,0
2,3,0,26.000000,7,3
3,1,0,35.000000,2,3
4,3,1,35.000000,7,3
...,...,...,...,...,...
886,2,1,27.000000,7,3
887,1,0,19.000000,1,3
888,3,0,29.699118,7,3
889,1,1,26.000000,2,0


In [21]:
label

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

### datascaling

In [22]:
# StandardScaler를 이용한 스케일링
from sklearn.preprocessing import StandardScaler

# StandardScaler객체 생성
scaler = StandardScaler()
# StandardScaler로 데이터 셋 변환 , fit()과 transform()호출
titanic_train_scaled = scaler.fit_transform(features)

In [23]:
titanic_train_scaled

array([[ 0.82737724,  0.73769513, -0.5924806 ,  0.51129323,  0.56273365],
       [-1.56610693, -1.35557354,  0.63878901, -1.91449093, -2.00852627],
       [ 0.82737724, -1.35557354, -0.2846632 ,  0.51129323,  0.56273365],
       ...,
       [ 0.82737724, -1.35557354,  0.        ,  0.51129323,  0.56273365],
       [-1.56610693,  0.73769513, -0.2846632 , -1.91449093, -2.00852627],
       [ 0.82737724,  0.73769513,  0.17706291,  0.51129323, -0.29435299]])

In [24]:
df_train_scaled = pd.DataFrame(titanic_train_scaled , columns=features.columns)

### DecisionTree를 활용한 예측

In [25]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(titanic_train_scaled , label)

In [26]:
titanic_test_scaled = scaler.fit_transform(tt_test_df_1)
results = clf.predict(titanic_test_scaled)

In [27]:
results

array([0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [28]:
len(results)

418

In [29]:
label

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [30]:
y_test = pd.read_csv('../gender_submission.csv')

In [31]:
y_test = y_test['Survived']

In [32]:
y_test

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [33]:
#정확도 측정
from sklearn.metrics import accuracy_score
accuracy_score(y_test, results)

0.8110047846889952

In [34]:
# = 이 피쳐써도 성능 양호. 

In [35]:
418*0.8110047846889952 #339개 일치

339.0

In [36]:
# null값처리, 숫자변환, 스케일링 + DecisionTree알고리즘 사용 => 성능 81%

In [37]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

dt_clf = DecisionTreeClassifier(random_state=15)
rf_clf = RandomForestClassifier(random_state=15)
lr_clf = LogisticRegression()

dt_clf.fit(df_train_scaled , label)
dt_pred = dt_clf.predict(titanic_test_scaled)
print(accuracy_score(y_test, dt_pred))

rf_clf.fit(df_train_scaled , label)
rf_pred = rf_clf.predict(titanic_test_scaled)
print(accuracy_score(y_test,rf_pred))

lr_clf.fit(df_train_scaled , label)
lr_pred = lr_clf.predict(titanic_test_scaled)
print(accuracy_score(y_test,lr_pred))



0.8253588516746412
0.8086124401913876
0.9473684210526315




In [38]:
from sklearn.model_selection import GridSearchCV

param = {'max_depth':[2,3,5,10],
        'min_samples_split':[2,3,5],
        'min_samples_leaf':[1,5,8]}

grid_dclf = GridSearchCV(dt_clf,param_grid=param,scoring='accuracy',cv=5)
grid_dclf.fit(df_train_scaled,label)

print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
print(f'GrdiSearchCV 최고 정확도: {grid_dclf.best_score_}')
best_dclf = grid_dclf.best_estimator_

### GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행
dpred = best_dclf.predict(titanic_test_scaled)
accuracy = accuracy_score(y_test,dpred)
print(f'테스트 세트에서의 DecisionTreeClassifier 정확도 {accuracy}')

GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 3}
GrdiSearchCV 최고 정확도: 0.8170610758897748
테스트 세트에서의 DecisionTreeClassifier 정확도 0.8827751196172249




In [39]:
import numpy as np
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt_clf, titanic_train_scaled, label, cv=5)

for iter_count, accuracy in enumerate(scores):
    print(iter_count, accuracy)

print(np.mean(scores))

0 0.7821229050279329
1 0.7865168539325843
2 0.8089887640449438
3 0.8089887640449438
4 0.8370786516853933
0.8047391877471597


In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score,confusion_matrix

def get_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test,pred)
    accuracy = accuracy_score(y_test,pred)
    precision = precision_score(y_test,pred)
    recall = recall_score(y_test,pred)
    print('confusion matrix')
    print(confusion)
    print(f'정확도: {np.round(accuracy,4)},정밀도{np.round(precision,4)}, 재현율{np.round(recall,4)}')
    
get_clf_eval(y_test,dpred)

confusion matrix
[[261   5]
 [ 44 108]]
정확도: 0.8828,정밀도0.9558, 재현율0.7105
