In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
titanic = sns.load_dataset('titanic')
titanic


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [54]:
titanic['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [55]:
#결측치
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [56]:
#타겟 추출
dfy = titanic['survived'].copy()
dfy

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

In [57]:
#피처추출
feature_list = ['pclass','sex','age']

dfx = titanic[feature_list].copy()
dfx

Unnamed: 0,pclass,sex,age
0,3,male,22.0
1,1,female,38.0
2,3,female,26.0
3,1,female,35.0
4,3,male,35.0
...,...,...,...
886,2,male,27.0
887,1,female,19.0
888,3,female,
889,1,male,26.0


In [58]:
dfx['sex'].value_counts()


sex
male      577
female    314
Name: count, dtype: int64

In [59]:
from sklearn.preprocessing import LabelEncoder
dfx['sex'] = LabelEncoder().fit_transform(dfx['sex'])
dfx['sex'].value_counts()

sex
1    577
0    314
Name: count, dtype: int64

In [60]:
dfx['age'].isnull().sum()


177

In [61]:
dfx['age'].fillna(dfx['age'].mean(), inplace=True)
dfx['age'].isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfx['age'].fillna(dfx['age'].mean(), inplace=True)


0

In [62]:
dfx.tail()

Unnamed: 0,pclass,sex,age
886,2,1,27.0
887,1,0,19.0
888,3,0,29.699118
889,1,1,26.0
890,3,1,32.0


In [63]:
dfx['pclass'].value_counts()

pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [64]:
#pclass 의 원핫 인코딩
from sklearn.preprocessing import LabelBinarizer

dfx2 = pd. DataFrame(LabelBinarizer().fit_transform(dfx['pclass']), columns=['c1','c2','c3'],index=dfx.index)
dfx2

Unnamed: 0,c1,c2,c3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1
...,...,...,...
886,0,1,0
887,1,0,0
888,0,0,1
889,1,0,0


In [65]:
dfx = pd.concat( [dfx, dfx2],axis=1)
dfx

Unnamed: 0,pclass,sex,age,c1,c2,c3
0,3,1,22.000000,0,0,1
1,1,0,38.000000,1,0,0
2,3,0,26.000000,0,0,1
3,1,0,35.000000,1,0,0
4,3,1,35.000000,0,0,1
...,...,...,...,...,...,...
886,2,1,27.000000,0,1,0
887,1,0,19.000000,1,0,0
888,3,0,29.699118,0,0,1
889,1,1,26.000000,1,0,0


In [66]:
dfx.drop(['pclass'], axis=1, inplace=True) 
dfx

Unnamed: 0,sex,age,c1,c2,c3
0,1,22.000000,0,0,1
1,0,38.000000,1,0,0
2,0,26.000000,0,0,1
3,0,35.000000,1,0,0
4,1,35.000000,0,0,1
...,...,...,...,...,...
886,1,27.000000,0,1,0
887,0,19.000000,1,0,0
888,0,29.699118,0,0,1
889,1,26.000000,1,0,0


# 데이터 분할


In [72]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dfx, dfy,test_size=0.25, random_state=0)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((668, 5), (223, 5), (668,), (223,))

의사결정 나무


In [75]:
from sklearn.tree import DecisionTreeClassifier
dtmodel = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=5)
#모델학습
dtmodel.fit(x_train, y_train)
dtmodel

In [77]:
y_pred = dtmodel.predict(x_test)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1])

In [78]:
pd.crosstab(y_test, y_pred, margins=True)

col_0,0,1,All
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,119,20,139
1,25,59,84
All,144,79,223


In [None]:
from IPython.display import Image # for visualizing the tree 

import pydotplus


In [80]:
# 의사 결정 나무를 시각화하기 위해서 라이브러리를 임포트합니다. 

from sklearn.tree import export_graphviz 



In [81]:
# 시각화를 위한 함수를 작성한다. 

def tree_graph_to_png(tree, feature_names, class_names, png_file_to_save): 

    tree_str = export_graphviz(tree, feature_names=feature_names, class_names=class_names, filled=True, out_file=None) 

    graph = pydotplus.graph_from_dot_data(tree_str) 

    print(graph) 

    graph.write_png(png_file_to_save) 

    return Image(graph.create_png()) 