In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
import matplotlib.pyplot as plt

# 데이터 로드
df = sns.load_dataset('titanic')

# 1. 필요한 특성 선택
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'deck']
X = df[features]

In [23]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [32]:
# target 변수 인코딩
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(df['alive'])  # 'yes'/'no'를 1/0으로 변환

print("타겟 변수 인코딩 결과:")
print(f"원본 클래스: {y_encoder.classes_}")
print(f"변환된 값: {y_encoder.transform(y_encoder.classes_)}")

타겟 변수 인코딩 결과:
원본 클래스: ['no' 'yes']
변환된 값: [0 1]


array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [24]:
# 2. 결측치 처리
# 수치형 데이터 결측치
numeric_features = ['age', 'fare']
numeric_imputer = SimpleImputer(strategy='mean')
X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])


In [25]:

# 범주형 데이터 결측치
categorical_features = ['embarked', 'deck', 'who', 'class', 'sex']
categorical_imputer = SimpleImputer(strategy='most_frequent')
X.loc[:, categorical_features] = categorical_imputer.fit_transform(X[categorical_features])

# 범주형 데이터 인코딩
label_encoders = {}
for feature in categorical_features:
    # 문자열로 변환 후 인코딩
    X[feature] = X[feature].astype(str)
    label_encoders[feature] = LabelEncoder()
    X.loc[:, feature] = label_encoders[feature].fit_transform(X[feature])

# 4. 특성 스케일링
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 5. 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. 모델 학습
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 교차 검증 수행
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
print(f"교차 검증 점수: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

교차 검증 점수: 1.000 (+/- 0.000)


In [4]:
y_pred = rf_model.predict(X_test)
print("\n최종 테스트 정확도:", accuracy_score(y_test, y_pred))
print("\n분류 보고서:")
print(classification_report(y_test, y_pred))


최종 테스트 정확도: 0.8100558659217877

분류 보고서:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       105
           1       0.79      0.73      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [5]:
# 하이퍼파라미터 튜닝
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)
print(f"\n최적 파라미터: {grid_search.best_params_}")
print(f"최고 정확도: {grid_search.best_score_:.3f}")



최적 파라미터: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
최고 정확도: 0.830


In [6]:

# 최적화된 모델로 예측
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("\n최종 테스트 정확도:", accuracy_score(y_test, y_pred))
print("\n분류 보고서:")
print(classification_report(y_test, y_pred))

# 특성 선택
selector = SelectFromModel(best_model, prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# 선택된 특성으로 최종 모델 학습
final_model = RandomForestClassifier(**grid_search.best_params_, random_state=42)
final_model.fit(X_train_selected, y_train)




최종 테스트 정확도: 0.8324022346368715

분류 보고서:
              precision    recall  f1-score   support

           0       0.83      0.90      0.86       105
           1       0.84      0.73      0.78        74

    accuracy                           0.83       179
   macro avg       0.83      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179





In [7]:
df[features].head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,deck
0,3,male,22.0,1,0,7.25,S,Third,man,
1,1,female,38.0,1,0,71.2833,C,First,woman,C
2,3,female,26.0,0,0,7.925,S,Third,woman,
3,1,female,35.0,1,0,53.1,S,First,woman,C
4,3,male,35.0,0,0,8.05,S,Third,man,


In [9]:
X2 = df[features].copy()

# 범주형 데이터 결측치 처리
categorical_features = ['embarked', 'deck', 'who', 'class', 'sex']
categorical_imputer = SimpleImputer(strategy='most_frequent')
X2.loc[:, categorical_features] = categorical_imputer.fit_transform(X2[categorical_features])

# 범주형 데이터 인코딩
label_encoders = {}
for feature in categorical_features:
    # 문자열로 변환 후 인코딩
    X2[feature] = X2[feature].astype(str)
    label_encoders[feature] = LabelEncoder()
    encoded_values = label_encoders[feature].fit_transform(X2[feature])
    X2.loc[:, feature] = encoded_values

# 특성 스케일링
scaler = StandardScaler()
X2 = pd.DataFrame(
    scaler.fit_transform(X2),
    columns=X2.columns,
    index=X2.index
)


In [19]:
X2

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,deck
0,0.827377,0.737695,-0.530377,0.432793,-0.473674,-0.502445,0.585954,0.827377,-0.355242,-0.119419
1,-1.566107,-1.355574,0.571831,0.432793,-0.473674,0.786845,-1.942303,-1.566107,1.328379,-0.119419
2,0.827377,-1.355574,-0.254825,-0.474545,-0.473674,-0.488854,0.585954,0.827377,1.328379,-0.119419
3,-1.566107,-1.355574,0.365167,0.432793,-0.473674,0.420730,0.585954,-1.566107,1.328379,-0.119419
4,0.827377,0.737695,0.365167,-0.474545,-0.473674,-0.486337,0.585954,0.827377,-0.355242,-0.119419
...,...,...,...,...,...,...,...,...,...,...
886,-0.369365,0.737695,-0.185937,-0.474545,-0.473674,-0.386671,0.585954,-0.369365,-0.355242,-0.119419
887,-1.566107,-1.355574,-0.737041,-0.474545,-0.473674,-0.044381,0.585954,-1.566107,1.328379,-1.538119
888,0.827377,-1.355574,,0.432793,2.008933,-0.176263,0.585954,0.827377,1.328379,-0.119419
889,-1.566107,0.737695,-0.254825,-0.474545,-0.473674,-0.044381,-1.942303,-1.566107,-0.355242,-0.119419


In [11]:
y_pred2 = rf_model.predict(X2)

In [13]:
df['predicted_survived'] = y_pred2

In [21]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,predicted_survived
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,1
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,1
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,0
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,1
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,0
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,0


In [16]:
df[df['survived'] != df['predicted_survived']]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,predicted_survived
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False,0
17,1,2,male,,0,0,13.0000,S,Second,man,True,,Southampton,yes,True,0
21,1,2,male,34.0,0,0,13.0000,S,Second,man,True,D,Southampton,yes,True,0
23,1,1,male,28.0,0,0,35.5000,S,First,man,True,A,Southampton,yes,True,0
25,1,3,female,38.0,1,5,31.3875,S,Third,woman,False,,Southampton,yes,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,1,1,male,,0,0,29.7000,C,First,man,True,C,Cherbourg,yes,True,0
854,0,2,female,44.0,1,0,26.0000,S,Second,woman,False,,Southampton,no,False,1
869,1,3,male,4.0,1,1,11.1333,S,Third,child,False,,Southampton,yes,False,0
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True,0


In [33]:
y_encoder.inverse_transform(y_pred2)

array(['no', 'yes', 'yes', 'yes', 'no', 'no', 'no', 'no', 'yes', 'yes',
       'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'yes',
       'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no',
       'yes', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no',
       'yes', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'yes',
       'no', 'no', 'no', 'no', 'yes', 'no', 'yes', 'no', 'no', 'no', 'no',
       'no', 'yes', 'no', 'no', 'no', 'no', 'yes', 'no', 'yes', 'yes',
       'no', 'yes', 'yes', 'no', 'no', 'yes', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no',
       'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'yes', 'no', 'no',
       'yes', 'no', 'no', 'no', 'no', 'yes', 'yes', 'no', 'no