In [None]:
import pandas as pd

# Load the data set; the first CSV column is used as the row index.
bc_df = pd.read_csv(filepath_or_buffer='breast_cancer.csv', index_col=0)
# Preview the first rows together with the frame's shape.
(bc_df.head(), bc_df.shape)

In [None]:
# List the distinct column labels of the loaded frame.
bc_df.columns.unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Convert the 'target' column to integer codes (the author's note: object -> int).
# The encoder is fitted first and then applied, so `le_disaster` keeps the
# learned label mapping for later inspection.
le_disaster = LabelEncoder().fit(bc_df['target'])
bc_df['target'] = le_disaster.transform(bc_df['target'])
# Print the column dtypes and non-null counts after the conversion.
bc_df.info()

In [None]:
# Descriptive statistics for the columns of the frame.
bc_df.describe()

In [688]:
# Missing-value handling (currently disabled: both steps below are commented out)
# bc_df.isnull().sum()
# bc_df = bc_df.dropna()

In [689]:
# Check for and remove duplicate rows (currently disabled: both steps below are commented out)
# bc_df.duplicated().sum()
# bc_df.drop_duplicates(inplace=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Draw 50-bin histograms of the frame's columns on a 20x15 figure.
bc_df.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Outlier check: box plots of every column except the 'target' label.
bc_df.drop(columns=['target']).boxplot(figsize=(30, 10))

In [692]:
# Outlier removal: keep only the rows whose 'symmetry error' is at most 4000.
# NOTE(review): a cutoff of 4000 looks unusually large for an error-type
# column; confirm the intended column and threshold against the box plot.
bc_df = bc_df.loc[lambda frame: frame['symmetry error'] <= 4000]

In [693]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the 'target' label column.
X = bc_df.drop('target', axis=1)
y = bc_df['target']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [694]:
# Feature scaling
from sklearn.preprocessing import MinMaxScaler

# Alternative scaler, left disabled:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics enter the fit.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [695]:
# Model training (single-model experiments; the whole cell is currently commented out)
# from sklearn.linear_model import LogisticRegression
# model = LogisticRegression(max_iter=200)

# from sklearn.tree import DecisionTreeClassifier
# model = DecisionTreeClassifier(criterion='entropy',
#                                max_depth=4,
#                                random_state=42)


# model.fit(X_train_scaled, y_train)

# pred = model.predict(X_test_scaled)

# # Check the depth of the fitted tree
# depth = model.get_depth()
# print(f"Tree depth: {depth}")

In [696]:
# Ensemble: a voting classifier built from three tree-based learners
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    RandomForestClassifier,
    VotingClassifier,
)

# Base learners, each with a fixed seed.
model1 = DecisionTreeClassifier(random_state=42, max_depth=4)
model2 = RandomForestClassifier(random_state=42, max_depth=4, n_estimators=100)
model3 = AdaBoostClassifier(random_state=42, algorithm='SAMME', n_estimators=50)

# voting='soft' bases the vote on predicted class probabilities;
# 'hard' is the alternative setting noted by the author.
ensemble_model = VotingClassifier(
    voting='soft',
    estimators=[('dt', model1), ('rf', model2), ('ab', model3)],
)
ensemble_model.fit(X_train_scaled, y_train)

# Class predictions of the ensemble on the scaled test split.
pred = ensemble_model.predict(X_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score

# Fit each base classifier on its own and predict the scaled test split, so
# the individual models can be compared with one another.
pred1, pred2, pred3 = [
    clf.fit(X_train_scaled, y_train).predict(X_test_scaled)
    for clf in (model1, model2, model3)
]

# Accuracy of each individual model
acc1, acc2, acc3 = [
    accuracy_score(y_test, predicted)
    for predicted in (pred1, pred2, pred3)
]

print(f"Decision Tree Accuracy: {acc1:.2f}")
print(f"Random Forest Accuracy: {acc2:.2f}")
print(f"AdaBoost Accuracy: {acc3:.2f}")

In [None]:
# Performance evaluation of the voting ensemble on the test split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
# accuracy = accuracy_score(y_test, pred)
# precision = precision_score(y_test, pred)
# recall = recall_score(y_test, pred)
# f1 = f1_score(y_test, pred)

# ROC AUC measures how well the model ranks samples, so it must be computed
# from continuous scores. Feeding it hard class labels evaluates a single
# threshold only and misreports the AUC. Column 1 of predict_proba is the
# probability of the larger class label, which roc_auc_score treats as the
# positive class (the [:, 1] slice assumes a two-class target).
if hasattr(ensemble_model, "predict_proba"):
    y_score = ensemble_model.predict_proba(X_test_scaled)[:, 1]
else:
    # predict_proba is not exposed when the ensemble uses voting='hard';
    # fall back to the predicted labels so the cell still runs.
    y_score = pred
roc_auc = roc_auc_score(y_test, y_score)

# print(f"정확도: {accuracy:.2f}")
# print(f"정밀도: {precision:.2f}")
# print(f"재현율: {recall:.2f}")
# print(f"F1 점수: {f1:.2f}")
print(f"ROC AUC: {roc_auc:.2f}\n")

# Confusion matrix and per-class report are based on the hard predictions.
cm = confusion_matrix(y_test, pred)
print("혼동 행렬:\n", cm, '\n')

report = classification_report(y_test, pred)
print("classification report:\n", report)