# 모듈 불러오기

In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from modules.stableCRI import stable_concise_rule_induction # file 제공
from modules.stableDT import stableDecisionTree # file 제공
from modules.stableDT import visualize_tree # file 제공


# 데이터 불러오기 및 전처리

In [None]:

# Get the Data 
raw_df = pd.read_csv('https://raw.githubusercontent.com/countifs/rawdata/main/UCI%20-%20Breast%20Cancer%20Wisconsin%20(Original)/breastCancer.csv')

def preprocess(raw_df):
    df = raw_df.copy()
    df = df[df['bare_nucleoli'] != '?'] # 16개의 '?' 행 제거
    df['bare_nucleoli'] = df['bare_nucleoli'].astype(int) # 문자열을 정수형으로 변환
    df.drop_duplicates(inplace=True) # 중복된 행 제거
    df.replace({'class': {2: 0, 4: 1}}, inplace=True)  # 2: Benign, 4: Malignant
    df.reset_index(drop=True, inplace=True) # index 재설정
    df.drop(columns=['id'], axis=1, inplace=True) # id 열 제거
    return df

df = preprocess(raw_df).copy()

# 트레인 테스트 데이터 분리
X = df.drop(columns='class')
y = df['class']

# CRI 모델링

In [5]:
def cri_fit(X, y, random_state = 25, sample_ratio = 0.05, max_depth = 1000, iter_num = 20, rule_rate = 1.0):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state, stratify=y)

    global train
    train = pd.concat([X_train, y_train], axis=1)
    target_name = 'class'

    # 최소 리프노드 샘플 수
    min_samples = round(sample_ratio * len(X_train))
    # max_depth = 1000 # 처음에는 2 또는 3으로 설정해서 돌려보고, 소요시간을 체크해보시기 바랍니다.

    # 모델 생성
    cri = stable_concise_rule_induction(min_samples = min_samples, max_depth = max_depth, algorithm='adaptive', simplify = True)
    cri.fit(data = train , target_name = target_name, iter_num = iter_num, rule_rate = rule_rate)

    # 성능 확인
    global test
    test = pd.concat([X_test, y_test], axis=1)
    pred_y, acc, f1 = cri.predict(test)

    print('▶ accuarcy : ', round(accuracy_score(y_test, pred_y), 3) )
    print('▶ f1-score :', round(f1_score(y_test, pred_y), 3) )
    print('▶ roc_auc :', round(roc_auc_score(y_test, pred_y), 3) )
    print('▶ confusion_matrix',confusion_matrix(y_test, pred_y) , sep = '\n' )
    print('▶ classification', classification_report(y_test, pred_y) , sep = '\n'  )

    # 결과 정리
    rule_class = []
    cover_count = []
    coverage = []
    class_0 = []
    class_1 = []
    homogeneity = []
    class_0_cover = []
    class_1_cover = []
    
    df_list = cri.df_concise_rule.T
    rule_list = df_list['Rule']

    # rule_list 에서 ,를 ' and '로 변경
    rule_list = rule_list.str.replace(',', ' and ')
    
    for i in range(len(rule_list)):
        cover_count.append(len(train.query(rule_list[i]) ))
        coverage.append(round(len(train.query(rule_list[i])) / len(train), 3 ) )
        class_0.append( sum (train.query(rule_list[i])['class'] == 0) )
        class_1.append( sum (train.query(rule_list[i])['class'] == 1) )
        try:
            homogeneity.append(round( max(class_0[i], class_1[i]) / cover_count[i], 3 ) )
        except:
            homogeneity.append('divide by zero')
        rule_class.append(0 if class_0[i] > class_1[i] else 1)
        
        class_0_cover.append( round(class_0[i] / sum(train['class'] == 0) , 3) )
        class_1_cover.append( round(class_1[i] / sum(train['class'] == 1) , 3) )
        
   
    rule_df = pd.DataFrame({'rule':rule_list, 'class':rule_class, 'count':cover_count, 'coverage':coverage, 'homo': homogeneity,
                        '0':class_0, '1':class_1, '0_cover':class_0_cover, '1_cover':class_1_cover})    

    # condition컬럼에서 and 개수 찾기
    rule_df['conditions'] = rule_df['rule'].str.count(' and ') + 1 

    #condition에서 부호가 모두 같으면 1, 다르면 0
    rule_df['justifiability'] = rule_df['rule'].str.contains('>=') & rule_df['rule'].str.contains('<')
    rule_df['justifiability'] = rule_df['justifiability'].astype(int)

    pd.set_option('display.max_colwidth', None)
    return rule_df

# 결과확인

In [6]:
random_seed = 9
rule_df = cri_fit(X, y, random_state = random_seed, sample_ratio = 0.05, max_depth = 1000, iter_num = 20, rule_rate = 1.0)
rule_df.sort_values(by='class', ascending=False).reset_index(drop=True)

▶ accuarcy :  0.97
▶ f1-score : 0.957
▶ roc_auc : 0.967
▶ confusion_matrix
[[86  2]
 [ 2 45]]
▶ classification
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        88
           1       0.96      0.96      0.96        47

    accuracy                           0.97       135
   macro avg       0.97      0.97      0.97       135
weighted avg       0.97      0.97      0.97       135



Unnamed: 0,rule,class,count,coverage,homo,0,1,0_cover,1_cover,conditions,justifiability
0,size_uniformity >= 4.5,1,140,0.259,0.986,2,138,0.006,0.73,1,0
1,clump_thickness >= 6.5,1,117,0.217,0.966,4,113,0.011,0.598,1,0
2,shape_uniformity < 1.5,0,279,0.517,0.993,277,2,0.789,0.011,1,0
3,bland_chromatin < 2.5,0,247,0.457,0.972,240,7,0.684,0.037,1,0
