In [2]:
import pandas as pd
import numpy as np
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from Orange import classification
from Orange.data.pandas_compat import table_from_frame
from Orange.data import Domain, Table
from Orange import evaluation
import pickle
import pandas as pd
import numpy as np


# Fetch the raw UCI Breast Cancer Wisconsin (Original) CSV from its GitHub mirror.
data_url = 'https://raw.githubusercontent.com/countifs/rawdata/main/UCI%20-%20Breast%20Cancer%20Wisconsin%20(Original)/breastCancer.csv'
raw_df = pd.read_csv(data_url)

def preprocess(raw_df):
    """Clean the raw breast-cancer frame and return a new, model-ready frame.

    Steps, in order:
      1. drop rows whose ``bare_nucleoli`` value is the placeholder ``'?'``;
      2. cast ``bare_nucleoli`` from string to ``int``;
      3. drop exact duplicate rows (compared while ``id`` is still present);
      4. recode ``class`` from ``2``/``4`` to ``0``/``1``;
      5. renumber the index from 0 and drop the ``id`` column.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns ``id``, ``bare_nucleoli`` and ``class``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw_df`` itself is left untouched.
    """
    has_value = raw_df['bare_nucleoli'] != '?'
    # One non-mutating chain: every step returns a fresh frame, so nothing is
    # ever assigned into a filtered slice (the SettingWithCopyWarning pattern).
    return (
        raw_df.loc[has_value]
        .astype({'bare_nucleoli': int})
        .drop_duplicates()
        .replace({'class': {2: 0, 4: 1}})  # 2: Benign, 4: Malignant
        .reset_index(drop=True)
        .drop(columns=['id'])
    )

# preprocess() already returns a fresh frame, so no extra .copy() is needed.
df = preprocess(raw_df)

test_ratio = 0.2      # share of rows held out by train_test_split
sample_ratio = 0.05   # scales min_covered_examples for the CN2 rule finder below
target_att = 'class'

# Separate the feature columns from the target, then make a seeded,
# class-stratified train/test split.
colnm = df.columns
X = df.loc[:, colnm[colnm != target_att]]
y = df.loc[:, target_att]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=25, stratify=y
)
train_idx, test_idx = X_train.index, X_test.index
# Only the training rows are materialised. The original also bound the test
# rows to `_`, which shadows IPython's last-output variable in a notebook.
train = df.loc[train_idx, :]

# Renumber rows from 0 and store the labels as one-column string frames.
# Later cells compare the labels against '0' / '1'; presumably the string cast
# is also what makes table_from_frame build a discrete variable -- confirm.
X_train = X_train.reset_index(drop=True)
y_train = pd.DataFrame(y_train.reset_index(drop=True).astype(str))
X_test = X_test.reset_index(drop=True)
y_test = pd.DataFrame(y_test.reset_index(drop=True).astype(str))

# Build the Orange training table.
# Domain(...) is given explicit variable tuples: handing it Domain objects makes
# Orange iterate them, which is deprecated in favour of Domain.variables.
# Table.from_numpy is given the underlying arrays (.X) rather than Table objects.
orange_X = table_from_frame(X_train)
orange_y = table_from_frame(y_train)
orange_domain = Domain(orange_X.domain.variables, orange_y.domain.variables)
orange_table = Table.from_numpy(domain=orange_domain, X=orange_X.X, Y=orange_y.X)

# Same construction for the held-out test table.
orange_X_test = table_from_frame(X_test)
orange_y_test = table_from_frame(y_test)
orange_domain_test = Domain(orange_X_test.domain.variables, orange_y_test.domain.variables)
orange_table_test = Table.from_numpy(domain=orange_domain_test, X=orange_X_test.X, Y=orange_y_test.X)

# Fit CN2: the minimum-coverage threshold is sample_ratio of the training rows
# (plus one, truncated to int); rule length is capped at 1000.
learner = classification.CN2Learner()
learner.rule_finder.general_validator.min_covered_examples = int(sample_ratio * orange_table.n_rows + 1)
learner.rule_finder.general_validator.max_rule_length = 1000
classifier = learner(orange_table)


  class_vars = list(class_vars)
  attributes = list(attributes)


In [3]:
# predictive performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# classifier.predict yields a 2-D array (presumably one score per class -- confirm);
# argmax along axis 1 turns each row into a predicted class index.
pred = np.argmax(classifier.predict(orange_table_test.X), axis=1)
origin = orange_table_test.Y

# scikit-learn metrics take (y_true, y_pred) in that order; the original call
# had them swapped. The printed values are the same for these two metrics, but
# the order now matches the confusion_matrix(y_test, pred) call used later.
print(accuracy_score(origin, pred))
print(f1_score(origin, pred, average='macro'))

0.9629629629629629
0.9589939857845817


In [4]:
from sklearn.metrics import confusion_matrix

# Bring the true labels and the predictions to a common integer dtype.
y_test = y_test.astype(int)
pred = pred.astype(int)

# Tabulate true labels (first argument) against predictions (second argument).
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print(cm)

[[86  2]
 [ 3 44]]


In [5]:
import pandas as pd

def _split_rule(rule_text):
    """Return ``[condition, class_value]`` parsed from an ``'IF ... THEN class=...'`` rule string."""
    parts = rule_text.split('IF ')[1].split(' THEN class=')
    return [parts[0], parts[1]]


rules = classifier.rule_list

# One [condition, class] pair per learned rule, taken from the rule's text form.
rule_data = [_split_rule(str(rule)) for rule in rules]

# Lower-case the conjunction (' AND ' -> ' and ') so each condition can be fed
# to DataFrame.query later on, then drop the last row of the list
# (presumably CN2's catch-all default rule -- confirm).
rule_df = (
    pd.DataFrame(rule_data, columns=['Condition', 'Class'])
    .assign(Condition=lambda frame: frame['Condition'].str.replace(' AND ', ' and '))
    .iloc[:-1]
)

In [6]:
# Show the full contents of every column instead of truncating long strings.
# `None` means "no limit"; the old `-1` sentinel was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Display the parsed rule table as this cell's output.
rule_df

Unnamed: 0,Condition,Class
0,normal_nucleoli>=9.0,1
1,size_uniformity>=5.0 and clump_thickness>=7.0,1
2,bare_nucleoli>=6.0 and shape_uniformity>=5.0,1
3,normal_nucleoli<=3.0 and shape_uniformity<=2.0 and marginal_adhesion>=2.0,0
4,bare_nucleoli<=2.0 and epithelial_size<=4.0 and epithelial_size<=3.0,0
5,clump_thickness>=5.0 and bare_nucleoli>=2.0 and normal_nucleoli<=7.0,1
6,size_uniformity<=5.0 and mitoses<=3.0 and bare_nucleoli<=7.0,0


In [7]:
# Per-rule statistics, each list aligned with the rows of rule_df:
#   cover_count - training rows matching the rule's condition
#   coverage    - cover_count as a fraction of all training rows (3 decimals)
#   class_0/1   - matching rows labelled '0' / '1'
#   homogeneity - share of the majority class among matching rows (3 decimals)
cover_count = []
coverage = []
homogeneity = []
class_0 = []
class_1 = []

# Training features and string labels side by side, so conditions can be
# evaluated with DataFrame.query and the 'class' column inspected.
data = pd.concat([X_train, y_train], axis=1)

for condition in rule_df['Condition']:
    # Evaluate each condition once; the original ran the same query four times.
    covered = data.query(condition)
    n_covered = len(covered)
    n_class_0 = int((covered['class'] == '0').sum())
    n_class_1 = int((covered['class'] == '1').sum())

    cover_count.append(n_covered)
    coverage.append(round(n_covered / len(data), 3))
    class_0.append(n_class_0)
    class_1.append(n_class_1)
    # A rule that matches no rows has no defined majority share: record NaN
    # explicitly instead of relying on a 0/0 division.
    if n_covered:
        homogeneity.append(round(max(n_class_0, n_class_1) / n_covered, 3))
    else:
        homogeneity.append(float('nan'))
       

In [8]:
# Attach the per-rule statistics computed above as new columns, in this order.
rule_stats = {
    'cover_count': cover_count,
    'coverage': coverage,
    'homogeneity': homogeneity,
    'class_0': class_0,
    'class_1': class_1,
}
rule_df = rule_df.assign(**rule_stats)

In [9]:
# Order the rules by Class, then homogeneity, then coverage -- all descending.
sort_keys = ['Class', 'homogeneity', 'coverage']
rule_df = rule_df.sort_values(by=sort_keys, ascending=False)
rule_df

Unnamed: 0,Condition,Class,cover_count,coverage,homogeneity,class_0,class_1
2,bare_nucleoli>=6.0 and shape_uniformity>=5.0,1,103,0.191,1.0,0,103
1,size_uniformity>=5.0 and clump_thickness>=7.0,1,82,0.152,1.0,0,82
0,normal_nucleoli>=9.0,1,58,0.107,1.0,0,58
5,clump_thickness>=5.0 and bare_nucleoli>=2.0 and normal_nucleoli<=7.0,1,104,0.193,0.875,13,91
3,normal_nucleoli<=3.0 and shape_uniformity<=2.0 and marginal_adhesion>=2.0,0,48,0.089,1.0,48,0
4,bare_nucleoli<=2.0 and epithelial_size<=4.0 and epithelial_size<=3.0,0,317,0.587,0.994,315,2
6,size_uniformity<=5.0 and mitoses<=3.0 and bare_nucleoli<=7.0,0,369,0.683,0.932,344,25


In [10]:
# Number of atomic conditions in each rule: occurrences of ' and ' plus one.
rule_df['cond_count'] = rule_df['Condition'].str.count(' and ').add(1)

In [11]:
# cond_sign is 1 when a condition contains BOTH '<=' and '>=' (mixed
# directions) and 0 when it uses only one of them.
# NOTE(review): the original comment stated the opposite convention (1 when all
# signs are the same); this comment describes what the code computes -- confirm
# which meaning was intended.
uses_le = rule_df['Condition'].str.contains('<=')
uses_ge = rule_df['Condition'].str.contains('>=')
rule_df['cond_sign'] = (uses_le & uses_ge).astype(int)
rule_df

Unnamed: 0,Condition,Class,cover_count,coverage,homogeneity,class_0,class_1,cond_count,cond_sign
2,bare_nucleoli>=6.0 and shape_uniformity>=5.0,1,103,0.191,1.0,0,103,2,0
1,size_uniformity>=5.0 and clump_thickness>=7.0,1,82,0.152,1.0,0,82,2,0
0,normal_nucleoli>=9.0,1,58,0.107,1.0,0,58,1,0
5,clump_thickness>=5.0 and bare_nucleoli>=2.0 and normal_nucleoli<=7.0,1,104,0.193,0.875,13,91,3,1
3,normal_nucleoli<=3.0 and shape_uniformity<=2.0 and marginal_adhesion>=2.0,0,48,0.089,1.0,48,0,3,1
4,bare_nucleoli<=2.0 and epithelial_size<=4.0 and epithelial_size<=3.0,0,317,0.587,0.994,315,2,3,0
6,size_uniformity<=5.0 and mitoses<=3.0 and bare_nucleoli<=7.0,0,369,0.683,0.932,344,25,3,0


In [14]:
# Renumber the sorted rules 0..n-1. Rebinding to a new frame (instead of
# inplace=True) leaves the previously displayed frame object unmodified.
rule_df = rule_df.reset_index(drop=True)

In [15]:
# Display the final rule table (sorted, with statistics, index renumbered).
rule_df

Unnamed: 0,Condition,Class,cover_count,coverage,homogeneity,class_0,class_1,cond_count,cond_sign
0,bare_nucleoli>=6.0 and shape_uniformity>=5.0,1,103,0.191,1.0,0,103,2,0
1,size_uniformity>=5.0 and clump_thickness>=7.0,1,82,0.152,1.0,0,82,2,0
2,normal_nucleoli>=9.0,1,58,0.107,1.0,0,58,1,0
3,clump_thickness>=5.0 and bare_nucleoli>=2.0 and normal_nucleoli<=7.0,1,104,0.193,0.875,13,91,3,1
4,normal_nucleoli<=3.0 and shape_uniformity<=2.0 and marginal_adhesion>=2.0,0,48,0.089,1.0,48,0,3,1
5,bare_nucleoli<=2.0 and epithelial_size<=4.0 and epithelial_size<=3.0,0,317,0.587,0.994,315,2,3,0
6,size_uniformity<=5.0 and mitoses<=3.0 and bare_nucleoli<=7.0,0,369,0.683,0.932,344,25,3,0
