In [2]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw table from a hosted CSV; per the URL path this is the UCI
# "Breast Cancer Wisconsin (Original)" dataset. Requires network access.
raw_df = pd.read_csv('https://raw.githubusercontent.com/countifs/rawdata/main/UCI%20-%20Breast%20Cancer%20Wisconsin%20(Original)/breastCancer.csv')

def preprocess(raw_df):
    """Return a cleaned copy of the raw breast-cancer table.

    Steps, applied in this order:
      1. drop rows whose ``bare_nucleoli`` value is the placeholder ``'?'``
         (16 rows in this dataset, per the original author's note);
      2. cast ``bare_nucleoli`` from string to integer;
      3. drop fully duplicated rows (``id`` is still present at this point,
         so two rows only count as duplicates if their ids match too);
      4. recode ``class`` from 2 (benign) / 4 (malignant) to 0 / 1;
      5. renumber the index from 0 and drop the ``id`` column.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns ``id``, ``bare_nucleoli`` and ``class``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw_df`` itself is never modified.
    """
    has_known_nucleoli = raw_df['bare_nucleoli'] != '?'

    # Every step returns a new frame, so nothing is assigned onto a filtered
    # slice and no in-place mutation is needed.
    return (
        raw_df.loc[has_known_nucleoli]
        .astype({'bare_nucleoli': int})
        .drop_duplicates()
        .replace({'class': {2: 0, 4: 1}})  # 2: Benign, 4: Malignant
        .reset_index(drop=True)
        .drop(columns=['id'])
    )

# Cleaned modelling table. preprocess() already starts from a copy of raw_df;
# the trailing .copy() hands back one more independent frame (presumably to
# avoid SettingWithCopyWarning on later assignments — confirm before removing).
data = preprocess(raw_df).copy()


In [4]:
# Hand-entered classification rules, one string per rule, in the form
#   "(<condition>) and (<condition>) => class=<label> (<n1>/<n2>)".
# NOTE(review): the trailing "(n1/n2)" pair looks like the covered/misclassified
# counts printed by a rule learner such as RIPPER/JRip — confirm the source.
rule_list = [
    '(size_uniformity >= 4) and (size_uniformity >= 5) => class=1 (134.0/1.0)',
    '(bare_nucleoli >= 3) and (clump_thickness >= 5) => class=1 (51.0/6.0)',
    '(shape_uniformity >= 4) and (marginal_adhesion >= 4) => class=1 (6.0/1.0)',
    '(bland_chromatin >= 4) and (bare_nucleoli >= 8) => class=1 (4.0/0.0)',
]


In [5]:
colnm = data.columns

# Separate the binary target from the predictor columns.
y = data['class']
X = data.drop(columns='class')
feature_names = X.columns

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=25,
)

In [6]:
# Display the rule strings as entered, before the parsing step in the next cell.
rule_list 

['(size_uniformity >= 4) and (size_uniformity >= 5) => class=1 (134.0/1.0)',
 '(bare_nucleoli >= 3) and (clump_thickness >= 5) => class=1 (51.0/6.0)',
 '(shape_uniformity >= 4) and (marginal_adhesion >= 4) => class=1 (6.0/1.0)',
 '(bland_chromatin >= 4) and (bare_nucleoli >= 8) => class=1 (4.0/0.0)']

In [7]:
# Reduce every rule to a bare DataFrame.query expression:
#   1. keep only the antecedent, i.e. the text to the left of '=>';
#   2. remove the parentheses wrapping each condition (safe here because the
#      rules are flat 'and' conjunctions with no nested grouping);
#   3. trim the whitespace that the split leaves behind, so the stored rule
#      strings compare equal to the same expression written by hand.
# Re-running this cell is harmless: already-parsed strings pass through unchanged.
rule_list = [
    rule.split('=>')[0].replace('(', '').replace(')', '').strip()
    for rule in rule_list
]


In [8]:
# Put predictors and target side by side so a rule can be evaluated with
# DataFrame.query and the class counts read from the matching rows.
train = pd.concat([X_train, y_train], axis=1)


def _rule_stats(frame, rule):
    """Summarise how one query-style rule behaves on ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to evaluate the rule on; must contain a ``class`` column coded 0/1.
    rule : str
        Expression accepted by ``DataFrame.query``.

    Returns
    -------
    dict
        ``rule``        the rule string itself
        ``class``       majority class among covered rows (ties resolve to 1)
        ``cover_count`` number of rows the rule matches
        ``coverage``    cover_count / len(frame), rounded to 3 decimals
        ``homogeneity`` share of the majority class among covered rows,
                        rounded to 3 decimals; NaN when nothing is covered
        ``class_0`` / ``class_1``  covered rows per class
    """
    covered = frame.query(rule)  # evaluate the rule once and reuse the result
    n_covered = len(covered)
    n_class_0 = int((covered['class'] == 0).sum())
    n_class_1 = int((covered['class'] == 1).sum())
    nan = float('nan')
    return {
        'rule': rule,
        'class': 0 if n_class_0 > n_class_1 else 1,
        'cover_count': n_covered,
        'coverage': round(n_covered / len(frame), 3) if len(frame) else nan,
        # A rule that matches no rows has no defined purity; report NaN
        # instead of dividing by zero.
        'homogeneity': round(max(n_class_0, n_class_1) / n_covered, 3) if n_covered else nan,
        'class_0': n_class_0,
        'class_1': n_class_1,
    }


rule_df = pd.DataFrame(
    [_rule_stats(train, rule) for rule in rule_list],
    columns=['rule', 'class', 'cover_count', 'coverage', 'homogeneity', 'class_0', 'class_1'],
)

# Per-rule lists under their original names, in case other cells still read them.
rule_class = rule_df['class'].tolist()
cover_count = rule_df['cover_count'].tolist()
coverage = rule_df['coverage'].tolist()
class_0 = rule_df['class_0'].tolist()
class_1 = rule_df['class_1'].tolist()
homogeneity = rule_df['homogeneity'].tolist()

In [9]:
# Number of conditions per rule: conditions are joined by ' and ', so the
# count is the number of joins plus one.
rule_df['cond_count'] = rule_df['rule'].str.count(' and ') + 1

# Mixed-direction flag: 1 when a rule contains both a '>=' and a '<'
# (the latter also matches '<='), 0 when it does not.
# NOTE(review): the comment previously attached to this step described the
# opposite encoding (1 = all conditions share the same sign, 0 = they differ).
# The code produces 1 for MIXED signs — confirm which meaning is intended
# before relying on this column.
uses_ge = rule_df['rule'].str.contains('>=')
uses_lt = rule_df['rule'].str.contains('<')
rule_df['cond_sign'] = (uses_ge & uses_lt).astype(int)

# Show rule strings in full instead of truncating them in the table display.
pd.set_option('display.max_colwidth', None)
rule_df

Unnamed: 0,rule,class,cover_count,coverage,homogeneity,class_0,class_1,cond_count,cond_sign
0,size_uniformity >= 4 and size_uniformity >= 5,1,134,0.248,0.993,1,133,2,0
1,bare_nucleoli >= 3 and clump_thickness >= 5,1,156,0.289,0.955,7,149,2,0
2,shape_uniformity >= 4 and marginal_adhesion >= 4,1,120,0.222,0.975,3,117,2,0
3,bland_chromatin >= 4 and bare_nucleoli >= 8,1,108,0.2,1.0,0,108,2,0


In [53]:
# Training rows matched by the first rule, evaluated on the full training set.
rule1 = train.query('size_uniformity >= 4 and size_uniformity >= 5')

In [54]:
# Training rows matched by the second rule, evaluated on the full training set
# (i.e. without first removing the rows the first rule already covers).
rule2 = train.query('bare_nucleoli >= 3 and clump_thickness >= 5')

In [55]:
# Index labels of the training rows matched by BOTH rule1 and rule2, i.e. the
# overlap between the two rules when each is evaluated independently.
common_index = rule1.index.intersection(rule2.index)
common_index

Int64Index([208, 285, 111, 120, 371, 485, 337, 560, 470, 340,
            ...
            249, 502,  47, 462, 566,  54, 167, 211,  53, 178],
           dtype='int64', length=105)

In [28]:
# Apply the rules in order: train1 keeps the rows that do NOT satisfy the
# first rule, and train2 is what the second rule matches among those
# remaining rows.
train1 = train.query('not (size_uniformity >= 4 and size_uniformity >= 5)')
train2 = train1.query('bare_nucleoli >= 3 and clump_thickness >= 5')


In [36]:
# Class balance of the rows the second rule matches once the first rule's rows
# have been removed. train2 is built in the preceding cell, so it is not
# recomputed here.
train2['class'].value_counts()

1    45
0     6
Name: class, dtype: int64

In [40]:
# Rows still uncovered after the first two rules: remove the second rule's
# matches from train1 (which already excludes the first rule's matches).
train3 = train1.query('not(bare_nucleoli >= 3 and clump_thickness >= 5)')

In [None]:
# Preview the rows left uncovered by the first two rules.
train3.head()

In [43]:
# Rows the third rule matches among those left uncovered by the first two rules.
train3.query('shape_uniformity >= 4 and marginal_adhesion >= 4')

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
84,3,3,6,4,5,8,4,4,1,1
301,4,4,4,4,6,5,7,3,1,0
581,5,4,6,8,4,1,8,10,1,1
99,2,3,4,4,2,5,2,5,1,1
328,3,4,4,10,5,1,3,3,1,1
15,7,4,6,4,6,1,4,3,1,1


In [11]:
# Shape of the training rows that fail the first (three-condition) group but
# satisfy the second group, expressed as a single query on the full training set.
# NOTE(review): the thresholds and the extra bland_chromatin condition differ
# from the entries in rule_list — presumably a different candidate rule set;
# confirm whether this cell is still relevant.
train.query('not ((size_uniformity >= 3 and size_uniformity >= 5 and bland_chromatin >= 5)) and (bare_nucleoli >= 3 and clump_thickness >= 7)' ).shape

(51, 10)

In [None]:
# Order-aware versions of the rules: each entry after the first is prefixed
# with the negation of the rule before it, so it only matches rows that the
# earlier rule left uncovered.
# TODO: only the first two rules are written out; presumably the remaining
# rules are meant to follow the same pattern — confirm and complete.
rule_list2 = [
    'size_uniformity >= 4 and size_uniformity >= 5',
    'not ((size_uniformity >= 4 and size_uniformity >= 5)) and (bare_nucleoli >= 3 and clump_thickness >= 5)',
]

In [12]:
# Shape of the rows in train1 with bare_nucleoli >= 3 and clump_thickness >= 7.
# NOTE(review): this uses clump_thickness >= 7 while the matching entry in
# rule_list uses >= 5 — confirm which threshold is intended.
train1.query('bare_nucleoli >= 3 and clump_thickness >= 7').shape

(51, 10)

In [13]:
# Show cell contents in full (no truncation of long rule strings) when frames
# are displayed.
# NOTE(review): the same option is already set in an earlier cell, so this
# call is redundant on a top-to-bottom run.
pd.set_option('display.max_colwidth', None)

In [14]:
# Re-display the per-rule summary table.
rule_df

Unnamed: 0,rule,class,cover_count,coverage,homogeneity,class_0,class_1,cond_count,cond_sign
0,size_uniformity >= 4 and size_uniformity >= 5,1,134,0.248,0.993,1,133,2,0
1,bare_nucleoli >= 3 and clump_thickness >= 5,1,156,0.289,0.955,7,149,2,0
2,shape_uniformity >= 4 and marginal_adhesion >= 4,1,120,0.222,0.975,3,117,2,0
3,bland_chromatin >= 4 and bare_nucleoli >= 8,1,108,0.2,1.0,0,108,2,0
