In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from modules.stableDT import stableDecisionTree # file 제공
from modules.stableDT import visualize_tree # file 제공

# Get the Data 
raw_df = pd.read_csv('https://raw.githubusercontent.com/countifs/rawdata/main/UCI%20-%20Breast%20Cancer%20Wisconsin%20(Original)/breastCancer.csv')

def preprocess(raw_df):
    df = raw_df.copy()
    df = df[df['bare_nucleoli'] != '?'] # 16개의 '?' 행 제거
    df['bare_nucleoli'] = df['bare_nucleoli'].astype(int) # 문자열을 정수형으로 변환
    df.drop_duplicates(inplace=True) # 중복된 행 제거
    df.replace({'class': {2: 0, 4: 1}}, inplace=True)  # 2: Benign, 4: Malignant
    df.reset_index(drop=True, inplace=True) # index 재설정
    df.drop(columns=['id'], axis=1, inplace=True) # id 열 제거
    return df

df = preprocess(raw_df).copy()


# 트레인 테스트 데이터 분리
X = df.drop(columns='class')
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25, stratify=y)

from modules.stableCRI import stable_concise_rule_induction # file 제공
data = pd.concat([X_train, y_train], axis=1)
target_name = 'class'

sample_ratio = 0.05

# 최소 리프노드 샘플 수
n_samples = int(sample_ratio * len(X_train))


min_samples = round(sample_ratio * len(X_train))
max_depth = 1000 # 처음에는 2 또는 3으로 설정해서 돌려보고, 소요시간을 체크해보시기 바랍니다.

# 모델 생성
cri = stable_concise_rule_induction(min_samples = min_samples, max_depth = max_depth, algorithm='adaptive', simplify = True)
cri.fit(data = data , target_name = target_name, iter_num = 20, rule_rate = 0.9)


# 결과 정리

rule_class = []
cover_count = []
coverage = []
class_0 = []
class_1 = []
homogeneity = []
train = pd.concat([X_train, y_train], axis=1)

df = cri.df_concise_rule.T
rule_list = df['Rule']

# rule_list 에서 ,를 ' and '로 변경
rule_list = rule_list.str.replace(',', ' and ')
   
for i in range(len(rule_list)):
    cover_count.append(len(train.query(rule_list[i]) ))
    coverage.append(round(len(train.query(rule_list[i])) / len(train), 3 ) )
    class_0.append( sum (train.query(rule_list[i])['class'] == 0) )
    class_1.append( sum (train.query(rule_list[i])['class'] == 1) )
    homogeneity.append(round( max(class_0[i], class_1[i]) / cover_count[i], 3 ) )
    rule_class.append(0 if class_0[i] > class_1[i] else 1)
    
rule_df = pd.DataFrame({'rule':rule_list, 'class':rule_class, 'cover_count':cover_count, 'coverage':coverage, 'homogeneity': homogeneity,
                    'class_0':class_0, 'class_1':class_1})    

# condition컬럼에서 and 개수 찾기
rule_df['cond_count'] = rule_df['rule'].str.count(' and ') + 1 

#condition에서 부호가 모두 같으면 1, 다르면 0
rule_df['justifiability'] = rule_df['rule'].str.contains('>=') & rule_df['rule'].str.contains('<')
rule_df['justifiability'] = rule_df['justifiability'].astype(int)

pd.set_option('display.max_colwidth', None)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train_d.loc[:] = self.multi_to_integer(y_train_d)


In [4]:
rule_df

Unnamed: 0,rule,class,cover_count,coverage,homogeneity,class_0,class_1,cond_count,justifiability
0,shape_uniformity < 1.5,0,271,0.502,0.996,270,1,1,0
1,size_uniformity >= 4.5,1,134,0.248,0.993,1,133,1,0
2,size_uniformity < 1.5,0,295,0.546,0.99,292,3,1,0
3,bare_nucleoli >= 7.5,1,126,0.233,0.976,3,123,1,0


In [5]:
data = pd.concat([X_train, y_train], axis=1)

In [171]:
len(data)

540

In [170]:
data['class'].value_counts()

0    351
1    189
Name: class, dtype: int64

In [3]:
# CRI
rule1 = 'shape_uniformity < 1.5'
rule2 = 'size_uniformity >= 4.5'
rule3 = 'size_uniformity < 1.5'
rule4 = 'bare_nucleoli >= 7.5'

In [49]:
# HH

rule1 = 'shape_uniformity < 1.5'
rule2 = 'shape_uniformity >= 1.5 and size_uniformity >= 4.5'
rule3 = 'shape_uniformity >= 1.5 and size_uniformity < 4.5 and size_uniformity <1.5'
rule4 = 'shape_uniformity >= 1.5 and size_uniformity < 4.5 and size_uniformity >= 1.5 and bare_nucleoli >= 7.5'
rule5 = 'shape_uniformity >= 1.5 and size_uniformity < 4.5 and size_uniformity >= 1.5 and bare_nucleoli < 7.5 and normal_nucleoli < 1.5'
rule6 = 'shape_uniformity >= 1.5 and size_uniformity < 4.5 and size_uniformity >= 1.5 and bare_nucleoli < 7.5 and normal_nucleoli >= 1.5'

In [60]:
# cart

rule1 = 'size_uniformity <= 2.5 and bare_nucleoli <= 2.5 and size_uniformity <= 1.5'
rule2 = 'size_uniformity > 2.5 and size_uniformity > 4.5 and bare_nucleoli > 4.5'
rule3 = 'size_uniformity <= 2.5 and bare_nucleoli <= 2.5 and size_uniformity > 1.5'
rule4 = 'size_uniformity > 2.5 and size_uniformity > 4.5 and bare_nucleoli <= 4.5'
rule5 = 'size_uniformity > 2.5 and size_uniformity <= 4.5 and bare_nucleoli > 3.5'
rule6 = 'size_uniformity > 2.5 and size_uniformity <= 4.5 and bare_nucleoli <= 3.5'
rule7 = 'size_uniformity <= 2.5 and bare_nucleoli > 2.5'

In [10]:
data = pd.concat([X_test, y_test], axis=1)

# 테스트 데이터 셋 평가

In [119]:
print(sum(data.query(rule2_0)['class']== 0) )
print(sum(data.query(rule2_0)['class']== 1) )
print(len(data.query(rule2_0) ) / len(data) )

2
37
0.28888888888888886


In [107]:
data = pd.concat([X_test, y_test], axis=1)

In [108]:
len(data)

135

In [164]:
print(sum(data.query(rule1_3)['class']== 0) )
print(sum(data.query(rule1_3)['class']== 1) )
print(len(data.query(rule1_3) ) / len(data) )

1
27
0.2074074074074074


In [165]:
print(sum(data.query(rule2_0)['class']== 0) )
print(sum(data.query(rule2_0)['class']== 1) )
print(len(data.query(rule2_0) ) / len(data) )

2
37
0.28888888888888886


In [166]:
print(sum(data.query(rule3_2)['class']== 0) )
print(sum(data.query(rule3_2)['class']== 1) )
print(len(data.query(rule3_2) ) / len(data) )

1
29
0.2222222222222222


In [167]:
print(sum(data.query(rule4_1)['class']== 0) )
print(sum(data.query(rule4_1)['class']== 1) )
print(len(data.query(rule4_1) ) / len(data) )

2
35
0.2740740740740741


In [138]:
print(sum(data.query(rule5)['class']== 0) )
print(sum(data.query(rule5)['class']== 1) )
print(len(data.query(rule5) ) / len(data) )

80
0
0.5925925925925926


In [139]:
print(sum(data.query(rule6)['class']== 0) )
print(sum(data.query(rule6)['class']== 1) )
print(len(data.query(rule6) ) / len(data) )

86
3
0.6592592592592592


In [140]:
print(sum(data.query(rule7)['class']== 0) )
print(sum(data.query(rule7)['class']== 1) )
print(len(data.query(rule7) ) / len(data) )

3
21
0.17777777777777778


In [128]:
data['class'].value_counts()

0    88
1    47
Name: class, dtype: int64

---

In [124]:
# Ripper

rule1_3 = 'bland_chromatin >= 4 and bare_nucleoli >= 8'
rule2_0 = 'size_uniformity >= 4 and size_uniformity >= 5'
rule3_2 = 'shape_uniformity >= 4 and marginal_adhesion >= 4'
rule4_1 = 'bare_nucleoli >= 3 and clump_thickness >= 5'

# 1. rule1

In [161]:
print(len(data))
print(data.query(rule2_0)['class'].eq(0).sum() )
print(data.query(rule2_0)['class'].eq(1).sum() )

135
2
37


# 2. (not rule1) and (rule2)

In [162]:
print(len(data.query(f'~({rule2_0})')) )

print(data.query(f'~({rule2_0})').query(rule4_1)['class'].eq(0).sum() )
print(data.query(f'~({rule2_0})').query(rule4_1)['class'].eq(1).sum() )

96
0
8


# 3. (not rule1) and (not rule2) and (rule3)

In [None]:
print(len(data.query(f'~({rule3_0})').query(f'~({rule2_1})')) )

print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(rule1_2)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(rule1_2)['class'].eq(1).sum() )

104
2
7


# 4. (not rule1) and (not rule2) and (not rule3) and (rule4)

In [None]:
print(len(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})') ) )

print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(rule4_3)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(rule4_3)['class'].eq(1).sum() )

95
13
2


---

In [142]:
# CN2

rule1_2 = 'bare_nucleoli>=6.0 and shape_uniformity>=5.0'
rule2_1 = 'size_uniformity>=5.0 and clump_thickness>=7.0'
rule3_0 = 'normal_nucleoli>=9.0'
rule4_3 = 'normal_nucleoli<=3.0 and shape_uniformity<=2.0 and marginal_adhesion>=2.0'
rule5_4 = 'bare_nucleoli<=2.0 and epithelial_size<=4.0 and epithelial_size<=3.0'
rule6_6 = 'size_uniformity<=5.0 and mitoses<=3.0 and bare_nucleoli<=7.0'
rule7_5 = 'clump_thickness>=5.0 and bare_nucleoli>=2.0 and normal_nucleoli<=7.0'

# 1. rule1

In [143]:
print(len(data))
print(data.query(rule3_0)['class'].eq(0).sum() )
print(data.query(rule3_0)['class'].eq(1).sum() )

135
0
17


# 2. (not rule1) and (rule2)

In [145]:
print(len(data.query(f'~({rule3_0})')) )

print(data.query(f'~({rule3_0})').query(rule2_1)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(rule2_1)['class'].eq(1).sum() )

118
0
14


# 3. (not rule1) and (not rule2) and (rule3)

In [146]:
print(len(data.query(f'~({rule3_0})').query(f'~({rule2_1})')) )

print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(rule1_2)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(rule1_2)['class'].eq(1).sum() )

104
2
7


# 4. (not rule1) and (not rule2) and (not rule3) and (rule4)

In [151]:
print(len(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})') ) )

print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(rule4_3)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(rule4_3)['class'].eq(1).sum() )

95
13
2


# 5. (not rule1) and (not rule2) and (not rule3) and (not rule4) and (rule5)

In [155]:
print(len(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})') ) )

print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(rule5_4)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(rule5_4)['class'].eq(1).sum() )

80
69
0


# (not rule1) and (not rule2) and (not rule3) and (not rule4) and (not rule5) and (rule6)

In [153]:
print(len(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(f'~({rule5_4})') ) )
          
          
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(f'~({rule5_4})').query(rule7_5)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(f'~({rule5_4})').query(rule7_5)['class'].eq(1).sum() )

11
0
6


# (not rule1) and (not rule2) and (not rule3) and (not rule4) and (not rule5) and (not rule6) and (rule7)

In [154]:
print(len(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(f'~({rule5_4})').query(f'~({rule7_5})') ) )
          
          
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(f'~({rule5_4})').query(f'~({rule7_5})').query(rule6_6)['class'].eq(0).sum() )
print(data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(f'~({rule5_4})').query(f'~({rule7_5})').query(rule6_6)['class'].eq(1).sum() )

5
4
0


In [160]:
data.query(f'~({rule3_0})').query(f'~({rule2_1})').query(f'~({rule1_2})').query(f'~({rule4_3})').query(f'~({rule5_4})').query(f'~({rule7_5})') .query(f'~({rule6_6})') 

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
673,4,8,6,4,3,4,10,6,1,1


In [4]:
# predictive performance
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

data_test = pd.concat([X_test, y_test], axis=1)
pred_y, acc, f1 = cri.predict(data_test)
print(acc)
print(f1)

0.9704
0.9674


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[:] = self.multi_to_integer(y)


In [7]:
import pandas as pd

# Create two dataframes
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'A': [4, 5, 6], 'B': [7, 8, 9]})

# Concatenate the two dataframes
union = pd.concat([df1, df2], axis=0, ignore_index=True)

print(union)

   A  B
0  1  4
1  2  5
2  3  6
3  4  7
4  5  8
5  6  9


In [43]:
rule1 = data.query('shape_uniformity < 1.5')
rule2 = data.query('size_uniformity >= 4.5')
rule3 = data.query('size_uniformity < 1.5')
rule4 = data.query('bare_nucleoli >= 7.5')

In [53]:
index_union = rule1.index.union(rule2.index)
data.loc[index_union].

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,5,1,1,1,2,1,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
5,8,10,10,8,7,10,9,7,1,1
6,1,1,1,1,2,10,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
669,3,1,1,1,2,1,2,1,2,0
670,3,1,1,1,3,2,1,1,1,0
671,2,1,1,1,2,1,1,1,1,0
672,5,10,10,3,7,3,8,10,2,1


In [58]:
round(max(len(rule1[rule1['class'] == 0]), len(rule1[rule1['class'] == 1])) / len(rule1), 3)

0.996

In [61]:
def qualit_rule_df(rule_df):
    cover = len(rule_df)/len(data)
    class_0_len = len(rule_df[rule_df['class'] == 0])
    class_1_len = len(rule_df[rule_df['class'] == 0])
    homo = round(max(class_0_len, class_1_len) / len(rule_df), 3)
    
    print(cover)
    print(class_0_len)
    print(class_1_len)
    print(homo)
    
    return 
    
qualit_rule_df(rule1)

0.5018518518518519
270
270
0.996


In [47]:
import pandas as pd

# 샘플 데이터프레임 생성
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'A': [2, 5, 6], 'B': [5, 8, 9]}, index=['b', 'e', 'f'])

# 인덱스의 합집합 구하기
index_union = df1.index.union(df2.index)

# 결과 출력
print(index_union)

Index(['a', 'b', 'c', 'e', 'f'], dtype='object')


In [10]:
import pandas as pd

# Create two dataframes
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=[1, 2, 3])
df2 = pd.DataFrame({'A': [4, 5, 6], 'B': [7, 8, 9]}, index=[3, 4, 5])

# Merge the two dataframes
union = pd.merge(df1, df2, how='inner', left_index=True, right_index=True)

print(union)

   A_x  B_x  A_y  B_y
3    3    6    4    7


In [13]:
import pandas as pd

# Create two dataframes
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'B': [4, 5, 6], 'D': [7, 8, 9]})

# Concatenate the two dataframes
union = pd.concat([df1, df2], axis=0)

print(union)

     A  B    D
0  1.0  4  NaN
1  2.0  5  NaN
2  3.0  6  NaN
0  NaN  4  7.0
1  NaN  5  8.0
2  NaN  6  9.0


In [14]:
import pandas as pd

# Create two dataframes
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'A': [4, 5, 6], 'B': [7, 8, 9]})

# Concatenate the two dataframes
union = pd.concat([df1, df2], ignore_index=True)

print(union)

   A  B
0  1  4
1  2  5
2  3  6
3  4  7
4  5  8
5  6  9


In [26]:
import pandas as pd

# Create two dataframes with index
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'A': [2, 5, 6], 'B': [5, 8, 9]}, index=['b', 'e', 'f'])


In [27]:
df1

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [28]:
df2

Unnamed: 0,A,B
b,2,5
e,5,8
f,6,9


In [29]:
# Concatenate the two dataframes with index
union = pd.concat([df1, df2], ignore_index=False)

print(union)

   A  B
a  1  4
b  2  5
c  3  6
b  2  5
e  5  8
f  6  9


In [30]:
import pandas as pd

# Create two dataframes with index
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'A': [2, 5, 6], 'B': [5, 8, 9]}, index=['b', 'e', 'f'])

# Concatenate the two dataframes with index
union = pd.concat([df1, df2])

# Remove duplicate index values
union = union[~union.index.duplicated(keep='first')]

print(union)

   A  B
a  1  4
b  2  5
c  3  6
e  5  8
f  6  9


In [33]:
import pandas as pd

# 샘플 데이터프레임 생성
df1 = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6]}, index = ['a', 'b', 'c'])

# 샘플 데이터프레임 생성
df2 = pd.DataFrame({'A': [3, 2, 3],
                    'B': [6, 5, 6]}, index = ['d', 'b', 'c'])


In [40]:
# 두 개의 데이터프레임을 행 방향으로 합치기, 같은 행은 하나로
df3 = pd.concat([df1, df2], axis=0, ignore_index=True)
df3

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6
3,3,6
4,2,5
5,3,6


In [41]:
import pandas as pd

# 샘플 데이터프레임 생성
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index = ['a', 'b', 'c'])
# 샘플 데이터프레임 생성
df2 = pd.DataFrame({'A': [3, 2, 3], 'B': [6, 5, 6]}, index = ['d', 'b', 'c'])

# 두 데이터프레임을 세로 방향으로 합치고 중복되는 행을 제거하여 하나의 데이터프레임으로 만듦
union = pd.concat([df1, df2], join='outer')
union = union[~union.index.duplicated(keep='first')]

print(union)

   A  B
a  1  4
b  2  5
c  3  6
d  3  6


In [42]:
import pandas as pd

# 샘플 데이터프레임 생성
df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
                   'B': [6, 7, 8, 9, 10],
                   'C': ['a', 'b', 'c', 'd', 'e']})

# 'A' 열이 3 미만이 아닌 행만 선택
df_filtered = df[~(df['A'] < 3)]

print(df_filtered)

   A   B  C
2  3   8  c
3  4   9  d
4  5  10  e


---

In [105]:
# Ripper
rule1 = 'size_uniformity >= 4 and size_uniformity >= 5'
rule2 = 'bare_nucleoli >= 3 and clump_thickness >= 5'
rule3 = 'shape_uniformity >= 4 and marginal_adhesion >= 4'
rule4 = 'bland_chromatin >= 4 and bare_nucleoli >= 8'

In [85]:
data = pd.concat([X_train, y_train], axis=1)

In [130]:
# CN2
rule0 = 'normal_nucleoli>=9.0'
rule1 = 'size_uniformity>=5.0 and clump_thickness>=7.0'
rule2 = 'bare_nucleoli>=6.0 and shape_uniformity>=5.0'
rule3 = 'normal_nucleoli<=3.0 and shape_uniformity<=2.0 and marginal_adhesion>=2.0'
rule4 = 'bare_nucleoli<=2.0 and epithelial_size<=4.0 and epithelial_size<=3.0'
rule5 = 'clump_thickness>=5.0 and bare_nucleoli>=2.0 and normal_nucleoli<=7.0'
rule6 = 'size_uniformity<=5.0 and mitoses<=3.0 and bare_nucleoli<=7.0'

In [92]:
def rule_quality(df, data):
    
    class1 = len(df[df['class'] == 0])
    class0 = len(df[df['class'] == 1])
    
    # class1 또는 class 0 중 max
    m = max(class1, class0)
    
    print('coverage :', round(len(df) / len(data), 3), f'({len(df)})')
    print('homo : ', m / (class1+class0) )
    print('class 0 :', len(df[df['class'] == 0]) / len(df), f'({len(df[df["class"] == 0])})')
    print('class 1 :', len(df[df['class'] == 1]) / len(df), f'({len(df[df["class"] == 1])})')

In [122]:
rule_bar = data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})')
rule_bar['class'].value_counts()

0    303
1     41
Name: class, dtype: int64

---

In [131]:
rule_quality(data.query(rule0), data)

coverage : 0.107 (58)
homo :  1.0
class 0 : 0.0 (0)
class 1 : 1.0 (58)


In [140]:
rule_quality(data.query(f'~({rule0})').query(rule1), data.query(f'~({rule0})'))

coverage : 0.11 (53)
homo :  1.0
class 0 : 0.0 (0)
class 1 : 1.0 (53)


In [141]:
rule_quality(data.query(f'~({rule0})').query(f'~({rule1})').query(rule2), data.query(f'~({rule0})').query(f'~({rule1})'))

coverage : 0.086 (37)
homo :  1.0
class 0 : 0.0 (0)
class 1 : 1.0 (37)


In [142]:
rule_quality(data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(rule3), 
             data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})'))

coverage : 0.122 (48)
homo :  1.0
class 0 : 1.0 (48)
class 1 : 0.0 (0)


In [135]:
rule_quality(data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})').query(rule4), 
             data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})'))

coverage : 0.797 (274)
homo :  0.9963503649635036
class 0 : 0.9963503649635036 (273)
class 1 : 0.0036496350364963502 (1)


In [136]:
rule_quality(data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})').query(f'~({rule4})').query(rule5), 
             data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})').query(f'~({rule4})'))

coverage : 0.429 (30)
homo :  0.8666666666666667
class 0 : 0.13333333333333333 (4)
class 1 : 0.8666666666666667 (26)


In [137]:
rule_quality(data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})').query(f'~({rule4})').query(f'~({rule5})').query(rule6), 
             data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})').query(f'~({rule4})').query(f'~({rule5})'))

coverage : 0.725 (29)
homo :  0.8620689655172413
class 0 : 0.8620689655172413 (25)
class 1 : 0.13793103448275862 (4)


In [138]:
rule_bar = data.query(f'~({rule0})').query(f'~({rule1})').query(f'~({rule2})').query(f'~({rule3})').query(f'~({rule4})').query(f'~({rule5})').query(f'~({rule6})')
rule_bar['class'].value_counts()

1    10
0     1
Name: class, dtype: int64