In [1]:
import fcalc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

## Diabetes Dataset

In [2]:
# Diabetes Dataset

column_names_d = ['col2', 'col2.1', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'y']

diabetes_dataset = pd.read_csv('Datasets/diabetes.csv', names = column_names_d)

diabetes_dataset['y']= [x == '1' for x in diabetes_dataset['y']]
diabetes_dataset.sample(10)

Unnamed: 0,col2,col2.1,col3,col4,col5,col6,col7,col8,y
658,1,120,80,48,200,38.9,1.162,41,False
47,1,146,56,0,0,29.7,0.564,29,False
722,1,114,66,36,200,38.1,0.289,21,False
732,8,120,86,0,0,28.4,0.259,22,True
557,1,97,70,40,0,38.1,0.218,30,False
285,2,108,80,0,0,27.0,0.259,52,True
351,4,92,80,0,0,42.2,0.237,29,False
343,1,0,68,35,0,32.0,0.389,22,False
277,7,106,60,24,0,26.5,0.296,29,True
175,2,75,64,24,55,29.7,0.37,33,False


### Binarized Binary Classifier

In [3]:
x1 = pd.get_dummies(diabetes_dataset[column_names_d[:-1]], prefix=column_names_d[:-1]).astype(bool)
y1 = diabetes_dataset['y']

x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.3, random_state=42)                         

In [4]:
#Binarized Binary Classifier

bin_cls = fcalc.classifier.BinarizedBinaryClassifier(x1_train.values, y1_train.to_numpy(), method="standard-support")

In [6]:
bin_cls.predict(x1_test.values)
print(bin_cls.predictions)

[0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1.
 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 0. 0.
 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0.
 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0.
 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1.
 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]


In [7]:
print(accuracy_score(y1_test, bin_cls.predictions))
print(f1_score(y1_test, bin_cls.predictions)) 

0.5887445887445888
0.45086705202312144


### Pattern Binary Classifier

In [7]:
x1 = diabetes_dataset[column_names_d[:-1]]
y1 = diabetes_dataset['y']

x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.3, random_state=42)

In [8]:
# Pattern Binary Classifier

pat_cls = fcalc.classifier.PatternBinaryClassifier(x1_train.values, y1_train.to_numpy(), 
                                             categorical=np.arange(x1_train.shape[1]))   

In [9]:
pat_cls.predict(x1_test.values)
print(pat_cls.predictions)

In [10]:
print(accuracy_score(y1_test, pat_cls.predictions))
print(f1_score(y1_test, pat_cls.predictions))

0.6190476190476191
0.45


## Iris Dataset

In [11]:
# Iris Dataset

column_names_i = ['sepal_length','sepal_width','petal_length','petal_width','species']

iris_dataset = pd.read_csv('data_sets/iris.data', names = column_names_i)
iris_dataset['species'] = [x == 'Iris-setosa' for x in iris_dataset['species']]
iris_dataset.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
124,6.7,3.3,5.7,2.1,False
66,5.6,3.0,4.5,1.5,False
82,5.8,2.7,3.9,1.2,False
105,7.6,3.0,6.6,2.1,False
88,5.6,3.0,4.1,1.3,False
6,4.6,3.4,1.4,0.3,True
102,7.1,3.0,5.9,2.1,False
104,6.5,3.0,5.8,2.2,False
48,5.3,3.7,1.5,0.2,True
35,5.0,3.2,1.2,0.2,True


### Binarized Binary Classifier

In [12]:
x2 = pd.get_dummies(iris_dataset[column_names_i[:-1]], prefix=column_names_i[:-1]).astype(bool)
y2 = iris_dataset['species']

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.3, random_state=42)

ValueError: Length of 'prefix' (4) did not match the length of the columns being encoded (0).

In [None]:
# Binarized Binary Classifier

bin_cls = fcalc.classifier.BinarizedBinaryClassifier(x2_train.values, y2_train.to_numpy(), method="standard-support")

In [None]:
bin_cls.predict(x2_test.values)

In [None]:
print(accuracy_score(y2_test, bin_cls.predictions))
print(f1_score(y2_test, bin_cls.predictions))

### Pattern Binary Classifier

In [12]:
x2 = iris_dataset.iloc[:,:-1]
y2 = iris_dataset['species']

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.3, random_state=42)

In [13]:
# Pattern Binary Classifier

pat_cls = fcalc.classifier.PatternBinaryClassifier(x2_train.values, y2_train.to_numpy())

In [14]:
pat_cls.predict(x2_test.values)
print(pat_cls.predictions)

In [15]:
print("accuracy:",round(accuracy_score(y2_test, pat_cls.predictions),4))
print("f1 score:",round(f1_score(y2_test, pat_cls.predictions),4))

accuracy: 1.0
f1 score: 1.0


## Breast Cancer Dataset

In [16]:
# Breast Cancer Dataset

column_names_c = ['radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'diagnosis']

cancer_dataset = pd.read_csv('Datasets/bcancer.csv', names = column_names_c)
# cancer_dataset['diagnosis'] = cancer_dataset['diagnosis'].map({'B': 0, 'M': 1})
cancer_dataset['diagnosis']= [x == 'M' for x in cancer_dataset['diagnosis']]
cancer_dataset.sample(10)

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
498,12.47,17.31,80.45,480.1,0.08928,0.0763,0.03609,0.02369,0.1526,0.06046,...,24.34,92.82,607.3,0.1276,0.2506,0.2028,0.1053,0.3035,0.07661,False
133,16.16,21.54,106.2,809.8,0.1008,0.1284,0.1043,0.05613,0.216,0.05891,...,31.68,129.7,1175.0,0.1395,0.3055,0.2992,0.1312,0.348,0.07619,True
118,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,...,27.37,127.1,1095.0,0.1878,0.448,0.4704,0.2027,0.3585,0.1065,True
129,15.1,16.39,99.58,674.5,0.115,0.1807,0.1138,0.08534,0.2001,0.06467,...,18.33,105.9,762.6,0.1386,0.2883,0.196,0.1423,0.259,0.07779,False
87,14.48,21.46,94.25,648.2,0.09444,0.09947,0.1204,0.04938,0.2075,0.05636,...,29.25,108.4,808.9,0.1306,0.1976,0.3349,0.1225,0.302,0.06846,True
375,13.69,16.07,87.84,579.1,0.08302,0.06374,0.02556,0.02031,0.1872,0.05669,...,20.21,99.16,670.6,0.1105,0.2096,0.1346,0.06987,0.3323,0.07701,False
234,20.51,27.81,134.4,1319.0,0.09159,0.1074,0.1554,0.0834,0.1448,0.05592,...,37.38,162.7,1872.0,0.1223,0.2761,0.4146,0.1563,0.2437,0.08328,True
186,10.08,15.11,63.76,317.5,0.09267,0.04695,0.001597,0.002404,0.1703,0.06048,...,21.18,75.39,437.0,0.1521,0.1019,0.00692,0.01042,0.2933,0.07697,False
454,14.53,13.98,93.86,644.2,0.1099,0.09242,0.06895,0.06495,0.165,0.06121,...,16.93,103.1,749.9,0.1347,0.1478,0.1373,0.1069,0.2606,0.0781,False
107,11.64,18.33,75.17,412.5,0.1142,0.1017,0.0707,0.03485,0.1801,0.0652,...,29.26,85.51,521.7,0.1688,0.266,0.2873,0.1218,0.2806,0.09097,False


### Binarized Binary Classifier

In [17]:
x3 = pd.get_dummies(cancer_dataset[column_names_c[:-1]], prefix=column_names_c[:-1]).astype(bool)
y3 = cancer_dataset['diagnosis']

x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=0.3, random_state=42)

In [18]:
# Binarized Binary Classifier

bin_cls = fcalc.classifier.BinarizedBinaryClassifier(x3_train.values, y3_train.to_numpy(), method="standard-support")

In [19]:
bin_cls.predict(x3_test.values)
print(bin_cls.predictions)

In [20]:
print(accuracy_score(y3_test, bin_cls.predictions))
print(f1_score(y3_test, bin_cls.predictions, average = 'macro'))

0.5964912280701754
0.4034148289062684


### Pattern Binary Classifier

In [21]:
# x3 = cancer_dataset.iloc[:,:-1]
x3 = cancer_dataset.drop('diagnosis', axis=1)
y3 = cancer_dataset['diagnosis']

x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=0.3, random_state=42)

In [22]:
# Pattern Binary Classifier

pat_cls = fcalc.classifier.PatternBinaryClassifier(x3_train.values, y3_train.to_numpy(), 
                                             categorical=np.arange(x3_train.shape[1]))

In [23]:
pat_cls.predict(x3_test.values)
print(pat_cls.predictions)

In [24]:
print(accuracy_score(y3_test, pat_cls.predictions))
print(f1_score(y3_test, pat_cls.predictions, average = 'macro'))

0.6608187134502924
0.4400781589231437
