In [1]:
import os
# Set environment variables for controlling the number of threads in certain libraries
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import pandas as pd
import numpy as np
import time
import math
from itertools import product
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from RuleTree.stumps.instance_stumps import * 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
pt_stump = pt_stump_call()

In [2]:
from RuleTree import PairwiseDistanceTreeRegressor

In [3]:
from util_exp_funct import * 

In [4]:
dataset_name = 'breast'
dataframe = pd.read_csv(f'sensitivity_datasets/{dataset_name}.csv')
feature_names = dataframe.drop(columns=['label']).columns

In [5]:
feat_names = [
    "worst concave points",
    "worst area",
    "mean concave points",
    "mean concavity",
    "worst radius",
    "worst perimeter",
    "mean perimeter",
    "area error",
    "mean area",
    "mean radius"
]


In [6]:
column_indexes = []
for e, name in enumerate(feature_names):
    if name in feat_names:
        column_indexes.append(e)

column_indexes

[0, 2, 3, 6, 7, 13, 20, 22, 23, 27]

In [7]:
df_filtered = dataframe.iloc[:, column_indexes]
df_filtered
df_filtered['label'] = dataframe.label
feature_names = list(df_filtered.drop(columns=['label']).columns)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['label'] = dataframe.label


In [8]:
dataframe = df_filtered

In [9]:
dataframe

Unnamed: 0,mean radius,mean perimeter,mean area,mean concavity,mean concave points,area error,worst radius,worst perimeter,worst area,worst concave points,label
0,17.99,122.80,1001.0,0.30010,0.14710,153.40,25.380,184.60,2019.0,0.2654,0
1,20.57,132.90,1326.0,0.08690,0.07017,74.08,24.990,158.80,1956.0,0.1860,0
2,19.69,130.00,1203.0,0.19740,0.12790,94.03,23.570,152.50,1709.0,0.2430,0
3,11.42,77.58,386.1,0.24140,0.10520,27.23,14.910,98.87,567.7,0.2575,0
4,20.29,135.10,1297.0,0.19800,0.10430,94.44,22.540,152.20,1575.0,0.1625,0
...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,142.00,1479.0,0.24390,0.13890,158.70,25.450,166.10,2027.0,0.2216,0
565,20.13,131.20,1261.0,0.14400,0.09791,99.04,23.690,155.00,1731.0,0.1628,0
566,16.60,108.30,858.1,0.09251,0.05302,48.55,18.980,126.70,1124.0,0.1418,0
567,20.60,140.10,1265.0,0.35140,0.15200,86.22,25.740,184.60,1821.0,0.2650,0


In [10]:
X = dataframe.drop(columns = ['label']).values
y = np.array(dataframe.label)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)



In [11]:
num_samples = X_train.shape[0]  
percentages = [0.5] # Ensures 1.0 is included
sample_sizes = [math.floor(num_samples * p) for p in percentages]

In [12]:
fix_feat_dic = { 'feat_with_thr' : [True, True] }
absolute_val_dict = { 'both_original_and_diff' : [True, True] }

np.random.seed(90)  # Fix random seed
all_results_across_seeds = []
all_results_one_seed = []

perc, sample_size = 0.5, sample_sizes[-1]
print('PERC', perc)

results = {'dataset_name': dataset_name, 'random_seed': 90, 'perc': perc, 'sample_size': sample_size}

random_indices, selection_time = measure_time(np.random.choice, X_train.shape[0], sample_size, replace=False)        
X_train_N, y_train_N = X_train[random_indices], y_train[random_indices]
scaler = StandardScaler()

_, scaler_fit_time = measure_time(scaler.fit, X_train_N)
X_train_scaled, _ = measure_time(scaler.transform, X_train) 
X_train_N_scaled, _ = measure_time(scaler.transform, X_train_N)
X_test_scaled, _ = measure_time(scaler.transform, X_test)

dist_matrix_X_train_N_scaled, _ = measure_time(pairwise_distances, X_train_N_scaled, metric='euclidean')
dist_matrix_X_test_scaled, _ = measure_time(pairwise_distances, X_test_scaled, metric='euclidean')

df_X_train_N_scaled = df_pairwise(X_train_N_scaled, feature_names, dist_matrix_X_train_N_scaled, consider_abs_diff=True)
df_X_test_scaled = df_pairwise(X_test_scaled, feature_names, dist_matrix_X_test_scaled, consider_abs_diff=True)

new_feature_names = list(df_X_train_N_scaled.drop(columns=['indexes', 'overall_euclidean_distance_sklr']).columns)
X_train_N_scaled_pairwise = df_X_train_N_scaled[new_feature_names].values        
y_train_N_scaled_pairwise = df_X_train_N_scaled['overall_euclidean_distance_sklr'].values
X_test_scaled_pairwise = df_X_test_scaled[new_feature_names].values
y_test_scaled_pairwise = df_X_test_scaled['overall_euclidean_distance_sklr'].values

all_model_results_for_perc = []

max_depth = 4
fix_feature, fix_threshold = True, True
params_model = {'max_depth': max_depth, 'min_samples_leaf': 1, 'min_samples_split': 2,
                'random_state': 42, 'fix_feature': fix_feature, 'fix_threshold': fix_threshold}

model = PairwiseDistanceTreeRegressor(**params_model)
_, train_time = measure_time(model.fit, X_train_N_scaled, y_train_N_scaled_pairwise, X_train_N_scaled_pairwise)

importance_dict = {k: v for k, v in zip(new_feature_names, model.compute_feature_importances())}

X_train_curr, y_train_curr = X_train_scaled, y_train
res_r1_r2_test, knn_test_matrix_r1_r2 = concat_for_prediction(
    X_test_scaled, X_train_curr, feature_names, model, r1_r2_order=True, consider_abs_diff=True)
res_r1_r2_train, knn_train_matrix_r1_r2 = concat_for_prediction(
    X_train_curr, X_train_curr, feature_names, model, r1_r2_order=True, consider_abs_diff=True)

train_matrix = knn_train_matrix_r1_r2
test_matrix = knn_test_matrix_r1_r2

predictions = {"prediction_res_r1_r2_test": np.array(res_r1_r2_test['prediction'])}

knn_model = KNeighborsClassifier(n_neighbors=3, metric='precomputed')

knn_euclid = KNeighborsClassifier(n_neighbors=3)

knn_euclid.fit(X_train_curr, y_train_curr)

_, train_time = measure_time(knn_model.fit, train_matrix , y_train_curr)

y_pred_train, _ = measure_time(knn_model.predict, train_matrix)
y_pred_test, _ = measure_time(knn_model.predict, test_matrix)

all_model_results_for_perc.append({
    'dataset_type': 'full_dataset',
    'train_test_case': 'train_test_r1_r2_r1_r2',
    'predictions': predictions,
    'knn_params': {'n_neighbors': 3},
    'importance_dict': importance_dict,
    'train_time': train_time,
    'metrics_train': compute_metrics_classifier(y_train_curr, y_pred_train, 'KNN_measure_train'),
    'metrics_test': compute_metrics_classifier(y_test, y_pred_test, 'KNN_measure_test')
})

all_results_one_seed.append(all_model_results_for_perc)
all_results_across_seeds.append(all_results_one_seed)


PERC 0.5


In [13]:
pd.set_option('display.max_columns', None)

In [14]:
y_test[105]

0

In [15]:
knn_model.predict(test_matrix)

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1])

In [16]:
y_test[105]

0

In [17]:
#knn_model.kneighbors(X=test_matrix, n_neighbors=3, return_distance=True)

In [18]:
knn_model.predict(np.array([test_matrix[105]])) #predizione giusta

array([0])

In [19]:
knn_model.kneighbors(X=np.array([test_matrix[105]]), n_neighbors=3, return_distance=True)

(array([[1.90534659, 3.63334293, 3.63334293]]), array([[36, 31, 89]]))

In [20]:
model.print_rules(model.get_rules(new_feature_names))

|--- worst perimeter_abs_diff <= 1.481	51529
|   |--- worst perimeter_abs_diff <= 0.71	35869
|   |   |--- mean concavity_abs_diff <= 1.122	21483
|   |   |   |--- mean area_abs_diff <= 0.446	17683
|   |   |   |    output: 1.16
|   |   |   |--- mean area_abs_diff > 0.446
|   |   |   |    output: 1.91
|   |   |--- mean concavity_abs_diff > 1.122
|   |   |   |--- mean concavity_abs_diff <= 2.227	3800
|   |   |   |    output: 2.7
|   |   |   |--- mean concavity_abs_diff > 2.227
|   |   |   |    output: 4.22
|   |--- worst perimeter_abs_diff > 0.71
|   |   |--- mean concave points_abs_diff <= 1.484	14386
|   |   |   |--- mean area_abs_diff <= 0.99	10972
|   |   |   |    output: 2.69
|   |   |   |--- mean area_abs_diff > 0.99
|   |   |   |    output: 3.63
|   |   |--- mean concave points_abs_diff > 1.484
|   |   |   |--- area error_abs_diff <= 4.314	3414
|   |   |   |    output: 4.43
|   |   |   |--- area error_abs_diff > 4.314
|   |   |   |    output: 11.65
|--- worst perimeter_abs_diff > 1.

In [21]:
#knn_euclid.predict(X_test_scaled)

In [22]:
knn_euclid.kneighbors(X=np.array([X_test_scaled[105]]), n_neighbors=3, return_distance=True)

(array([[4.64327197, 8.89654565, 8.99599267]]), array([[343,  36, 153]]))

In [23]:
y_train[[36, 31, 89, 343,  36, 153]]

array([0, 0, 0, 0, 0, 0])

In [24]:
query_instance = [105]
nearest_neigh_PDT = [36, 31, 89]
dist_PDT = [1.90, 3.63, 3.63]

query_instance = [105]
nearest_neigh_EUCLID = [343,  36, 153]
dist_EUCLID = [4.64, 8.90, 8.99]

vet_class = list(y_train[[36, 31, 89, 343,  36, 153]])

In [25]:
list_ind = nearest_neigh_PDT + nearest_neigh_EUCLID
dists = dist_PDT + dist_EUCLID

In [26]:
test_instance = pd.DataFrame(X_test_scaled[[105]], columns = feature_names).applymap((lambda x : round(x,2)))
test_instance['distance'] = ['--']
test_instance['class'] = ['m']

  test_instance = pd.DataFrame(X_test_scaled[[105]], columns = feature_names).applymap((lambda x : round(x,2)))


In [27]:
test_instance

Unnamed: 0,mean radius,mean perimeter,mean area,mean concavity,mean concave points,area error,worst radius,worst perimeter,worst area,worst concave points,distance,class
0,3.84,3.98,5.44,3.29,3.09,10.84,4.3,4.52,6.58,2.16,--,m


In [28]:
A = pd.DataFrame(X_train_scaled[list_ind], columns = feature_names).applymap((lambda x : round(x,2)))
A['distance'] = dists
A['class'] = ['m' for x in vet_class]

  A = pd.DataFrame(X_train_scaled[list_ind], columns = feature_names).applymap((lambda x : round(x,2)))


In [29]:
print(A.to_latex(float_format= f"{{:0.2f}}".format))

\begin{tabular}{lrrrrrrrrrrrl}
\toprule
 & mean radius & mean perimeter & mean area & mean concavity & mean concave points & area error & worst radius & worst perimeter & worst area & worst concave points & distance & class \\
\midrule
0 & 3.34 & 3.44 & 3.99 & 2.97 & 3.67 & 2.41 & 3.66 & 3.83 & 4.59 & 2.36 & 1.90 & m \\
1 & 3.02 & 3.07 & 3.49 & 1.68 & 2.52 & 2.12 & 2.96 & 3.08 & 3.43 & 1.94 & 3.63 & m \\
2 & 3.19 & 3.33 & 3.60 & 2.93 & 3.49 & 1.69 & 2.97 & 3.27 & 3.27 & 2.52 & 3.63 & m \\
3 & 4.04 & 4.05 & 5.44 & 2.76 & 2.84 & 10.48 & 2.56 & 2.54 & 3.15 & 0.64 & 4.64 & m \\
4 & 3.34 & 3.44 & 3.99 & 2.97 & 3.67 & 2.41 & 3.66 & 3.83 & 4.59 & 2.36 & 8.90 & m \\
5 & 2.91 & 3.10 & 3.25 & 4.06 & 3.92 & 4.14 & 2.10 & 2.30 & 2.32 & 1.61 & 8.99 & m \\
\bottomrule
\end{tabular}



In [30]:
total= pd.concat((test_instance,A))

In [31]:
print(total.to_latex(float_format= f"{{:0.2f}}".format))

\begin{tabular}{lrrrrrrrrrrll}
\toprule
 & mean radius & mean perimeter & mean area & mean concavity & mean concave points & area error & worst radius & worst perimeter & worst area & worst concave points & distance & class \\
\midrule
0 & 3.84 & 3.98 & 5.44 & 3.29 & 3.09 & 10.84 & 4.30 & 4.52 & 6.58 & 2.16 & -- & m \\
0 & 3.34 & 3.44 & 3.99 & 2.97 & 3.67 & 2.41 & 3.66 & 3.83 & 4.59 & 2.36 & 1.90 & m \\
1 & 3.02 & 3.07 & 3.49 & 1.68 & 2.52 & 2.12 & 2.96 & 3.08 & 3.43 & 1.94 & 3.63 & m \\
2 & 3.19 & 3.33 & 3.60 & 2.93 & 3.49 & 1.69 & 2.97 & 3.27 & 3.27 & 2.52 & 3.63 & m \\
3 & 4.04 & 4.05 & 5.44 & 2.76 & 2.84 & 10.48 & 2.56 & 2.54 & 3.15 & 0.64 & 4.64 & m \\
4 & 3.34 & 3.44 & 3.99 & 2.97 & 3.67 & 2.41 & 3.66 & 3.83 & 4.59 & 2.36 & 8.90 & m \\
5 & 2.91 & 3.10 & 3.25 & 4.06 & 3.92 & 4.14 & 2.10 & 2.30 & 2.32 & 1.61 & 8.99 & m \\
\bottomrule
\end{tabular}

