In [None]:
import numpy as np
from ucimlrepo import fetch_ucirepo
from printScore import *

# Download dataset id 17 from the UCI repository
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Features and targets (as pandas dataframes)
_uci_data = breast_cancer_wisconsin_diagnostic.data
X, y = _uci_data.features, _uci_data.targets

# Single working frame: every feature column followed by a 'target' column
df = X.copy()
df['target'] = y.copy()

# Column names, dtypes and non-null counts
df.info()

In [None]:
# Print all the different values of the target
print(df['target'].unique())
# Replace the target values with the corresponding numeric values:
# 'B' (benign) = 0, 'M' (malignant) = 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['target']: that form works on an intermediate
# Series, which pandas warns about and which leaves df untouched once
# Copy-on-Write is active. Values that are already 0/1 are not affected by the
# mapping, so re-running the cell is harmless.
# astype(int) makes the numeric dtype explicit for the correlation code below.
df['target'] = df['target'].replace({'B': 0, 'M': 1}).astype(int)

# Evaluation Function that implements Correlation Feature Selection
## Correlation Feature Selection (CFS) is a filter-based method that evaluates the worth of a subset of features by considering the individual predictive ability of each feature along with the degree of redundancy between them.

# Insert image CSF_formula.png


In [None]:
# Merit function: Correlation Feature Selection (CFS)
def cfs(df, selectedFeatures, target):
	"""Return the CFS merit of a feature subset with respect to the target.

	The merit is ``k * r_cf / sqrt(k + k * (k - 1) * r_ff)`` where ``k`` is the
	number of features in the subset, ``r_cf`` is the mean absolute correlation
	between each feature and the target, and ``r_ff`` is the mean absolute
	correlation between distinct features of the subset. Correlations come from
	``DataFrame.corr()`` with its default settings.

	Parameters
	----------
	df : pandas.DataFrame
		Frame holding the feature columns and the target column.
	selectedFeatures : list
		Column names of the subset to evaluate.
	target : hashable
		Name of the target column.

	Returns
	-------
	float
		The merit of the subset; 0.0 for an empty subset. For a single feature
		the merit reduces to the absolute feature-target correlation.
	"""
	features = list(selectedFeatures)
	k = len(features)
	# An empty subset carries no information; avoid the 0/0 division.
	if k == 0:
		return 0.0

	# Absolute correlations: a strong negative correlation is as useful as a positive one
	corr_matrix = df[features + [target]].corr().abs()

	# Relevance: mean feature-target correlation
	r_cf = corr_matrix.loc[features, target].mean()
	if k == 1:
		# No feature pairs exist, so there is no redundancy term
		return float(r_cf)

	# Redundancy: mean of the off-diagonal feature-feature correlations
	# (the diagonal holds each feature's correlation with itself and is excluded)
	feature_block = corr_matrix.loc[features, features].to_numpy()
	r_ff = (feature_block.sum() - np.trace(feature_block)) / (k * (k - 1))

	return float(k * r_cf / np.sqrt(k + k * (k - 1) * r_ff))

In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# Merit function: Mutual Information Feature Selection (MIFS)
def mifs(df, selectedFeatures, testFeature, target, random_state=0):
	"""Score a candidate feature by relevance to the target minus redundancy.

	The score is the summed mutual information between the target and each of
	the already selected features plus the candidate, minus the summed mutual
	information between the candidate and each already selected feature.

	Parameters
	----------
	df : pandas.DataFrame
		Frame holding the feature columns and the target column.
	selectedFeatures : list
		Column names of the features already chosen (may be empty).
	testFeature : hashable
		Column name of the candidate feature being scored.
	target : hashable
		Name of the target column, treated as a class label.
	random_state : int, default 0
		Seed forwarded to the scikit-learn estimators.

	Returns
	-------
	float
		The relevance-minus-redundancy score of ``testFeature``.
	"""
	selected = list(selectedFeatures)

	# Relevance: MI between the class label and each of (selected + candidate)
	mi = mutual_info_classif(df[selected + [testFeature]], df[target], random_state=random_state)

	# With nothing selected yet there is no redundancy to subtract, and
	# scikit-learn rejects an input with zero feature columns.
	if not selected:
		return np.sum(mi)

	# Redundancy: MI between the candidate and each selected feature.
	# The candidate is a feature column, not a class label, so the regression
	# estimator is used; the classification estimator rejects continuous labels.
	mi_selectedFeatures = mutual_info_regression(df[selected], df[testFeature], random_state=random_state)

	return np.sum(mi) - np.sum(mi_selectedFeatures)

In [None]:
# Implementation of the SFS algorithm
# This algorithm adds the best remaining feature to the selected features at each iteration
def selection_sfs(df, selected_features, remaining_features, target, measures, maxFeatures=np.inf):
	"""Greedy forward selection: repeatedly move the best-scoring feature into the selection.

	The score of a candidate is the sum of the requested measures:
	'variance' (variance of the candidate column), 'correlation' (absolute
	correlation of the candidate with the target), 'cfs' (``cfs`` merit of the
	current selection plus the candidate) and 'mifs' (``mifs`` score of the
	candidate given the current selection).

	Parameters
	----------
	df : pandas.DataFrame
		Frame holding the feature columns and the target column.
	selected_features : iterable
		Features already selected; not modified.
	remaining_features : iterable
		Candidate features; not modified.
	target : hashable
		Name of the target column.
	measures : iterable of str
		Any subset of 'variance', 'correlation', 'cfs', 'mifs'.
	maxFeatures : number, default np.inf
		Maximum number of features to add during this call.

	Returns
	-------
	tuple
		``(selected_features, score, remaining_features)``: the updated
		selection, the best score of each iteration, and the candidates left.

	Raises
	------
	ValueError
		If ``measures`` contains an unknown measure name.
	"""
	if not set(measures).issubset(['variance', 'correlation', 'cfs', 'mifs']): # check if the measures are valid
		raise ValueError('Invalid measures function')

	# Work on copies so the caller's lists are left untouched (same as selection_sbs)
	selected_features = list(selected_features)
	remaining_features = list(remaining_features)

	# best score of every iteration
	score = []
	counter = 0
	while counter < maxFeatures and len(remaining_features) > 0:
		best_score = -np.inf
		best_feature = None

		for feature in remaining_features:
			feature_score = 0

			if 'variance' in measures:
				feature_score += df[feature].var()
			if 'correlation' in measures:
				feature_score += np.abs(df[feature].corr(df[target]))
			if 'cfs' in measures:
				# merit of the selection extended with the candidate
				feature_score += cfs(df, selected_features + [feature], target)
			if 'mifs' in measures:
				# mifs takes the current selection and the candidate as separate arguments
				feature_score += mifs(df, selected_features, feature, target)

			if feature_score > best_score:
				best_score = feature_score
				best_feature = feature

		# No candidate produced a comparable score (e.g. every score is NaN): stop
		# instead of appending None to the selection.
		if best_feature is None:
			break

		selected_features.append(best_feature)
		remaining_features.remove(best_feature)
		score.append(best_score)
		counter += 1

	return selected_features, score, remaining_features

In [None]:
# Implementation of the SBS algorithm
# This algorithm mirrors SFS but removes the worst feature at each iteration
def selection_sbs(df, remaining_features, target, measures, maxFeatures=np.inf):
	"""Greedy backward elimination: repeatedly drop the lowest-scoring feature.

	The score of a feature is the sum of the requested measures, each expressed
	so that a LOW value means "least worth keeping":
	'variance' (variance of the column), 'correlation' (absolute correlation
	with the target), 'cfs' (how much the ``cfs`` merit of the current set
	drops when the feature is taken out) and 'mifs' (``mifs`` score of the
	feature given all the other remaining features).

	Parameters
	----------
	df : pandas.DataFrame
		Frame holding the feature columns and the target column.
	remaining_features : iterable
		Features to prune; not modified.
	target : hashable
		Name of the target column.
	measures : iterable of str
		Any subset of 'variance', 'correlation', 'cfs', 'mifs'.
	maxFeatures : number, default np.inf
		Maximum number of features to eliminate during this call.

	Returns
	-------
	tuple
		``(remaining_features, score, eliminated_features)``: the surviving
		features, the worst score of each iteration, and the features removed
		in elimination order. At least one feature always survives.

	Raises
	------
	ValueError
		If ``measures`` contains an unknown measure name.
	"""
	if not set(measures).issubset(['variance', 'correlation', 'cfs', 'mifs']): # check if the measures are valid
		raise ValueError('Invalid measures function')

	# worst score of every iteration
	score = []
	eliminated_features = []
	# list(...) copies the input and also accepts tuples / pandas Index objects
	remaining_features = list(remaining_features)
	while len(eliminated_features) < maxFeatures and len(remaining_features) > 1:
		worst_score = np.inf
		worst_feature = None

		# Merit of the complete current set; identical for every candidate, so computed once
		full_merit = cfs(df, remaining_features, target) if 'cfs' in measures else 0

		for feature in remaining_features:
			feature_score = 0
			# All the remaining features except the current one
			testFeatures = remaining_features.copy()
			testFeatures.remove(feature)

			if 'variance' in measures:
				feature_score += df[feature].var()
			if 'correlation' in measures:
				feature_score += np.abs(df[feature].corr(df[target]))
			if 'cfs' in measures:
				# Contribution of the feature: merit lost by taking it out. Using the
				# merit of the reduced set directly would make the minimum pick the
				# feature whose removal hurts the most, i.e. the best one.
				feature_score += full_merit - cfs(df, testFeatures, target)
			if 'mifs' in measures:
				# mifs takes the other features and the candidate as separate arguments
				feature_score += mifs(df, testFeatures, feature, target)

			if feature_score < worst_score:
				worst_score = feature_score
				worst_feature = feature

		# No feature produced a comparable score (e.g. every score is NaN): stop
		# instead of trying to remove None.
		if worst_feature is None:
			break

		remaining_features.remove(worst_feature)
		score.append(worst_score)
		eliminated_features.append(worst_feature)

	return remaining_features, score, eliminated_features

In [None]:
# The target is the last column of df; every other column is a candidate feature
target = df.columns[-1]
remaining_features = list(df.columns[:-1])
selected_features = []

# Forward pass: add at most two features to the (initially empty) selection
selected_features, score, remaining_features = selection_sfs(
	df,
	selected_features,
	remaining_features,
	target,
	measures=['variance', 'correlation', 'cfs'],
	maxFeatures=2,
)
# Backward pass over the features the forward pass left behind: drop at most three
remaining_features, worst_score, eliminated_features = selection_sbs(
	df,
	remaining_features,
	target,
	measures=['variance', 'correlation', 'cfs'],
	maxFeatures=3,
)

printFinalScore(selected_features, score, remaining_features, worst_score, eliminated_features)

In [None]:
# Implementation of the BDS algorithm using the SFS and SBS function
def selection_bds(df, selected_features, remaining_features, target, measures, maxFeatures=1):
	"""Alternate one SFS step and one SBS step until the selection is large enough.

	Each round moves one feature from the candidate pool into the selection
	(``selection_sfs``) and then discards one feature from what is left of the
	pool (``selection_sbs``). The state is reported through ``printFinalScore``
	after every round. The loop ends when ``maxFeatures`` features are selected
	or the pool is empty.

	Returns
	-------
	tuple
		``(selected_features, selected_features_score, remaining_features,
		eliminated_features_worst_score, eliminated_features)``.
	"""
	forward_scores = []
	backward_scores = []
	dropped_so_far = []

	# True when the most recent backward step could not drop anything
	nothing_dropped = False
	while True:
		if not (len(selected_features) < maxFeatures and len(remaining_features) > 0):
			break

		# one forward step, then one backward step on the leftover pool
		selected_features, step_scores, remaining_features = selection_sfs(
			df, selected_features, remaining_features, target, measures, maxFeatures=1
		)
		remaining_features, step_worst_scores, step_dropped = selection_sbs(
			df, remaining_features, target, measures, maxFeatures=1
		)

		forward_scores.extend(step_scores)
		# SBS returns an empty list once the pool is too small to shrink further
		nothing_dropped = step_dropped == []
		if not nothing_dropped:
			backward_scores.extend(step_worst_scores)
			dropped_so_far.extend(step_dropped)

		printFinalScore(selected_features, forward_scores, remaining_features, backward_scores, dropped_so_far)

	if nothing_dropped:
		print("Max features reached!")

	return selected_features, forward_scores, remaining_features, backward_scores, dropped_so_far

# The target is the last column of df; all the other columns start as candidates
target = df.columns[-1]
remaining_features = list(df.columns[:-1])
# Nothing is selected before the search starts
selected_features = []
# Run BDS; maxFeatures=100 exceeds the number of columns, so the search stops when the pool is empty
(
	selected_features,
	selected_features_score,
	remaining_features,
	eliminated_features_worst_score,
	eliminated_features,
) = selection_bds(
	df,
	selected_features,
	remaining_features,
	target,
	measures=['variance', 'correlation', 'cfs'],
	maxFeatures=100,
)

In [None]:
# Implementation of the LRS algorithm using the SFS and SBS function
def selection_lrs(df, selected_features, remaining_features, target, measures, l=1, r=1, maxFeatures=1):
	"""Alternate ``l`` SFS steps and ``r`` SBS steps until the selection is large enough.

	Each round moves up to ``l`` features from the candidate pool into the
	selection (``selection_sfs``) and then discards up to ``r`` features from
	what is left of the pool (``selection_sbs``). The state is reported through
	``printFinalScore`` after every round. The loop ends when ``maxFeatures``
	features are selected or the pool is empty.

	Parameters
	----------
	df : pandas.DataFrame
		Frame holding the feature columns and the target column.
	selected_features : list
		Features already selected.
	remaining_features : list
		Candidate pool.
	target : hashable
		Name of the target column.
	measures : iterable of str
		Measures forwarded to ``selection_sfs`` / ``selection_sbs``.
	l : int, default 1
		Features added per round; must be at least 1.
	r : int, default 1
		Features discarded from the pool per round.
	maxFeatures : number, default 1
		Upper bound on the size of the selection.

	Returns
	-------
	tuple
		``(selected_features, selected_features_score, remaining_features,
		eliminated_features_worst_score, eliminated_features)``.

	Raises
	------
	ValueError
		If ``l`` is smaller than 1.
	"""
	# Without a forward step the selection never grows and the loop below cannot end
	if l < 1:
		raise ValueError('l must be at least 1')

	selected_features_score = []
	eliminated_features_worst_score = []
	eliminated_features = []

	maxFeaturesOverLoad = False
	while(len(selected_features) < maxFeatures and len(remaining_features) > 0):
		# Never add more than what is still missing, so the selection cannot overshoot maxFeatures
		forward_steps = min(l, maxFeatures - len(selected_features))
		selected_features, selected_feature_score, remaining_features = selection_sfs(df, selected_features, remaining_features, target, measures, maxFeatures= forward_steps)
		remaining_features, eliminated_feature_worst_score, eliminated_feature = selection_sbs(df, remaining_features, target, measures, maxFeatures= r)

		selected_features_score.extend(selected_feature_score)
		# SBS returns an empty list once the pool is too small to shrink further
		maxFeaturesOverLoad = eliminated_feature == []
		if not maxFeaturesOverLoad:
			eliminated_features_worst_score.extend(eliminated_feature_worst_score)
			eliminated_features.extend(eliminated_feature)

		printFinalScore(selected_features, selected_features_score, remaining_features, eliminated_features_worst_score, eliminated_features)

	if maxFeaturesOverLoad:
		print("Max features reached!")

	return selected_features, selected_features_score, remaining_features, eliminated_features_worst_score, eliminated_features

# The target is the last column of df; all the other columns start as candidates
target = df.columns[-1]
remaining_features = list(df.columns[:-1])
selected_features = []
# Run LRS with its default l and r; maxFeatures=100 exceeds the number of columns
(
	selected_features,
	selected_features_score,
	remaining_features,
	eliminated_features_worst_score,
	eliminated_features,
) = selection_lrs(
	df,
	selected_features,
	remaining_features,
	target,
	measures=['variance', 'correlation', 'cfs'],
	maxFeatures=100,
)

In [None]:
# Implementation of the SFFS algorithm using the SFS and SBS function
def selection_sffs(df, selected_features, remaining_features, target, measures, maxFeatures=1):
	"""Alternate one SFS step and one SBS step, undoing the SBS step when it looks wrong.

	Each round moves one feature from the candidate pool into the selection
	(``selection_sfs``) and then discards one feature from what is left of the
	pool (``selection_sbs``). If the discarded feature scored higher than the
	feature that was just selected, the discard is rolled back: the feature
	returns to the pool and is not recorded as eliminated. The state is
	reported through ``printFinalScore`` after every round.

	NOTE(review): the previous rollback branch raised as soon as its condition
	held (it removed from the selection a feature that was never in it and
	called list methods on str/float items). Only the well-defined part of that
	branch - undoing the elimination - is kept here; confirm this is the
	intended floating rule.

	Returns
	-------
	tuple
		``(selected_features, selected_features_score, remaining_features,
		eliminated_features_worst_score, eliminated_features)``.
	"""
	selected_features_score = []
	eliminated_features_worst_score = []
	eliminated_features = []

	while(len(selected_features) < maxFeatures and len(remaining_features) > 0):
		selected_features, selected_feature_score, remaining_features = selection_sfs(df, selected_features, remaining_features, target, measures, maxFeatures= 1)
		remaining_features, eliminated_feature_worst_score, eliminated_feature = selection_sbs(df, remaining_features, target, measures, maxFeatures= 1)

		selected_features_score.extend(selected_feature_score)

		# SBS returns empty lists once the pool is too small to shrink further
		if eliminated_feature:
			# The discarded feature scored better than the feature just selected
			rollback = bool(selected_feature_score) and selected_feature_score[-1] < eliminated_feature_worst_score[-1]
			if rollback:
				# Put the feature back in the pool and keep it out of the eliminated
				# bookkeeping. The forward step is kept, so the pool still shrinks
				# every round and the loop terminates.
				remaining_features.extend(eliminated_feature)
				print("Feature removed!")
			else:
				eliminated_features_worst_score.extend(eliminated_feature_worst_score)
				eliminated_features.extend(eliminated_feature)

		printFinalScore(selected_features, selected_features_score, remaining_features, eliminated_features_worst_score, eliminated_features)

	return selected_features, selected_features_score, remaining_features, eliminated_features_worst_score, eliminated_features

# The target is the last column of df; all the other columns start as candidates
target = df.columns[-1]
remaining_features = list(df.columns[:-1])
selected_features = []
# Run SFFS; maxFeatures=100 exceeds the number of columns, so the search stops when the pool is empty
(
	selected_features,
	selected_features_score,
	remaining_features,
	eliminated_features_worst_score,
	eliminated_features,
) = selection_sffs(
	df,
	selected_features,
	remaining_features,
	target,
	measures=['variance', 'correlation', 'cfs'],
	maxFeatures=100,
)