# Feature Selection using f_regression

## Load data
We are using the breast cancer dataset.  It has 30 features that are used to help 
determine whether a case is positive.

In [1]:
from IPython.display import display, HTML
from sklearn.datasets import load_breast_cancer
import pandas as pd

def display_df(df, rows=1):
    display(HTML(df.head(rows).to_html()))

# Load the Breast Cancer dataset
bc_data = load_breast_cancer()
print("dir(bc_data): ", dir(bc_data))

# Setup vars for values to be used throughout
X, y = bc_data.data, bc_data.target
all_feature_names = bc_data.feature_names
print(bc_data.DESCR)

dir(bc_data):  ['DESCR', 'data', 'data_module', 'feature_names', 'filename', 'frame', 'target', 'target_names']
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 

In [3]:
bc_df = load_breast_cancer(as_frame=True)
df = bc_df.frame
df['target'] = bc_data.target
# df.info(verbose=True)

df_min = df.min(numeric_only=True)
df_max = df.max(numeric_only=True)
df_mean = df.mean(numeric_only=True)
df_median = df.median(numeric_only=True)
df_mode = df.mode(numeric_only=True)
df_std = df.std(numeric_only=True)

df_stats = pd.concat([df_min, df_max, df_mean, df_median, df_std], axis = 1)
df_stats.columns = ["MIN", "MAX", "MEAN", "MEDIAN", "STD_DEV"]
print(y[20:100])
display_df(df_stats, 50)


[1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0
 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1
 0 0 1 1 1 0]


Unnamed: 0,MIN,MAX,MEAN,MEDIAN,STD_DEV
mean radius,6.981,28.11,14.127292,13.37,3.524049
mean texture,9.71,39.28,19.289649,18.84,4.301036
mean perimeter,43.79,188.5,91.969033,86.24,24.298981
mean area,143.5,2501.0,654.889104,551.1,351.914129
mean smoothness,0.05263,0.1634,0.09636,0.09587,0.014064
mean compactness,0.01938,0.3454,0.104341,0.09263,0.052813
mean concavity,0.0,0.4268,0.088799,0.06154,0.07972
mean concave points,0.0,0.2012,0.048919,0.0335,0.038803
mean symmetry,0.106,0.304,0.181162,0.1792,0.027414
mean fractal dimension,0.04996,0.09744,0.062798,0.06154,0.00706


In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the feature matrix
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Train a neural network classifier
clf = MLPClassifier(hidden_layer_sizes=(10,), random_state=42)
clf.fit(X_train_normalized, y_train)



## Train a neural network with our dataset

In [12]:
from sklearn.feature_selection import f_regression

# Perform feature selection using f_regression
f_scores, p_values = f_regression(X, y)

# Get the feature importance scores
feature_importance = abs(f_scores)

# Sort the features by importance in descending order
sorted_indices = feature_importance.argsort()[::-1]

feature_importance = { all_feature_names[i]: feature_importance[i] for i in sorted_indices }
for k, v in feature_importance.items():
    print(f"Feature {k}, Importance: {v:.4f}")


Feature worst concave points, Importance: 964.3854
Feature worst perimeter, Importance: 897.9442
Feature mean concave points, Importance: 861.6760
Feature worst radius, Importance: 860.7817
Feature mean perimeter, Importance: 697.2353
Feature worst area, Importance: 661.6002
Feature mean radius, Importance: 646.9810
Feature mean area, Importance: 573.0607
Feature mean concavity, Importance: 533.7931
Feature worst concavity, Importance: 436.6919
Feature mean compactness, Importance: 313.2331
Feature worst compactness, Importance: 304.3411
Feature radius error, Importance: 268.8403
Feature perimeter error, Importance: 253.8974
Feature area error, Importance: 243.6516
Feature worst texture, Importance: 149.5969
Feature worst smoothness, Importance: 122.4729
Feature worst symmetry, Importance: 118.8602
Feature mean texture, Importance: 118.0961
Feature concave points error, Importance: 113.2628
Feature mean smoothness, Importance: 83.6511
Feature mean symmetry, Importance: 69.5274
Feature 

# RFE Feature Selection

In [8]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


# Create a logistic regression estimator
estimator = LogisticRegression(
    max_iter=2000, 
    C=0.9, # Inverse regularization strength
    class_weight= 'balanced',
)

# Perform feature selection using RFE
selector = RFE(estimator, n_features_to_select=10, step=0.01)
selector = selector.fit(X, y)

# Get the selected feature indices & associated ranking
selected_feature_indices = selector.support_
ranking = selector.ranking_

# Print the selected feature names
selected_features = {
    all_feature_names[i]:ranking[i] \
      for i in range(len(all_feature_names)) \
      if selected_feature_indices[i]
}

print(f"Selected features({len(selected_features)}):", selected_features)
print(f"\nOut of all features({len(all_feature_names)}):", all_feature_names)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Selected features(10): {'mean radius': 1, 'mean concavity': 1, 'texture error': 1, 'perimeter error': 1, 'worst radius': 1, 'worst smoothness': 1, 'worst compactness': 1, 'worst concavity': 1, 'worst concave points': 1, 'worst symmetry': 1}

Out of all features(30): ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


## Render graphs

In [13]:
import networkx as nx
import matplotlib.pyplot as plt

# Create a graph
G = nx.DiGraph()

# Add nodes for features and diagnosis
for feature in all_feature_names:
    G.add_node(feature, type="feature", importance=feature_importance[feature])
    
G.add_node("Diagnosis", type="diagnosis")

# Visualize the graph
pos = nx.spring_layout(G)
node_colors = ["lightblue" if G.nodes[n]["type"] == "feature" else "orange" for n in G.nodes]
node_sizes = [1000 * G.nodes[n]["importance"] if G.nodes[n]["type"] == "feature" else 1000 for n in G.nodes]
edge_widths = [5 * G.edges[e]["weight"] for e in G.edges]

plt.figure(figsize=(12, 8))
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=node_sizes, alpha=0.8)
nx.draw_networkx_edges(G, pos, width=edge_widths, edge_color="gray", alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10)
plt.axis("off")
plt.title("Feature Importance Graph")
plt.show()

# Print feature importances
for i in range(len(feature_names)):
    print(f"{feature_names[i]}: {importances[i]:.4f}")