In [28]:
# Import needed packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, r_regression
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

In [29]:
# Load the Wisconsin Breast Cancer Database
wbcd = pd.read_csv('WisconsinBreastCancerDatabase.csv')

In [30]:
# Select and scale input features, create dataframe for output feature
X = wbcd[['Radius mean','Texture mean', 'Area mean', 
         'Compactness mean', 'Concavity mean', 'Concave points mean', 
          'Fractal dimension mean', 'Symmetry mean']]
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
y = wbcd[['Smoothness mean']]

In [31]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [32]:
# Perform feature selection using the SelectKBest function
model_kbest = SelectKBest(score_func=r_regression, k=5)
X_new_kbest = model_kbest.fit_transform(X_train,np.ravel(y_train))

# Perform feature selection using the SelectPercentile function
model_percent = SelectPercentile(score_func=r_regression, percentile=30)
X_new_percent = model_percent.fit_transform(X_train,np.ravel(y_train))

In [33]:
# Get features selected by each function
filter_kbest = model_kbest.get_support()
filter_percent = model_percent.get_support()

# Get input feature names
features = np.array(X_train.columns)

In [34]:
# Display feature names selected by the SelectKBest function
features[filter_kbest]

array(['Compactness mean', 'Concavity mean', 'Concave points mean',
       'Fractal dimension mean', 'Symmetry mean'], dtype=object)

In [35]:
# Display feature names selected by the SelectPercent function
features[filter_percent]

In [38]:
y_train

Unnamed: 0,Smoothness mean
190,0.10750
134,0.09430
386,0.08108
118,0.11550
316,0.07734
...,...
98,0.08983
322,0.11340
382,0.06935
365,0.09150


In [39]:
# Display the F-statistic and p-value for each feature
pd.DataFrame(r_regression(X_train, np.ravel(y_train)), 
             columns=X_train.columns, index=['Pearson correlation','p-value']).T

In [44]:
pd.DataFrame(r_regression(X_train, np.ravel(y_train)),index=X_train.columns, columns=['Pearson coefficient'])

Unnamed: 0,Pearson coefficient
Radius mean,0.141981
Texture mean,-0.049423
Area mean,0.151101
Compactness mean,0.656917
Concavity mean,0.51283
Concave points mean,0.541455
Fractal dimension mean,0.600566
Symmetry mean,0.557752


In [10]:
# Construct MLP classifier using all features and display classification accuracy
clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, np.ravel(y_train))
clf.score(X_test, y_test)

0.8859649122807017

In [11]:
# Construct MLP classifier using 5 best features and display classification accuracy
clf_reduced_kbest = MLPClassifier(random_state=1, 
                                  max_iter=300).fit(X_train[features[filter_kbest]], np.ravel(y_train))
clf_reduced_kbest.score(X_test[features[filter_kbest]], y_test)

0.9035087719298246

In [12]:
# Construct MLP classifier using the top 30% features and display classification accuracy
clf_reduced_percent = MLPClassifier(random_state=1, 
                              max_iter=1000).fit(X_train[features[filter_percent]], np.ravel(y_train))
clf_reduced_percent.score(X_test[features[filter_percent]], y_test)

0.8771929824561403