In [1]:
# PREPROCESSING OF THE DATA - SAME AS WITH THE LOGISTIC REGRESSION MODEL
import pandas as pd

df = pd.read_csv('../data/breast-cancer-data.csv')

# Column dtypes and non-null counts before cleaning (shows which columns have gaps)
df.info()

# Removal of apostrophes: strip the quote character from BOTH ends of every
# text value, so that a leading apostrophe is removed as well as a trailing one.
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.strip("'")

# Checking that the apostrophes were removed. A bare `df.head()` in the middle
# of a cell is evaluated and thrown away, so the preview is printed explicitly.
print(df.head())

# Filling in missing values: each gap is replaced by the most frequent entry
# (the mode) of its column.
# NOTE(review): the mode is computed on the full dataset, i.e. before the
# train/test split - consider imputing inside the pipeline to avoid leakage.
for column in ['node-caps', 'breast-quad']:
    most_frequent_value = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent_value)

# Summary after imputation, printed once (not once per loop iteration)
df.info()
## ENCODING
from sklearn.preprocessing import LabelEncoder

# Separate the target column from the predictor columns
y = df['class']
X = df.drop(columns='class')

# Turn the class labels into integer codes
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Show which integer code the encoder assigned to each class label
class_codes = le.transform(le.classes_)
label_to_code = dict(zip(le.classes_, class_codes))
print(label_to_code)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Ordinal columns, each paired below with its categories listed from lowest to highest
ordinal_features = ['age', 'tumer-size', 'inv-nodes', 'deg-malig']

# Age bands are ten wide: '20-29' ... '70-79'
age_cats = [f'{start}-{start + 9}' for start in range(20, 80, 10)]
# Tumor-size bands are five wide: '0-4' ... '50-54'
tumer_size_cats = [f'{start}-{start + 4}' for start in range(0, 55, 5)]
# Written out explicitly because the bands are not contiguous ('15-17' is followed by '24-26')
inv_nodes_cats = ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26']
# Degree of malignancy, kept as strings: '1', '2', '3'
deg_malig_cats = [str(grade) for grade in range(1, 4)]

# One category list per ordinal column, in the same order as `ordinal_features`
ordinal_categories = [age_cats, tumer_size_cats, inv_nodes_cats, deg_malig_cats]

# Nominal (unordered) columns
nominal_features = ['menopause', 'node-caps', 'breast', 'breast-quad', 'irradiate']

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on the encoded
# target and uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumer-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    object
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiate    286 non-null    object
 9   class        286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumer-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-

In [3]:
# --- ANN STEP 1: Build the Artificial Neural Network (ANN) Pipeline ---
# Import necessary classes
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier

# 1. Define the preprocessor (this code is the same as before)
# OneHotEncoder produces sparse output, and ColumnTransformer returns a sparse
# matrix whenever the stacked result is sparse enough. The StandardScaler that
# follows centres the data by default, which is not possible on sparse input,
# so sparse_threshold=0 is set to always get a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        # Ordered categories -> integer ranks following `ordinal_categories`
        ('ord', OrdinalEncoder(categories=ordinal_categories), ordinal_features),
        # Unordered categories -> one indicator column per category;
        # categories not seen during fit are encoded as all zeros
        ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_features)
    ],
    sparse_threshold=0
)

# 2. Build the final pipeline for the ANN
ann_pipeline = Pipeline(steps=[
    # Preprocess (encode) the data
    ('preprocessor', preprocessor),

    # Scale the data (important for neural networks)
    ('scaler', StandardScaler()),

    # Apply the neural network classifier
    ('classifier', MLPClassifier(random_state=42, max_iter=1000))
])

In [4]:
# --- ANN STEP 2: Train and Evaluate the ANN Model ---
# Evaluation metrics used below
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit every pipeline step (encoding, scaling, network) on the training split
ann_pipeline.fit(X_train, y_train)

# Predict the classes of the held-out test split
y_pred_ann = ann_pipeline.predict(X_test)

# Compute the metrics first, then report them
ann_accuracy = accuracy_score(y_test, y_pred_ann)
ann_confusion = confusion_matrix(y_test, y_pred_ann)
ann_report = classification_report(y_test, y_pred_ann, target_names=le.classes_)

print('--- Artificial Neural Network (ANN) Performance ---')
print('Accuracy:', ann_accuracy)
print('\nConfusion Matrix:\n', ann_confusion)
print('\nClassification Report:\n', ann_report)

--- Artificial Neural Network (ANN) Performance ---
Accuracy: 0.603448275862069

Confusion Matrix:
 [[30 11]
 [12  5]]

Classification Report:
                       precision    recall  f1-score   support

no-recurrence-events       0.71      0.73      0.72        41
   recurrence-events       0.31      0.29      0.30        17

            accuracy                           0.60        58
           macro avg       0.51      0.51      0.51        58
        weighted avg       0.60      0.60      0.60        58



In [5]:
# --- ANN STEP 3: Hyperparameter Tuning for ANN ---
from sklearn.model_selection import GridSearchCV

# 1. Parameter grid for the ANN: three network architectures combined with
#    three regularization strengths (alpha). The 'classifier__' prefix routes
#    each setting to the pipeline step named 'classifier'.
hidden_layer_options = [(50, 50), (100,), (100, 50, 25)]
alpha_options = [0.0001, 0.001, 0.01]
param_grid_ann = {
    'classifier__hidden_layer_sizes': hidden_layer_options,
    'classifier__alpha': alpha_options,
}

# 2. Grid search over the whole pipeline with 5-fold cross-validation,
#    selecting the candidate with the best 'recall' score and using all
#    available CPU cores.
grid_search_ann = GridSearchCV(
    estimator=ann_pipeline,
    param_grid=param_grid_ann,
    cv=5,
    scoring='recall',
    n_jobs=-1,
)

# Run the search on the training data
grid_search_ann.fit(X_train, y_train)

# Report the winning parameter combination
print("Best ANN parameters found: ", grid_search_ann.best_params_)

# 3. Evaluate the best model found by the search on the test set
y_pred_tuned_ann = grid_search_ann.predict(X_test)

tuned_accuracy = accuracy_score(y_test, y_pred_tuned_ann)
tuned_confusion = confusion_matrix(y_test, y_pred_tuned_ann)
tuned_report = classification_report(y_test, y_pred_tuned_ann, target_names=le.classes_)

# Performance metrics for the tuned ANN model
print('\n--- Tuned ANN Performance ---')
print('Accuracy:', tuned_accuracy)
print('\nConfusion Matrix:\n', tuned_confusion)
print('\nClassification Report:\n', tuned_report)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Best ANN parameters found:  {'classifier__alpha': 0.0001, 'classifier__hidden_layer_sizes': (50, 50)}

--- Tuned ANN Performance ---
Accuracy: 0.5689655172413793

Confusion Matrix:
 [[29 12]
 [13  4]]

Classification Report:
                       precision    recall  f1-score   support

no-recurrence-events       0.69      0.71      0.70        41
   recurrence-events       0.25      0.24      0.24        17

            accuracy                           0.57        58
           macro avg       0.47      0.47      0.47        58
        weighted avg       0.56      0.57      0.57        58

