In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras import layers

In [None]:
# Show the installed TensorFlow version so the environment is recorded in the notebook output.
tf.__version__

# Breast Cancer Wisconsin with TensorFlow

In this notebook you will learn how to classify structured data using Keras preprocessing layers. You will use [Keras](https://www.tensorflow.org/guide/keras) to define the model, and [preprocessing layers](https://www.tensorflow.org/guide/keras/preprocessing_layers) as a bridge to map from columns in a CSV to features used to train the model. 

The content is based on a [tutorial](https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers) from the TensorFlow team. Some utility functions used in this notebook are copied from this tutorial, and we recommend referring to it for more details.

You will train and deploy a Neural Network to predict whether breast cancer is benign or malignant (see [Breast Cancer Wisconsin (Diagnostic) Data Set](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/data) at Kaggle).

### Use Pandas to create a dataframe

Import the training data from a public GitHub URL and load it into a dataframe.

In [None]:
# Location of the raw CSV in the course's public GitHub repository.
dataset_url = "https://raw.githubusercontent.com/ellenhvn/hhz-artificial-intelligence-vl-s23/main/Exercise%20Material/Cancer%20with%20Tensorflow/data.csv"
# Download the file (cached below the current working directory) and get its local path.
csv_file = tf.keras.utils.get_file("data.csv", dataset_url, cache_dir=".")
# Keep the freshly loaded frame untouched; the preparation cells work on a copy of it.
dataframe_unmodified = pd.read_csv(csv_file)


In [None]:
# Preview the first rows of the raw data.
dataframe_unmodified.head()

### Prepare data

Data preparation and analysis for this dataset was covered in depth in a different notebook (see "[HHZ] Cancer (Exercise)").

In [None]:
# Work on a copy so the originally loaded frame stays available unchanged.
dataframe = dataframe_unmodified.copy()

In [None]:
# Check for missing values: number of null entries per column.
dataframe.isnull().sum()

In [None]:
# Map malignant (denoted by "M") to 1 and benign (denoted by "B") to 0.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `dataframe.diagnosis`: that chained in-place
# call works on an intermediate Series and does not update `dataframe` when
# pandas copy-on-write is active.
# Reading the labels from `dataframe_unmodified` keeps the cell idempotent,
# i.e. running it twice yields the same 0/1 column.
dataframe["diagnosis"] = dataframe_unmodified["diagnosis"].map({"M": 1, "B": 0})

In [None]:
# Rename the columns whose names contain a space so that all column names
# follow the same snake_case pattern.
# `rename` returns a new frame that is re-bound to `dataframe` instead of
# mutating it with inplace=True. Re-running the cell is harmless because
# labels that are no longer present are ignored by `rename` by default.
dataframe = dataframe.rename(
    columns={
        "concave points_mean": "concave_points_mean",
        "concave points_worst": "concave_points_worst",
        "concave points_se": "concave_points_se",
    }
)

# Note: without this transformation tf.keras.models.load_model may fail in the last step with a confusing error message if these columns are used as a predictor.

In [None]:
# Preview the prepared frame (numeric diagnosis, consistently named columns).
dataframe.head()

### Split the dataframe into train, validation, and test

In [None]:
# Shuffle once with a fixed seed so the split (and every downstream metric)
# is reproducible across "Restart & Run All".
shuffled = dataframe.sample(frac=1, random_state=42)

# 80% train / 10% validation / 10% test.
# Positional slicing replaces np.split, which relies on the deprecated
# DataFrame.swapaxes when it is handed a DataFrame.
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

print(f"{len(train)} train examples")
print(f"{len(val)} validation examples")
print(f"{len(test)} test examples")

### Create an input pipeline using tf.data

Next, you will wrap the dataframes with tf.data, in order to shuffle and batch the data. If you were working with a very large CSV file (so large that it does not fit into memory), you would use tf.data to read it from disk directly. That is not covered in this tutorial.

Note: The utility functions below are copied from https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers and have been adapted for our use case.

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Wrap a pandas DataFrame in a batched ``tf.data.Dataset``.

    Args:
        dataframe: Frame holding the predictor columns plus the
            "diagnosis" label column. It is not modified.
        shuffle: If True, shuffle the examples with a buffer as large as
            the frame.
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding ``(features, labels)`` batches. ``features`` is a
        dict mapping each non-label column name to a tensor with a trailing
        axis of size 1; ``labels`` holds the "diagnosis" values.
    """
    df = dataframe.copy()
    labels = df.pop("diagnosis")
    # Build the feature dict from `df` (label already popped), not from
    # `dataframe`; otherwise the target leaks into the features.
    # Convert to NumPy before adding the axis: multi-dimensional indexing
    # directly on a Series is not supported by pandas 2.x.
    features = {key: value.to_numpy()[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

Now that you have created the input pipeline, let's call it to see the format of the data it returns. You have used a small batch size to keep the output readable.

In [None]:
# Tiny, unshuffled batches purely for inspecting the pipeline output.
batch_size = 5
train_ds = df_to_dataset(train, shuffle=False, batch_size=batch_size)

In [None]:
# Pull exactly one batch out of the dataset; the single-element unpacking
# fails if take(1) yields anything other than one (features, labels) pair.
[(train_features, label_batch)] = train_ds.take(1)
print(f"Every feature: {list(train_features.keys())}")
print(f'A batch of mean symmetry values: {train_features["symmetry_mean"]}')
print(f"A batch of targets {label_batch}")

In [None]:
# Helper around the Keras Normalization preprocessing layer, see
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/Normalization
def get_normalization_layer(name, dataset):
    """Return a Normalization layer adapted to one feature of ``dataset``.

    Args:
        name: Key of the feature inside the dataset's feature dict.
        dataset: Dataset yielding ``(features, labels)`` pairs, where
            ``features`` can be indexed by ``name``.

    Returns:
        A ``layers.Normalization(axis=None)`` instance whose statistics
        were learned from the selected feature.
    """

    def select_feature(features, _labels):
        # Drop the labels and keep only the requested feature.
        return features[name]

    feature_normalizer = layers.Normalization(axis=None)
    feature_normalizer.adapt(dataset.map(select_feature))
    return feature_normalizer

All potential features are numeric. The only layer type we need in this example is a "normalization layer". Let's take a look at an example to see how the encoding works:

*Create a normalization layer for 'symmetry_mean'*

In [None]:
# Raw (not yet normalized) values of one feature from the sample batch.
symmetry_mean_col = train_features["symmetry_mean"]
symmetry_mean_col

In [None]:
# Adapt a normalization layer on the training dataset and apply it to the
# sample batch to see the normalized values.
layer = get_normalization_layer("symmetry_mean", train_ds)
layer(symmetry_mean_col)

#### Choose which columns to use

You have seen how to use a normalization preprocessing layer. Now you will use it to train a model. You will be using the [Keras functional API](https://www.tensorflow.org/guide/keras/functional) to build the model. The Keras functional API is a way to create models that are more flexible than the [tf.keras.Sequential API](https://www.tensorflow.org/api_docs/python/tf/keras/Sequential).

In [None]:
# Rebuild the datasets with the batch size used for training. Only the
# training set is shuffled (df_to_dataset's default); validation and test
# keep their row order.
batch_size = 64
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [None]:
# List all columns with their dtypes and non-null counts to help choose predictors.
dataframe.info()

In [None]:
# option 1: a suitable subset of predictors (e.g. predictors that are not correlated)
# The names must match the column names of `dataframe` exactly, because they
# are used as keys into the dataset's feature dict.
feature_columns1 = [
    "radius_mean",
    "smoothness_mean",
    "compactness_mean",
    "symmetry_mean",
    "fractal_dimension_mean",
    "radius_se",
    "texture_se",
    "smoothness_se",
    "compactness_se",
    "symmetry_se",
    "fractal_dimension_se",
    "symmetry_worst",
]

In [None]:
# option 2: all predictors, i.e. every column except the target ("diagnosis")
# and the id, which is dropped because we know the id is just a random number
# without an impact on the result
feature_columns2 = dataframe.columns.drop(["id", "diagnosis"])

In [None]:
# Select the predictor set the model is built from (use feature_columns2 to train on all predictors).
feature_columns = feature_columns1

In [None]:
# Build one Keras input per selected column and pair it with a
# normalization layer adapted on the training data.
all_inputs, encoded_features = [], []

for header in feature_columns:
    feature_input = tf.keras.Input(shape=(1,), name=header)
    all_inputs.append(feature_input)
    # The statistics are learned from train_ds only.
    normalizer = get_normalization_layer(header, train_ds)
    encoded_features.append(normalizer(feature_input))

#### Create, compile, and train the model

In [None]:
# Merge the normalized features into one tensor and put a small
# fully-connected network on top of it.
all_features = tf.keras.layers.concatenate(encoded_features)
x1 = tf.keras.layers.Dense(32, activation="relu")(all_features)
# The sigmoid turns the single output unit into a probability in [0, 1].
output = tf.keras.layers.Dense(1, activation="sigmoid")(x1)
model = tf.keras.Model(all_inputs, output)
model.compile(
    optimizer="adam",
    # The model already emits probabilities (sigmoid above), so the loss must
    # not interpret them as logits; from_logits=True contradicted the output
    # activation.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=["accuracy"],
)

In [None]:
# Print the layer structure and parameter counts of the assembled model.
model.summary()

In [None]:
# Train for 50 epochs with val_ds as validation data; the returned History
# object keeps the per-epoch metrics.
history = model.fit(train_ds, epochs=50, validation_data=val_ds)

Review how training and validation accuracy evolved

In [None]:
# Per-epoch metrics recorded by model.fit.
acc = history.history["accuracy"]
val_acc = history.history["val_accuracy"]
loss = history.history["loss"]
val_loss = history.history["val_loss"]

epochs = range(len(acc))

# Draw all four curves on one explicit Axes. The stray plt.figure() call that
# followed the plot commands only produced an additional empty figure and has
# been removed.
fig, ax = plt.subplots()
ax.plot(epochs, acc, "darkgreen", label="Training accuracy")
ax.plot(epochs, val_acc, "darkblue", label="Validation accuracy")
ax.plot(epochs, loss, "lightgreen", label="Training loss")
ax.plot(epochs, val_loss, "lightblue", label="Validation loss")
# The title names both metric families, since loss curves are shown as well.
ax.set_title("Training and validation accuracy and loss")
ax.set_xlabel("Epoch")
ax.legend(loc=0)

plt.show()

In [None]:
# Evaluate on the held-out test set.
# Note: `loss` is re-bound here from the per-epoch training loss list to the
# scalar test loss.
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

### Evaluate the model (confusion matrix)

In [None]:
# Model outputs for the test set (values of the sigmoid output unit).
y_pred = model.predict(test_ds)

In [None]:
# Round the model outputs to the nearest integer to obtain hard 0/1 class predictions.
predicted_categories = tf.round(y_pred)

In [None]:
# Collect the true labels by iterating the (unshuffled) test dataset batch by
# batch and concatenating them into one tensor.
true_categories = tf.concat([y for x, y in test_ds], axis=0)

In [None]:
# Display the ground-truth labels of the test set.
true_categories

In [None]:
# yields count of true negatives, false positives, false negatives, true positives
# (rows = true classes, columns = predicted classes).
# scikit-learn expects the ground truth first: confusion_matrix(y_true, y_pred).
# Passing the predictions first would transpose the matrix and swap FP with FN.
confusion_matrix(true_categories, predicted_categories)

In [None]:
# check that tp, fp, tn, fn are not confused
# ravel() flattens the 2x2 matrix row by row, which for the (y_true, y_pred)
# argument order gives tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(true_categories, predicted_categories).ravel()
print(tn, fp, fn, tp)

In [None]:
# yields class-specific precision, recall and f1-score
# (class 0 = benign, class 1 = malignant, per the diagnosis mapping above)
print(classification_report(true_categories, predicted_categories))

Try different neural network configurations (e.g. add additional layers, change the number of neurons per layer, or train for more epochs). Do you get better results than with the initial configuration?

Continue once you are satisfied with the results.

### Perform inference

Save model to filesystem and reload it for test purposes.

In [None]:
# Save the trained model under a versioned directory ("<name>/1") and load it
# back to verify that the saved artifact can be restored.
# NOTE(review): an extension-less path presumably relies on the SavedModel
# default of TF 2.x Keras; Keras 3 appears to require a ".keras" or ".h5"
# suffix for model.save — confirm against the TensorFlow version in use.
model.save("breast_cancer_model/1")
reloaded_model = tf.keras.models.load_model("breast_cancer_model/1")

Make a local prediction with the aid of the reloaded model

In [None]:
# Pick a single record (positional row 102) as a one-row DataFrame to use as
# inference example.
sample_row_df = dataframe.iloc[102:103]
sample_row_df

In [None]:
# drop target ('diagnosis') from record
# errors="ignore" keeps the cell safe to re-run: on a second execution the
# column is already gone and a plain drop would raise a KeyError.
sample_row_df = sample_row_df.drop(columns=["diagnosis"], errors="ignore")

In [None]:
# Convert the one-row frame into a plain {column name: value} dict.
sample_row_dict = sample_row_df.to_dict(orient="records")[0]
sample_row_dict

In [None]:
# Wrap every value of the record in a one-element tensor, keyed by column name.
input_dict = {
    name: tf.convert_to_tensor([value]) for name, value in sample_row_dict.items()
}
predictions = reloaded_model.predict(input_dict)
# `predictions[0]` is still an array (one value per output unit). Extract the
# scalar explicitly: formatting a 1-element array with %f relies on implicit
# array-to-scalar conversion, which NumPy has deprecated.
malignant_probability = float(predictions[0][0])
print(
    "The breast cancer described by this row had a %.1f percent probability of being malignant."
    % (100 * malignant_probability)
)

### Deployment

How can you deploy your TensorFlow model?