In [1]:
import tensorflow as tf

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

import seaborn as sns
import matplotlib.pyplot as plt

## Prepare and Preprocess Data

In [2]:
df = pd.read_csv('data/data.csv').dropna(axis=1)
df_orig = df.copy()
feats = df.columns.difference(['id', 'diagnosis'])
df.loc[:, feats] = ColumnTransformer([('mmx', MinMaxScaler(), list(feats))]).fit_transform(df)

In [3]:
train_df, val_df = train_test_split(df, stratify=df.diagnosis, test_size=0.2, random_state=42)
train_df_orig, val_df_orig = train_test_split(df_orig, stratify=df.diagnosis, test_size=0.2, random_state=42)
train_X, val_X = train_df[feats].values, val_df[feats].values

## Autoencoder 

### Build

In [4]:
class Autoencoder(tf.keras.Model):
    def __init__(self, latent_dim, inp_shp):
        super(Autoencoder, self).__init__()
        self.latent_dim = latent_dim
        self.inp_shp = inp_shp
        self.encoder = tf.keras.Sequential([
                                            tf.keras.layers.Dense(20, activation='relu'),
                                            tf.keras.layers.Dense(self.latent_dim, activation='relu')
                                           ])
        self.decoder = tf.keras.Sequential([
                                            tf.keras.layers.Dense(20, activation='relu'),
                                            tf.keras.layers.Dense(self.inp_shp, activation='sigmoid')
                                           ])
    
    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [5]:
autoencoder = Autoencoder(15, len(feats))
autoencoder.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())

In [6]:
early_stop = tf.keras.callbacks.EarlyStopping(patience=10)

### Train

In [7]:
autoencoder.fit(train_X, train_X, epochs=100, shuffle=True, 
                validation_data=(val_X, val_X), callbacks=[early_stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f2d303aa490>

In [8]:
val_df.shape

(114, 32)

## Visualize (Latent Dimension = 2)

In [9]:
# reduced_train = pd.DataFrame(autoencoder.encoder(train_df[feats].values).numpy(), columns=['x1','x2'])
# reduced_val =  pd.DataFrame(autoencoder.encoder(val_df[feats].values).numpy(),  columns=['x1','x2'])

In [10]:
# reduced_train['target'] = train_df.diagnosis
# reduced_val['target'] = val_df.diagnosis

In [11]:
# fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(10,20))
# sns.scatterplot(x='x1', y='x2', hue='target', data=reduced_train, ax=ax1)
# sns.scatterplot(x='x1', y='x2', hue='target', data=reduced_val, ax=ax2)

## Binary Classification Modeling

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, classification_report

In [13]:
lr1 = LogisticRegression(max_iter=5000).fit(train_df_orig[feats], train_df_orig.diagnosis)

In [14]:
lr2 = LogisticRegression(max_iter=5000).fit(autoencoder(train_df[feats].values).numpy(), train_df.diagnosis)

In [15]:
pred1 = lr1.predict_proba(val_df_orig[feats])[:,1]
pred2 = lr2.predict_proba(autoencoder(val_df[feats].values).numpy())[:,1]

In [16]:
y_true = val_df.diagnosis.map({'B':0, 'M':1}).values

### Plain Data Results

In [17]:
roc_auc_score(y_true, pred1), f1_score(y_true, 1*(pred1>0.5))

(0.9923941798941799, 0.9113924050632912)

In [18]:
print(classification_report(y_true, 1*(pred1>0.5)))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95        72
           1       0.97      0.86      0.91        42

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



### Autoencoder "Denoised" Results

In [19]:
roc_auc_score(y_true, pred2), f1_score(y_true, 1*(pred2>0.5))

(0.9976851851851852, 0.951219512195122)

In [20]:
print(classification_report(y_true, 1*(pred2>0.5)))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97        72
           1       0.97      0.93      0.95        42

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

