# Breast Cancer Recurrence 

### Contributors: Hyeeun Hughes, Arnold Schultz, Mauvonte Roberts, Ryan Grimsley

* Number of instances: 286

* Number of attributes: 9 predictive attributes plus the class attribute (10 columns in total)

* Attribute information:
   
  1. Class: no-recurrence-events, recurrence-events
        
  2. age: 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99.
    
  3. menopause: lt40, ge40, premeno.
    
  4. tumor-size: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-49, 50-54, 55-59.
    
  5. inv-nodes: 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26, 27-29, 30-32, 33-35, 36-39.
    
  6. node-caps: yes, no.
    
  7. deg-malig: 1, 2, 3.
    
  8. breast: left, right.
    
  9. breast-quad: left-up, left-low, right-up, right-low, central.
     
  10. irradiat: yes, no.


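* Missing attribute values: a few, encoded as "?" in the raw file (in node-caps and breast-quad), so they are not reported as nulls by pandas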

## Preprocessing

In [None]:
# Import our dependencies
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the Index file from the Resources folder and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../Resources/Index")
df.head(n=5)

In [None]:
# Import and read breast-cancer.data.
# The raw file carries no header row, so the column names are supplied here.
# Without `header=None` pandas promotes the first patient record to column
# labels and the dataset silently shrinks by one instance.
df = pd.read_csv(
    "../Resources/breast-cancer.data",
    header=None,
    names=[
        "recurrence",
        "age",
        "menopause",
        "tumor_size",
        "inv-nodes",
        "node-caps",
        "deg-malig",
        "breast",
        "breast-quad",
        "irradiat",
    ],
)
df.head(30)

In [None]:
# Inspect the column labels currently attached to the dataframe.
df.columns

In [None]:
# Re-label the columns with descriptive names, modifying df in place.
# Keys that are not present among the current labels are ignored by rename.
df.rename(
    {
        'no-recurrence-events': 'recurrence',
        '30-39': 'age',
        'premeno': 'menopause',
        '30-34': 'tumor_size',
        '0-2': 'inv-nodes',
        'no': 'node-caps',
        '3': 'deg-malig',
        'left': 'breast',
        'left_low': 'breast-quad',
        'no.1': 'irradiat',
    },
    axis="columns",
    inplace=True,
)
df.head()

In [None]:
# Tally how many rows fall into each 'recurrence' class.
df['recurrence'].value_counts()

In [None]:
# Tally how many rows fall into each 'age' bracket.
df['age'].value_counts()

In [None]:
# Tally how many rows fall into each 'menopause' category.
df['menopause'].value_counts()

In [None]:
# Tally how many rows fall into each 'tumor_size' bin.
df['tumor_size'].value_counts()

In [None]:
# Tally how many rows fall into each 'inv-nodes' bin.
df['inv-nodes'].value_counts()

In [None]:
# Tally how many rows carry each 'node-caps' value.
df['node-caps'].value_counts()

In [None]:
# Tally how many rows carry each 'deg-malig' grade.
df['deg-malig'].value_counts()

In [None]:
# Tally how many rows carry each 'breast' value.
df['breast'].value_counts()

In [None]:
# Tally how many rows fall into each 'breast-quad' category.
df['breast-quad'].value_counts()

In [None]:
# Tally how many rows carry each 'irradiat' value.
df['irradiat'].value_counts()

In [None]:
# Optionally drop the 'menopause' column if it proves non-beneficial (currently disabled).
# df = df.drop(['menopause'], axis = 1)
# df.head(30)

In [None]:
# Report the number of nulls per column, computed in a single pass over df
for column, null_count in df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

In [None]:
# Count the rows that exactly repeat an earlier row
print("Duplicate entries:", df.duplicated().sum())

In [None]:
# Determine the number of unique values in each column,
# i.e. how many distinct categories each attribute takes in this data.
df.nunique()

In [None]:
# Show the dataframe dimensions as (rows, columns).
df.shape

In [None]:
# Summary statistics; with default arguments on a mixed dataframe, describe()
# covers only the numeric columns.
df.describe()

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Look at 'tumor_size' value counts for binning:
# store the per-bin row counts in `val_counts` and display them.
val_counts = df['tumor_size'].value_counts()
val_counts

In [None]:
# Transform breast: encode the side of the affected breast as a binary flag.
def breast_to_replace(breast):
    """Return 1 for the left breast and 0 for any other label.

    Values that are already 0 or 1 are returned unchanged, so running the
    encoding step a second time does not collapse every row to 0.
    """
    if breast in (0, 1):
        # Already encoded by an earlier run; keep it as-is.
        return breast
    return 1 if breast == "left" else 0
    
# Run the encoder over every row of 'breast' and preview the result
df["breast"] = df["breast"].map(breast_to_replace)
df.head(20)

In [None]:
# Collect the 'breast' values that occur only once so they can be binned together.
# The counts are taken from the 'breast' column itself, and the list has its own
# name so the `breast_to_replace` encoder function is not overwritten.
breast_counts = df['breast'].value_counts()
rare_breast_values = list(breast_counts[breast_counts == 1].index)

# Replace in dataframe
for rare_value in rare_breast_values:
    df['breast'] = df['breast'].replace(rare_value, "Other")

In [None]:
# Check to make sure binning was successful:
# list the values 'breast' now holds and how often each occurs.
df["breast"].value_counts()

In [None]:
# Look at age value counts for binning:
# store the per-bracket row counts in `age_value_counts` and display them.
age_value_counts = df['age'].value_counts()
age_value_counts 

In [None]:
# Age brackets whose row count is below 60.
age_value_counts[age_value_counts  < 60]

In [None]:
# Age brackets whose row count is above 29.
age_value_counts[age_value_counts  > 29]

In [None]:
# Age brackets with fewer than 60 rows are merged into a single "Other" bucket
age_to_replace = age_value_counts[age_value_counts < 60].index.tolist()

# Relabel every matching row with one masked assignment
df.loc[df['age'].isin(age_to_replace), 'age'] = "Other"

In [None]:
# Check to make sure binning was successful:
# list the values 'age' now holds and how often each occurs.
df['age'].value_counts()

In [None]:
# One-hot encode the categorical columns of df with `pd.get_dummies`
dummies_df = pd.get_dummies(data=df)
dummies_df.head(n=30)

In [None]:
# Split our preprocessed data into our features and target arrays.
# The target is the recurrence class this notebook sets out to predict, not
# the side of the breast. After one-hot encoding the class is spread over the
# "recurrence_*" columns, so all of them are removed from the features to keep
# the label out of the model inputs.
recurrence_columns = [
    name for name in dummies_df.columns if name.startswith("recurrence_")
]
# Cast explicitly: get_dummies may emit boolean columns, and the network
# expects numeric arrays.
X = dummies_df.drop(columns=recurrence_columns).values.astype("float32")
y = dummies_df["recurrence_recurrence-events"].values.astype("int32")

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Standardize the features: fit the scaler on the training split only,
# then apply that same fitted transformation to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [None]:
# Network dimensions: input width comes from the training matrix,
# followed by two hidden layers of 8 and 5 nodes.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

# Assemble the deep neural net in one go: two ReLU hidden layers feeding a
# single sigmoid output unit.
nn = tf.keras.models.Sequential([
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="relu",
    ),
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Print the layer-by-layer structure of the model
nn.summary()

In [None]:
# Compile the model with binary cross-entropy loss, the Adam optimizer,
# and accuracy as the tracked metric.
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model on the scaled training features for 100 epochs;
# the object returned by fit() is kept in `fit_model`.
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

In [None]:
# Evaluate the model using the test data:
# evaluate() returns the loss followed by the compiled metric (accuracy).
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")