# Breast Cancer - Wisconsin 

TODO: add banner image from wisconsin kaggle dataset

TODO: add description

In [1]:
import os
import time
import numpy as np
import pandas as pd

# Matplotlib
import matplotlib.pyplot as plt

In [None]:
# Relative path to the dataset CSV file
path = "./datasets/data.csv"
# Load the raw data; later cells derive the target and feature frames from it
original_df = pd.read_csv(path)

# Preview the first rows of the raw frame
original_df.head()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the dataset
original_df.describe()

In [None]:
# Per-column data types and non-null counts of the dataset
original_df.info()

In [None]:
# Count missing values per column (a column of zeros means nothing is missing).
# DataFrame has no `isnan` attribute; `isnull()` is the pandas API for this.
original_df.isnull().sum()

# Data Cleaning

In [None]:
# 0.1. Separate the label column from the feature columns
data_df = original_df.drop('diagnosis', axis=1)
target_df = original_df['diagnosis']

# Report both shapes so the row counts can be compared
print("Shape of target df: ", target_df.shape)
print("Shape of dataset: ", data_df.shape)

In [None]:
# 0.2. Change target to binary representation (benign -> 0, malignant -> 1)
# NOTE(review): the Kaggle Wisconsin CSV is believed to encode diagnosis as
# 'B'/'M' rather than the full words - confirm against data.csv. Both
# spellings are mapped so `astype('int')` does not fail on unmapped strings.
diagnosis_to_int = {'Benign': 0, 'Malignant': 1, 'B': 0, 'M': 1}
target_df = target_df.replace(diagnosis_to_int).astype('int')
target_df.head()

In [None]:
# 0.3. Standardize the feature columns with StandardScaler
from sklearn.preprocessing import StandardScaler

# Remember the column labels: fit_transform returns a bare numpy array
columns = data_df.columns

std_scaler = StandardScaler()
scl_np = std_scaler.fit_transform(data_df.values)  # fit on the numpy view of the frame

# Rebuild the frame with its original column labels and row index so it
# stays aligned with target_df
data_df = pd.DataFrame(scl_np, columns=columns, index=data_df.index)

# Preview the scaled frame (the original called `data.head()`, but no
# variable named `data` exists at this point)
data_df.head()

# Feature Selection and Classification Task

## 1. Feature selection by exploratory data analysis

As far as we remember from wisconsin-visualization, we plan to drop the following features:
* For Feature Selection using Exploratory Analysis from Data Viz:
    - Drop columns: compactness_se, concavity_se, concave points_se
    - Drop columns: texture_se, perimeter_se
    - Drop columns: radius_worst, texture_worst, perimeter_worst
    - Drop columns: compactness_worst, concavity_worst, concave points_worst
    - Drop columns: perimeter_mean and area_mean
    - Drop columns: smoothness_se

In [None]:
# 1.1 Drop any weak or redundant features
# "concave points_worst" is spelled with the plural "points", matching
# "concave points_se" above; the singular form would make drop() raise KeyError.
dropped_feats = ["compactness_se", "concavity_se", "concave points_se",
                 "texture_se", "perimeter_se", "radius_worst", "texture_worst", "perimeter_worst",
                 "compactness_worst", "concavity_worst", "concave points_worst",
                 "perimeter_mean", "area_mean", "smoothness_se"]
print("Number of features dropped: ", len(dropped_feats))

# Drop the listed columns; data_df itself is left untouched
data_cleaned_df = data_df.drop(columns=dropped_feats)

print("Shape of dataset after dropping weak features: ", data_cleaned_df.shape)
data_cleaned_df.head()

In [None]:
# 1.2 Train/test split
from sklearn.model_selection import train_test_split

# Sanity check: target and feature frames should share the same index
print("Index of target df: ", target_df.index)
print("Index of dataset df: ", data_cleaned_df.index)

# Hold out 23% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
        data_cleaned_df, target_df, test_size=0.23, random_state=42)

# Report the shape of each split (the original printed the full frames)
print("Size of trained X-y: ", X_train.shape, y_train.shape)
print("Size of test X-y: ", X_test.shape, y_test.shape)

In [None]:
# 1.3 define get_acc_rocauc function
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

def get_acc_rocauc(X_train_test, y_train_test, max_depth=7, n_trees=100):
    """Fit a random forest and score it on the held-out split.

    Args:
        X_train_test: pair ``(X_train, X_test)`` of feature sets.
        y_train_test: pair ``(y_train, y_test)`` of label sets.
        max_depth: maximum depth passed to the forest.
        n_trees: number of estimators in the forest.

    Returns:
        Tuple ``(accuracy, roc_auc, positive_class_probabilities,
        predicted_labels)``, all computed on the test split.
    """
    train_features, test_features = X_train_test
    train_labels, test_labels = y_train_test

    # Fixed seed so repeated runs build the same forest
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(train_features, train_labels)

    # Column 1 of predict_proba is used as the positive-class score for ROC AUC
    positive_proba = forest.predict_proba(test_features)[:, 1]
    predicted = forest.predict(test_features)

    accuracy = accuracy_score(test_labels, predicted)
    roc_auc = roc_auc_score(test_labels, positive_proba)
    return accuracy, roc_auc, positive_proba, predicted

# 1.4 define roc curve plotting function
def plot_roc_curve(pred_proba, y_test):
    """Plot the ROC curve of the given scores against the true labels.

    Args:
        pred_proba: scores passed to ``roc_curve`` as the predicted values.
        y_test: true labels passed to ``roc_curve``.
    """
    # Compute the curve points and the area under them
    fpr, tpr, _thresholds = roc_curve(y_test, pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=1, label="ROC Curve Area=%.2f" % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='Navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title("Receiver Operating Characteristic")
    # Without a legend the curve's label (which carries the AUC) is never drawn
    plt.legend(loc="lower right")
    plt.show()

# 1.5 define confusion matrix calculation function
def get_plt_conf_mat(acc_score, y_test):
    """Draw the confusion matrix of predictions against true labels as a heatmap.

    Args:
        acc_score: passed to ``confusion_matrix`` as the predicted labels
            (despite the name, this is not a scalar accuracy score).
        y_test: true labels.
    """
    # NOTE(review): `sns` (seaborn) is not imported in the visible part of this
    # notebook - confirm that `import seaborn as sns` exists somewhere.
    conf_mat = confusion_matrix(y_test, acc_score)  # was misspelled `confussion_matrix`
    plt.figure(figsize=(4, 4))
    # fmt="d" renders the cell counts as integers
    sns.heatmap(conf_mat, annot=True, fmt="d")

## 2. Univariate Feature Selection using chi-square

In [None]:
# Preview the scaled feature frame (no variable named `data` is defined; the frame is `data_df`)
data_df.head()

In [None]:
# 2.1 split training and test data
from sklearn.model_selection import train_test_split


In [None]:

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Candidate feature counts; presumably the k values tried with
# select_K_features below - usage is not visible in this part of the notebook
k_list = [5, 10, 14, 15, 16]

# 2.2 define feature selecting function for any given k
def select_K_features(X_train, y_train, k):
    feature_selector = SelectKBest(chi2, k=k).fit(X_train, y_train)
    feats_imp = feature_selector.scores_
    feats_argsort = np.argsort(feats_imp)[::-1] #from higher importance to lower
    