In [1]:
import pandas as pd
import numpy as np
from hvplot import hvPlot
import panel as pn
import holoviews as hv
import hvplot.pandas
hv.extension('bokeh')

In [2]:
pn.pane.Markdown("# Breast Cancer Detection", width=500).servable()

In [3]:
pn.Row(pn.pane.JPG("image.jpg"),pn.pane.JPG("image3.jpg", width=400)).servable()

In [4]:
pn.pane.Markdown("Breast cancer is the most common cancer in American women, except for skin cancers. Currently, \
the **average risk of a woman in the United States developing breast cancer sometime in her life is about 13%**. \
This means there is a **1 in 8** chance she will develop breast cancer. This also means there is a 7 in 8 chance she \
will never have the disease.",width=600).servable()



# Read in Data

In [5]:
df = pd.read_csv("breast-cancer-wisconsin.txt")

# Data Cleaning

In [6]:
pn.pane.Markdown("# Data Cleaning").servable()

In [7]:
pn.pane.Markdown("### Checking few samples of data").servable()
pn.panel(df.head(),width=1000).servable()

In [8]:
pn.pane.Markdown(f"### Shape of original data is {df.shape}").servable()

In [9]:
# Heading is a constant string (no placeholders), so no f-prefix is needed.
pn.pane.Markdown("### Looks like we have some mixed data types").servable()
# Show the dtype pandas inferred for every column of the raw frame.
pn.panel(df.dtypes).servable()

In [10]:
pn.pane.Markdown("Something odd is happening with our label data. For now, let us note that there is a massive\
 imbalance between our two main classes 4 and 2").servable()

pn.panel(df.Class.value_counts()).servable()

In [11]:
pn.pane.Markdown("### Checking for missing values").servable()

pn.panel(df.isnull().sum()).servable()

In [12]:
# Coerce every column to numeric (unparseable entries become NaN), then keep
# only the rows in which every column is populated. The explicit .copy()
# gives later cells an independent frame to assign into, instead of a slice
# of an intermediate result (which triggers SettingWithCopyWarning).
num_df = df.apply(pd.to_numeric, errors='coerce').dropna().copy()
num_df.shape

(15760, 12)

In [13]:
def value_counts_pane(df, default_col="Clump Thickness"):
    """Build an interactive view of per-column value counts.

    A drop-down lists every column of ``df``; choosing one displays
    ``df[col].value_counts()`` for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can be inspected.
    default_col : str, optional
        Column selected when the view is first shown. It must be one of
        ``df.columns``.

    Returns
    -------
    The object produced by ``pn.interact`` for the nested callback (the
    drop-down together with the rendered value counts).
    """
    col_dropdown = pn.widgets.Select(value=default_col, options=list(df.columns))

    # pn.interact re-invokes the callback whenever the drop-down changes.
    @pn.interact(col=col_dropdown)
    def value_counts(col=default_col):
        return df[col].value_counts()

    return value_counts

pn.pane.Markdown("### Let's first investigate the value counts of all features").servable()
value_counts_pane(df).servable()

In [14]:
# Coerce every column to numeric (unparseable entries become NaN) and drop
# any row that contains a NaN. The explicit .copy() makes num_df an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
num_df = df.apply(pd.to_numeric, errors='coerce').dropna().copy()

In [15]:
pn.pane.Markdown("### After removing the non numeric values as well as all observations that contain nulls").servable()
value_counts_pane(num_df).servable()

In [16]:
pn.pane.Markdown("### Looks like we have some data points which have been entered at 10x scale based on the mode of the most common entrants.").servable()

In [17]:
# Rescale entries recorded at 10x: any value above 10 is divided by 10,
# everything else is left untouched. "Index" and "ID" are identifiers, not
# measurements, so they are excluded.
# The columns are selected via Index.drop because passing the axis to
# DataFrame.drop positionally (`drop([...], 1)`) is rejected by pandas 2.x.
for col in num_df.columns.drop(["Index", "ID"]):
    num_df[col] = num_df[col].where(num_df[col] <= 10, num_df[col] / 10)

In [18]:
pn.pane.Markdown("### After fixing the incorrectly entered data").servable()

value_counts_pane(num_df).servable()

In [19]:
feat_cols = df.columns.drop(["ID","Index","Class"])
label_col = "Class"

In [20]:
pn.pane.Markdown("### Lastly, let's modify out target variable so it is 0 and 1 instead of 4 and 2").servable()


In [21]:
# Recode the label column: class 4 becomes 0 and every other value becomes 1.
# NOTE(review): if 4 denotes the malignant class (as in the UCI Wisconsin
# data - confirm), the "positive" label 1 is the benign class; keep that in
# mind when reading per-class precision/recall further down.
num_df["Class"] = num_df["Class"].apply(lambda x: 0 if x == 4 else 1)

# Display the class balance after recoding.
pn.panel(num_df.Class.value_counts()).servable()

# EDA

In [22]:
pn.pane.Markdown("# EDA").servable()

In [23]:
pn.pane.JPG("image2.jpg",width=500).servable()

In [24]:
pn.panel(num_df.corr().hvplot.heatmap(rot = 18)).servable()

In [25]:
pn.panel(num_df.describe(),width=1200).servable()

In [26]:
def hist_plot_pane(df, default_col="Clump Thickness"):
    """Build an interactive histogram viewer for the columns of ``df``.

    A drop-down lists every column of ``df``; choosing one displays
    ``df[col].hvplot.hist()`` for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can be plotted.
    default_col : str, optional
        Column selected when the view is first shown. It must be one of
        ``df.columns``.

    Returns
    -------
    The object produced by ``pn.interact`` for the nested callback (the
    drop-down together with the rendered histogram).
    """
    col_dropdown = pn.widgets.Select(value=default_col, options=list(df.columns))

    # pn.interact re-invokes the callback whenever the drop-down changes.
    @pn.interact(col=col_dropdown)
    def hist_plot(col=default_col):
        return df[col].hvplot.hist()

    return hist_plot
hist_plot_pane(num_df).servable()

# Splitting and Resampling

In [27]:
# Proper oversampling technique adapted from https://beckernick.github.io/oversampling-modeling/
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
seed = 2010

In [28]:
# Hold out the test set first, before any resampling is applied.
(training_features, test_features,
 training_target, test_target) = train_test_split(
    num_df[feat_cols],
    num_df[label_col],
    test_size=.2,
    random_state=seed,
)

In [29]:
# Carve a validation set out of the training portion. Only x_train / y_train
# are oversampled (with SMOTE) afterwards; the validation and test sets keep
# their original class distribution.
x_train, x_val, y_train, y_val = train_test_split(training_features, training_target,
                                                  test_size = .2,
                                                  random_state=seed)

In [30]:
# Oversample the minority class on the training split only.
# fit_resample is the supported API; fit_sample was a deprecated alias that
# newer imbalanced-learn releases no longer provide.
sm = SMOTE(random_state=seed)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.astype(int))

In [31]:
# New Resampled Value_Counts
np.unique(y_train_res,return_counts=True)

(array([0, 1]), array([9780, 9780]))

In [32]:
from sklearn.metrics import recall_score, classification_report, confusion_matrix, accuracy_score, roc_curve, auc

# results output function
def results(sklearn_model, thresh=None):
    """Summarise a fitted classifier on the train, validation and test sets.

    Reads the notebook-level splits ``x_train_res``/``y_train_res``,
    ``x_val``/``y_val`` and ``test_features``/``test_target``.

    Parameters
    ----------
    sklearn_model : fitted estimator
        Must provide ``predict``; ``predict_proba`` is also required when
        ``thresh`` is given.
    thresh : float, optional
        When ``None`` the labels come from ``predict``. Otherwise a sample is
        labelled 1 when the last ``predict_proba`` column exceeds ``thresh``.

    Returns
    -------
    str
        For each data set, the confusion matrix followed by the
        classification report, concatenated into one text block.
    """
    datasets = [
        ["Train", y_train_res, x_train_res],
        ["Validation", y_val, x_val],
        ["Test", test_target, test_features],
    ]

    res = ""
    for label, y_true, x in datasets:
        # Predict once per data set and reuse the labels for both reports.
        if thresh is None:
            y_pred = sklearn_model.predict(x).astype(int)
        else:
            y_pred = (sklearn_model.predict_proba(x)[:, -1] > thresh).astype(int)
        y_true = y_true.astype(int)

        res += "{} Results, Confusion Matrix:\n\n".format(label) + str(confusion_matrix(y_true, y_pred)) + "\n"
        res += "{} Results, Classification Report:\n\n".format(label) + str(classification_report(y_true, y_pred)) + "\n"
    return res

# Modeling

In [33]:
pn.pane.Markdown("# Modeling").servable()

## Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train_res, y_train_res)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
pn.pane.Markdown("## Results from Logistic Regression:").servable()
pn.panel(results(lr),width=1000).servable()

## Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=seed)
rf.fit(x_train_res, y_train_res)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=2010,
                       verbose=0, warm_start=False)

In [37]:
pn.pane.Markdown("## Results from Random Forest:").servable()
pn.panel(results(rf),width=1000).servable()

In [38]:
# Pair each feature name with its random-forest importance, rounded to four
# decimals (each score is already a scalar, so no averaging is needed).
d_rf = {feat: round(score, 4) for feat, score in zip(feat_cols, rf.feature_importances_)}
# Order the mapping from most to least important.
ft_rf = dict(sorted(d_rf.items(), key=lambda item: item[1], reverse=True))

pn.pane.Markdown("RF Feature Important").servable()
pn.panel(ft_rf).servable()

In [39]:
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier(random_state=seed)
gbm.fit(x_train_res, y_train_res)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=2010, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [40]:
pn.pane.Markdown("## Results from Gradient Boosting:").servable()
pn.panel(results(gbm),width=1000).servable()

In [41]:
# Pair each feature name with its gradient-boosting importance, rounded to
# four decimals (each score is already a scalar, so no averaging is needed).
d = {feat: round(score, 4) for feat, score in zip(feat_cols, gbm.feature_importances_)}
# Order the mapping from most to least important.
ft = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))

pn.pane.Markdown("GBMFeature Important").servable()
pn.panel(ft).servable()

In [None]:
! panel serve --show "breast_cancer_detection.ipynb"

2020-02-14 10:42:58,675 Starting Bokeh server version 1.4.0 (running on Tornado 6.0.3)
2020-02-14 10:42:58,677 User authentication hooks NOT provided (default user enabled)
2020-02-14 10:42:58,678 Bokeh app running at: http://localhost:5006/breast_cancer_detection
2020-02-14 10:42:58,679 Starting Bokeh server with process id: 75295
2020-02-14 10:43:03,638 Error running application handler <bokeh.application.handlers.notebook.NotebookHandler object at 0x7fa0c8d71ad0>: name 'get_ipython' is not defined
File "breast_cancer_detection.ipynb", line 353, in <module>:
"\n", Traceback (most recent call last):
  File "/Users/devsharma/anaconda3/envs/fastai/lib/python3.7/site-packages/bokeh/application/handlers/code_runner.py", line 179, in run
    exec(self._code, module.__dict__)
  File "/Users/devsharma/Dropbox/Projects/breast-cancer-detection/breast_cancer_detection.ipynb", line 353, in <module>
    "\n",
NameError: name 'get_ipython' is not defined
 
2020-02-14 10:43:03,841 WebSocket connect