# *Data Source*

https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/

https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29

# Github

https://github.com/christinewinter/intro_ml_bc

# What is the goal?

Predict whether a breast tumor is malignant or benign.

## Import modules 

In [None]:
# Silencing warnings from scikit-learn
import warnings
warnings.filterwarnings("ignore")

# Importing libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import model_selection, linear_model, metrics
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import seaborn as sns
import pydotplus

# Import breast cancer data

In [None]:
# Load the breast cancer table; the first CSV column becomes the row index.
data = pd.read_csv(filepath_or_buffer="../data/data.csv", index_col=0)

# Understand the data

### Data definition 
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/


In [None]:
# Preview the first ten rows of the raw table.
data.head(n=10)

### Column data types and basic information about the occurrence of (non-null) data 

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

## Questions we need to answer: 

What do these values mean? 

Which columns stick out? 

Which columns could be interesting for our model?


In [None]:
"""
 Count of malignant (= M) and benign (= B) data points
"""

# Bar chart with the number of rows per diagnosis class.
# The column is named by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x="diagnosis", data=data, label="Count")

In [None]:
""" 
 Create violin plot for first ten features
"""

# Keep the labels aside and z-score every remaining column so that all
# violins share a comparable scale.
data_dia = data.diagnosis
data_f = data.drop(columns='diagnosis')
data_n_2 = (data_f - data_f.mean()) / data_f.std()

# Long format: one row per (diagnosis, feature, value) for columns 0-9.
data_pp = pd.concat([data_dia, data_n_2.iloc[:, 0:10]], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(20, 20))
sns.violinplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data_pp,
    split=True,
    inner="quart",
)
plt.xticks(rotation=90)


In [None]:
# Candidate features: add every other feature whose median differs between
# malignant and benign tumors.
feature_list = ["radius_mean"]

In [None]:
""" 
 Create violin plot for features 10 - 20 
"""

# Long format: one row per (diagnosis, feature, value) for columns 10-19
# of the standardized feature table.
data_pp = pd.concat([data_dia, data_n_2.iloc[:, 10:20]], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(20, 20))
sns.violinplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data_pp,
    split=True,
    inner="quart",
)
plt.xticks(rotation=90)

In [None]:
# Add the features from this plot whose median differs between malignant and benign.
feature_list.append('radius_se')

In [None]:
""" 
 Create violin plot for features 20 - 30
"""


# Long format: one row per (diagnosis, feature, value) for columns 20-29
# of the standardized feature table.
data_pp = pd.concat([data_dia, data_n_2.iloc[:, 20:30]], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(20, 20))
sns.violinplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data_pp,
    split=True,
    inner="quart",
)
plt.xticks(rotation=90)

In [None]:
# Add the features from this plot whose median differs between malignant and benign.
feature_list.append('radius_worst')

## Checking correlation

Variables that contain the same information (i.e. are highly correlated) should be removed from the dataset so that only one of them is kept. 
Fewer features reduce computation time and can help to build more robust models.  

In [None]:
# Pairwise correlation between the selected features.
corr = data[feature_list].corr()

plt.figure(figsize=(20, 20))

# Annotated heatmap: every cell shows the coefficient with two decimals.
sns.heatmap(
    corr,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    square=True,
    cbar=True,
)


## Questions we need to answer: 

Which variables are correlated? 

Which ones should we keep and which ones should we drop? 


In [None]:
# Names of correlated features to discard; fill in more here, e.g. 'area_worst'.
variables_to_drop = ["radius_worst"]

# Feature selection

To train the model we need to remove our target variable 'diagnosis' from the training data set x.
Therefore we drop it, together with the empty column 'Unnamed: 32'. 

In [None]:
# Final feature selection used for training; nothing further is dropped.
feature_list = [
    'texture_mean',
    'perimeter_mean',
    'smoothness_mean',
    'compactness_mean',
    'symmetry_mean',
]
variables_to_drop = list()

## Separate target from features 

In [None]:
# Encode the diagnosis labels as integers (B -> 0, M -> 1) so that sklearn can process them
y = data["diagnosis"].map({'B': 0, 'M': 1})

# Keep only the selected feature columns, minus the ones marked for removal
x = data[feature_list].drop(columns=variables_to_drop)

In [None]:
# Preview the first five rows of the feature table.
x.head(n=5)

In [None]:
# Preview the first five encoded labels.
y.head(n=5)

# Train the model

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Number of non-missing values per feature in the training split.
X_train.count(axis=0)

In [None]:
# Number of non-missing values per feature in the test split.
X_test.count(axis=0)

In [None]:
# Decision tree limited to eight leaves, see https://scikit-learn.org/stable/modules/tree.html
# The random state is pinned so repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=0, max_leaf_nodes=8)

In [None]:
# Train the tree on the training split (labels are handed over as a plain list)
model.fit(X=X_train, y=y_train.tolist())

### Visualize the tree

In [None]:
from IPython.display import Image

# Export the fitted tree to a DOT source code string (out_file=None returns
# the string instead of writing a file).
dot_data = export_graphviz(
    model,
    out_file=None,
    # A plain list rather than a pandas Index: some scikit-learn releases
    # validate this parameter strictly, and it matches the dtreeviz call.
    feature_names=list(X_train.columns),
    class_names=['B', 'M'],  # 'B' mapped to 0, 'M' mapped to 1
    filled=True,
    rounded=True,
    proportion=True,  # show 'samples' and 'value' as percentages / proportions
    special_characters=True,
    label='all',
)

# Render the DOT source as a PNG and display it inline.
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
pydot_graph.set_size('"10,10!"')

Image(pydot_graph.create_png())

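gini: Gini impurity of the node, the quality measure used to choose the split (0 = all samples belong to one class)

samples: percentage of data points that fall in this node

value: proportion of the node's samples that belong to each class (B, M)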


In [None]:
# Import only the name that is used; a star import would silently add (and
# possibly overwrite) many names in the notebook namespace.
from dtreeviz.trees import dtreeviz

# Alternative rendering of the same fitted tree, drawn from the training data.
viz = dtreeviz(model,
               X_train,
               y_train,
               target_name='diagnosis',
               feature_names=X_train.columns.to_list(),
               class_names=['B', 'M'])  # 'B' mapped to 0, 'M' mapped to 1

viz.view()

# Validation

In [None]:
# Let the fitted tree classify the held-out test rows
y_pred = model.predict(X=X_test)

In [None]:
# Share of test rows that were classified correctly
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
metrics.accuracy_score(y_true=y_test, y_pred=y_pred.astype(int))

In [None]:
# Accuracy on the training split itself, for comparison with the test accuracy
# (a much higher training score would hint at overfitting).
y_pred_train = model.predict(X_train)
# Arguments follow the documented (y_true, y_pred) order, as in the test-set evaluation.
metrics.accuracy_score(y_train, y_pred_train)

In [None]:
# 10-fold cross validation on the training split
#    * partition the data into 10 folds
#    * train on 9 of the folds
#    * score the model on the fold that was left out
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)

# Summarise the ten fold scores by their mean and spread
print(
    "mean: {:.3f} (std: {:.3f})".format(scores.mean(), scores.std()),
    end="\n\n",
)

That means that the average accuracy (percentage of correct classifications using the trained model) is 88.9%.

# THAT'S IT!! 

Questions, comments, ideas, feedback ? 

# What else can we do? 

## Test different model parameters 

There are many parameters to set for Decision Tree classifiers. Can you find a better model by changing/tuning these?

https://scikit-learn.org/stable/modules/tree.html


In [None]:
# Starting point for parameter tuning: every value below is spelled out so it
# can be changed in place. `presort` and `min_impurity_split` are not listed
# because they were removed from scikit-learn and passing them raises a
# TypeError on current releases.
better_model = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=0,  # fixed for reproducibility
    max_leaf_nodes=8,
    min_impurity_decrease=0.0,
    class_weight=None,
)

## Testing other models from scikit learn 
https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

## Hyper parameter tuning

http://chrisstrelioff.ws/sandbox/2015/06/25/decision_trees_in_python_again_cross_validation.html