# Exercise 4: Breast cancer classification

## Exercises:
---
### 1. Preprocessing:

    1.1 Drop the "id" column and drop "Unnamed" column
    1.2 Replace the label "diagnosis" with binary labels, where "M"(Malignant)=1, and "B"(=Benign)=0.
    1.3 Split the dataset into a training set(=80% of the data) and a test set(=20% of the data). Shuffle the data.
    1.4 For both datasets, split it into a dataset with only features: "X" and one another containing only the labels: "Y".

### 2. Implementation and training of the models:
    2.1 Implement and train a Logistic Regression model on the training set using SkLearn. Use the default arguments for the model. 
    2.2 Implementand train a Decision Tree model on the training set using SkLearn. Use the default arguments for the model.
    2.3 Evaluate the test accuracy of both models on the test set, which one is better ?
    2.4 Are we overfitting for one of the models ? 

### 3. Playing with different features and Hyperparameters
    3.1 Train both the Logistic Regression AND the Decision Tree model using only the following features:
    'radius_mean','perimeter_mean','area_mean','compactness_mean','concave points_mean'.
    
    3.2 Compare the test accuracy results with the models from 2 

# 0. Imports

In [1]:
import numpy as np 
import pandas as pd 

%matplotlib inline 
import matplotlib.pyplot as plt # side-stepping mpl backend
import matplotlib.gridspec as gridspec # subplots
#import mpld3 as mpl

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

# 1. Load and preprocess the data

In [6]:
df = pd.read_csv("data/breast_cancer.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


### Solutions 1.1-1.3

In [7]:
df.drop('id',axis=1, inplace=True)
df.drop('Unnamed: 32',axis=1, inplace=True)
len(df)

569

In [8]:
df['diagnosis'] = df['diagnosis'].map({'M':1,'B':0})
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [34]:
trainset, testset = train_test_split(df, test_size = 0.2, random_state=2, shuffle=True)

In [35]:
Y_train = trainset['diagnosis']
X_train = trainset.drop('diagnosis',axis=1)

In [36]:
Y_test = testset['diagnosis']
X_test = testset.drop('diagnosis',axis=1)

In [37]:
X_train.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
560,14.05,27.15,91.38,600.4,0.09929,0.1126,0.04462,0.04304,0.1537,0.06171,...,15.3,33.17,100.2,706.7,0.1241,0.2264,0.1326,0.1048,0.225,0.08321
428,11.13,16.62,70.47,381.1,0.08151,0.03834,0.01369,0.0137,0.1511,0.06148,...,11.68,20.29,74.35,421.1,0.103,0.06219,0.0458,0.04044,0.2383,0.07083
198,19.18,22.49,127.5,1148.0,0.08523,0.1428,0.1114,0.06772,0.1767,0.05529,...,23.36,32.06,166.4,1688.0,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221
203,13.81,23.75,91.56,597.8,0.1323,0.1768,0.1558,0.09176,0.2251,0.07421,...,19.2,41.85,128.5,1153.0,0.2226,0.5209,0.4646,0.2013,0.4432,0.1086
41,10.95,21.35,71.9,371.1,0.1227,0.1218,0.1044,0.05669,0.1895,0.0687,...,12.84,35.34,87.22,514.0,0.1909,0.2698,0.4023,0.1424,0.2964,0.09606


### Solutions 2

#### Logistic Regression

In [52]:
lr_model=LogisticRegression(max_iter=1000)
lr_model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=1000)

In [53]:
# Training the model 
train_predictions = lr_model.predict(X_train)
train_accuracy = metrics.accuracy_score(train_predictions,Y_train)

print("Train Accuracy : %s" % "{0:.3%}".format(train_accuracy))

Train Accuracy : 96.044%


In [54]:
lr_model.score(X_train, Y_train)

0.9604395604395605

In [55]:
# Testing the model
test_predictions = lr_model.predict(X_test)
test_accuracy = metrics.accuracy_score(test_predictions,Y_test)
print("Test accuracy : %s" % "{0:.3%}".format(test_accuracy))

Test accuracy : 92.982%


#### $\Rightarrow:$ Not overfitting.

#### Decision Tree model

In [42]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)

DecisionTreeClassifier()

In [43]:
# Training the model 
train_predictions = dt_model.predict(X_train)
train_accuracy = metrics.accuracy_score(train_predictions,Y_train)

print("Accuracy : %s" % "{0:.3%}".format(train_accuracy))

Accuracy : 100.000%


In [44]:
# Testing the model
test_predictions = dt_model.predict(X_test)
test_accuracy = metrics.accuracy_score(test_predictions,Y_test)
print("Test accuracy : %s" % "{0:.3%}".format(test_accuracy))

Test accuracy : 91.228%


#### $\Rightarrow:$ Overfitting slightly.

### Solutions 3

In [56]:
features = ['radius_mean','perimeter_mean','area_mean','compactness_mean','concave points_mean']

#### Logistic Regression 

In [57]:
lr_model=LogisticRegression(max_iter=1000)
lr_model.fit(X_train[features], Y_train)

LogisticRegression(max_iter=1000)

In [58]:
# Training the model 
train_predictions = lr_model.predict(X_train[features])
train_accuracy = metrics.accuracy_score(train_predictions,Y_train)
print("Train Accuracy : %s" % "{0:.3%}".format(train_accuracy))

Train Accuracy : 90.330%


In [59]:
# Testing the model
test_predictions = lr_model.predict(X_test[features])
test_accuracy = metrics.accuracy_score(test_predictions,Y_test)
print("Test accuracy : %s" % "{0:.3%}".format(test_accuracy))

Test accuracy : 87.719%


#### Decision Trees

In [60]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train[features], Y_train)

DecisionTreeClassifier()

In [61]:
# Training the model 
train_predictions = dt_model.predict(X_train[features])
train_accuracy = metrics.accuracy_score(train_predictions,Y_train)
print("Accuracy : %s" % "{0:.3%}".format(train_accuracy))

Accuracy : 100.000%


In [62]:
# Testing the model
test_predictions = dt_model.predict(X_test[features])
test_accuracy = metrics.accuracy_score(test_predictions,Y_test)
print("Test accuracy : %s" % "{0:.3%}".format(test_accuracy))

Test accuracy : 88.596%


### (Optional) 4.: Write own logistic regression class

Write a class that has the following methods:

- fit(X,Y): Should fit the model weights using feature matrix X and label vector y. Implement gradient descent to find a solution for a suitable w-vector.

- predict(X): Should make predictions label predictions for X.

- score(X,Y): Should make label predictions on X, and compute accuracy using the label information Y and the predictions.

- train and evaluate your model on the wine dataset

### Load and preprocess the data

In [63]:
from sklearn.datasets import load_wine
from sklearn.metrics import log_loss 
import itertools

In [64]:
X, Y = load_wine(return_X_y=True)

In [65]:
X = np.hstack([np.ones((X.shape[0], 1)), X])

### Recap: Gradient:
---
$\nabla_{\mathbf w} J(\mathbf w) = \frac{1}{N}\mathbf X^T(\sigma(\mathbf X \mathbf w)-\mathbf y)$

In [66]:
# Implement the sigmoid function - Warning: Might encounter overflow error
def sigmoid(z):
    return (1 / (1 + np.exp(-z)))

In [67]:
from scipy.special import expit

In [71]:
# This is a Binary Logistic Regressor that uses Gradient Descent for learning the best fitting weights
class BinaryLogisticRegressor():
    
    # Should store the positive class and its weights
    def __init__(self, pos_class=None):
        self.w = None
        self.pos_class = pos_class
    
    # Implement GD using The formula above
    def fit(self, X, Y, n_epochs=1000, alpha=0.001):
        self.w = np.random.rand(X.shape[1],1)
        loss = []
        
        for i in range(n_epochs):        
            # Compute the gradient
            delta_J = (1/len(X)) * np.dot(X.T, (expit(np.dot(X,self.w)) - Y[:,np.newaxis]))
            
            # Do the update
            self.w = self.w - alpha * delta_J
            
            # store the lossin loss array (optional)
            loss.append(log_loss(Y, expit(np.dot(X,self.w))))
            
    def predict(self, X):
        # predict
        predictions = expit(np.dot(X,self.w))
        return np.round(predictions)
    
    def score(self, X, Y):
        predictions = self.predict(X)
        accuracy_score = np.sum(predictions == Y[:,np.newaxis]) / X.shape[0]
        
        return accuracy_score

In [72]:
def get_2_class_dataset(X, Y, pos_class_id):
    
        class_1_ids = np.where(Y == pos_class_id)[0]
        class_0_ids = np.where(Y != pos_class_id)[0]
        
        X_1 = X[class_1_ids]
        X_0 = X[class_0_ids]
        
        Y_1 = np.ones(len(Y[class_1_ids])) 
        Y_0 = np.zeros(len(Y[class_0_ids]))
    
        X = np.concatenate([X_1, X_0], axis=0)
        Y = np.concatenate([Y_1, Y_0])
    
        shuffled_indices = np.random.permutation(X.shape[0])
    
        return X[shuffled_indices], Y[shuffled_indices]

In [73]:
class MultiClassLogisticRegression():
    
    def __init__(self):
        self.models = None
        
    def fit(self, X, Y, n_epochs=2):
        
        self.n_epochs = n_epochs
        
        # 1. Get k unique labels
        # 2  Create array to store k model weights 
        # 3. Get unique, class-dependent datasets
        # 4. For each unique class:
            # Create a binary dataset
            # Create binary classifier
            # Train binary classifier
        # 5. Store model weights
        
        self.num_features = X.shape[1]
        self.unique_classes = np.unique(Y)
        
        # Create array to store model weights
        self.models = np.zeros((len(self.unique_classes), self.num_features))
        
        for i, pos_cls in enumerate(self.unique_classes):
            
            # Create unique dataset
            X_tmp, Y_tmp = get_2_class_dataset(X, Y, pos_cls)
            
            # Create binary classifier
            tmp_model = BinaryLogisticRegressor(pos_cls)
            
            # Train binary classifier
            tmp_model.fit(X_tmp, Y_tmp, n_epochs)
            
            # Store model weights
            self.models[i][:,np.newaxis] += tmp_model.w[:]
            
    def predict(self, X):
        # Create empty prediction table:
        
        # First Dimension represents classifiers
        # Second Dimension represents points
        # Third Dimension represents class scores, first entry is prob. score for class 1, second class 0
        predictions = np.zeros((len(self.unique_classes), 2, X.shape[0]))
        
        # Do a prediction for every classifier
        for i, weights in enumerate(self.models):
            
            # Probability score for class 1
            predictions[i][0][:,np.newaxis] += expit(np.dot(X, weights[:,np.newaxis]))
            
            # Probability score for class 2
            predictions[i][1][:,np.newaxis] += 1-expit(np.dot(X, weights[:,np.newaxis]))
            
        # final predictions are is the argmax over classifier probability scores for class 1
        final_predictions = np.argmax(predictions[:,0,:], axis=0)
        
        return final_predictions
            
    def score(self, X, Y):
        
        predictions = self.predict(X)
        
        correctly_predicted = np.sum(predictions == Y) 
        
        return correctly_predicted / X.shape[0]

In [74]:
mclr = MultiClassLogisticRegression()

In [75]:
mclr.fit(X,Y, 3000)

In [76]:
preds = mclr.predict(X)

In [77]:
mclr.score(X,Y)

0.6741573033707865

In [None]:
mclr.models