In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load and process dataset
load breast_cancer.csv, drop columns "id" and "Unnamed: 32", investigate the dataset, and divide into train and test with 80/20 ratio, map values of "diagnosis" from ("B","M") to (0,1)

In [49]:
df = pd.read_csv('breast_cancer.csv')
df.drop(['id','Unnamed: 32'], axis=1, inplace=True)
df['diagnosis'].replace({'B':0, 'M':1}, inplace=True)
df['diagnosis'].unique()

array([1, 0], dtype=int64)

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[df.columns.difference(['diagnosis'])], df['diagnosis'], test_size=0.33, random_state=42)

In [51]:
y_train.head()

172    1
407    0
56     1
497    0
301    0
Name: diagnosis, dtype: int64

## Implementing KMeans

In [52]:
class KMeans(object):
    def __init__(self, K, metric='L2', max_iter=200, eps=1e-4):
        self.K=K
        self.max_iter=max_iter
        self.eps=eps
        self.cluster_centers=np.array([])
        """
        TODO:
        if metric is 'L2' let self.dist be a function that computes euclidian distance between x and y vectors,
        if metric is 'L1' let self.dist be a function that computes manhattan distance between x and y vectors,
        otherwise raise not implemented error
        """
        
        
    def distortion(self, X):
        """
        param X: numpy array of shape (M,N)
        return: distortion value of the dataset
        """
        #TODO calculate distortion measure using X and self.cluster_centers
        raise NotImplementedError
        
    def init_centroids(self, X, centers_init):
        """
        :param X: numpy array of shape (M,N)
        :param centers_init: numpy array of shape (K,N)
        """
        """TODO: 
        If centers_init is 'random' initialize self.cluster_centers with random K items from X,
        if it is 'kmeans++' initialize centroids according to the algorithm in 
        http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf page 3,
        otherwise raise not implemented error .
        """
        if centers_init == 'random':
            pass
        
    def fit(self, X, centers_init='random'):
        """
        :param X: numpy array of shape (M,N)
        :param centers_init: numpy array of shape (K,N)
        """
        """TODO: 
        1. Initialize cluster centers using self.init_centroids method
        2. Implement KMeans algorithm and  terminate it when either self.max_iter iterations are performed,
        or the biggest change in cluster centers is smaller than self.eps
        
        The final cluster centers should be saved in self.cluster_centers
        """
        raise NotImplementedError
    
    def predict(self, X):
        """
        :param X: numpy array of shape (M,N)
        :return: numpy array of shape (M,)
        """
        """TODO:
        using  self.cluster_centers predict to which cluster each datapoint of X belongs, values in returned array
        are integers(id of the cluster). 
        """
        raise NotImplementedError
    
    

In [53]:
kmean = KMeans(K=3)

## Cluster the dataset with kmeans, model and predict malignancy of tumors in the test set entries
## 1. Perform clustering using the following hyperparameter pairs
1. metric='L1', center_init='random'
2. metric='L1', center_init='kmeans++'
3. metric='L2', center_init='random'
4. metric='L2', center_init='kmeans++'

## 2. Predict malignancy of tumors in the test set entries using all 4 models trained above, compare their performances.


## Fit your implementation of Logistic Regression on the dataset, predict on test set and compare the results with kmeans approach

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
yhat = LR.predict(X_test)
yhat

array([0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0], dtype=int64)

In [55]:
from sklearn.metrics import jaccard_similarity_score
jaccard_similarity_score(y_test, yhat)

0.9680851063829787

In [56]:
cnf_matrix = confusion_matrix(y_test, yhat, labels=[1,0])
cnf_matrix

array([[ 62,   5],
       [  1, 120]], dtype=int64)

## Analyze the coefficients of fitted logistic regression model, drop 2 most unimportant features and train again Logistic regression and Kmeans with best metric, center_init hyperparameters, evaluate and compare results

In [57]:
df.corr()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,1.0,0.730029,0.415185,0.742636,0.708984,0.35856,0.596534,0.69636,0.776614,0.330499,...,0.776454,0.456903,0.782914,0.733825,0.421465,0.590998,0.65961,0.793566,0.416294,0.323872
radius_mean,0.730029,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
texture_mean,0.415185,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
perimeter_mean,0.742636,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
area_mean,0.708984,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
smoothness_mean,0.35856,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
compactness_mean,0.596534,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
concavity_mean,0.69636,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
concave points_mean,0.776614,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
symmetry_mean,0.330499,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413


In [66]:
df1 = df.drop(['symmetry_se','texture_se'], axis=1, inplace=False)
#Spliting data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df1[df1.columns.difference(['diagnosis'])], df1['diagnosis'], test_size=0.33, random_state=42)

#Logistic regr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
yhat = LR.predict(X_test)

from sklearn.metrics import jaccard_similarity_score
print('Jaccard similarity score:' , jaccard_similarity_score(y_test, yhat))

cnf_matrix = confusion_matrix(y_test, yhat, labels=[1,0])
print('Cnf matrix:' , cnf_matrix)


Jaccard similarity score: 0.9680851063829787
Cnf matrix: [[ 62   5]
 [  1 120]]


## Analyze the coefficients of fitted initial logistic regression model(using all features), select two most important features and train again Logistic regression and Kmeans with best metric, center_init hyperparameters, evaluate and compare results, make the following plot using the test set:

datapoints with cluster centers and decision boundary, color the datapoints according to Kmeans predictions
color the datapoints on which predictions of logistic regression and Kmeans disagree with separate color


## Compare performance of best Kmeans model with the performance of Kmeans in sklearn library, using the same hyperparameters.

In [94]:
from sklearn.cluster import KMeans
kmeans = KMeans(2, init='k-means++',max_iter=500).fit(X_train)
yhat = kmeans.predict(X_test)
#Cnf
cnf_matrix = confusion_matrix(y_test, yhat, labels=[1,0])
print('Cnf_matrix:\n', cnf_matrix)

from sklearn.metrics import jaccard_similarity_score
print('Jacard:', jaccard_similarity_score(y_test, yhat))


Cnf_matrix:
 [[ 42  25]
 [  0 121]]
Jacard: 0.8670212765957447
