In [1]:
# Download the raw data file from the UCI repository and save it locally as
# dataset.csv (wget -O overwrites any existing file with that name).
!wget -O dataset.csv https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data

--2021-10-18 17:52:18--  https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19889 (19K) [application/x-httpd-php]
Saving to: ‘dataset.csv’


2021-10-18 17:52:18 (544 KB/s) - ‘dataset.csv’ saved [19889/19889]



## What is Logistic Regression?


Logistic regression is a classification algorithm used when the response variable is categorical. The idea is to model the relationship between the features and the probability of a particular outcome.
For example, if we want to predict whether a student passes or fails an exam given the number of hours spent studying as a feature, the response variable has two possible values: pass and fail.
 


## Why Logistic, not Linear?
With binary classification, let $x$ be some feature and $y$ be the output, which can be either 0 or 1.
The probability that the output is 1 given its input can be represented as:

$$P(y = 1 \mid x)$$

If we predict the probability via linear regression, we can state it as:

$$p(x) = \theta_0 + \theta_1 x$$

where $p(x) = P(y = 1 \mid x)$.
A linear regression model can produce a predicted value anywhere from negative to positive infinity, whereas the probability of an outcome must lie between 0 and 1, i.e. $0 \le p(x) \le 1$.



In [1]:
# Peek at the first three raw lines of the downloaded file
# (the first line is already a data row, i.e. there is no header row).
!head -3 dataset.csv

1000025,5,1,1,1,2,1,3,1,1,2
1002945,5,4,4,5,7,10,3,2,1,2
1015425,3,1,1,1,2,2,3,1,1,2


In [2]:
import pandas as pd
import numpy as np

In [3]:
# The raw file has no header row, so the column names are supplied explicitly.
column_names = [
    "id number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]
df = pd.read_csv('dataset.csv', names=column_names)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,id number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
# Summarise column dtypes and non-null counts; a column reported as `object`
# was not parsed as a numeric type and needs a closer look.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id number                    699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitoses                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [6]:
# Missing values are encoded as the string '?' in the raw file; turn them into
# NaN so pandas/NumPy treat them as missing.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)

In [7]:
# Count the missing (NaN) entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

id number                       0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [8]:
# Feature matrix: every column except "id number" and "Class",
# converted to a float32 NumPy array (missing entries stay NaN).
feature_columns = [
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
]
X = df[feature_columns].values.astype(np.float32)


In [9]:
# Median imputation: replace each NaN with the median of its own column,
# where the median is computed ignoring NaNs.
idx = np.where(np.isnan(X))                # (row indices, column indices) of the NaNs
column_medians = np.nanmedian(X, axis=0)   # one median per feature column
X[idx] = column_medians[idx[1]]            # pick the median matching each NaN's column

In [10]:
# Binary target: 1.0 where Class == 4 and 0.0 otherwise.
# (Presumably 2 = benign and 4 = malignant, per the UCI description - confirm.)
# The labels are always rebuilt from df, so the cell is safe to re-run, and the
# conversion no longer depends on the label of the first row.
y = (df['Class'].values == 4).astype(np.float32)

In [11]:
# Prepend a column of ones so that theta[0] acts as the intercept term.
# NOTE: this cell is not idempotent - each re-run prepends another column.
intercept_column = np.ones((X.shape[0], 1))
X = np.concatenate((intercept_column, X), axis=1)

In [13]:
# m = number of rows (samples) in X, n = number of columns in X
# (the feature columns plus the prepended column of ones).
m,n = X.shape
m,n

(699, 10)

# Initialize Parameters 

In [14]:
# Start from the all-zeros parameter vector: one weight per column of X.
theta = np.zeros(shape=(n,), dtype=np.float64)

In [15]:
# Display the initial parameter vector.
theta

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## The model

In [95]:
def g(z):
    """Sigmoid (logistic) function, 1 / (1 + exp(-z)), evaluated stably.

    Parameters
    ----------
    z : float or np.ndarray
        Linear score(s); any shape.

    Returns
    -------
    float or np.ndarray
        Values in [0, 1] with the same shape as ``z``.

    Notes
    -----
    The direct formula overflows in ``np.exp(-z)`` for large negative ``z``
    and emits a RuntimeWarning. ``log(1 + exp(-z))`` is instead computed with
    ``np.logaddexp(0, -z)``, which never overflows, and then exponentiated.
    """
    return np.exp(-np.logaddexp(0, -z))

In [96]:
def h(X, theta):
    """Hypothesis: apply the sigmoid ``g`` to the linear score ``X @ theta``.

    ``X`` may be the full design matrix (one prediction per row) or a single
    row (one scalar prediction).
    """
    scores = X @ theta
    return g(scores)


In [97]:
# Predictions for every sample under the current theta;
# the shape check confirms there is one prediction per row of X.
preds = h(X, theta)
preds.shape

(699,)

## Cost function & gradients

In [98]:
def J(preds, y):
    """Mean binary cross-entropy (log-loss) cost.

    Parameters
    ----------
    preds : np.ndarray, shape (m,)
        Predicted probabilities of the positive class.
    y : np.ndarray, shape (m,)
        True labels encoded as 0 / 1.

    Returns
    -------
    float
        ``-(1/m) * sum(y*log(preds) + (1-y)*log(1-preds))``, where ``m`` is
        the number of labels passed in.

    Notes
    -----
    The sample count is taken from ``y`` rather than from a notebook-level
    global, so the function works on any subset of the data. Predictions are
    clipped away from exactly 0 and 1 so a saturated sigmoid cannot produce
    ``log(0)`` and turn the cost into nan/inf.
    """
    preds = np.asarray(preds, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    eps = np.finfo(np.float64).eps
    preds = np.clip(preds, eps, 1 - eps)
    log_likelihood = y @ np.log(preds) + (1 - y) @ np.log(1 - preds)
    return -log_likelihood / y.shape[0]

In [99]:
def compute_gradient(theta, X, y):
    """Gradient of the mean cross-entropy cost with respect to ``theta``.

    Parameters
    ----------
    theta : np.ndarray, shape (n,)
        Current parameter vector.
    X : np.ndarray, shape (m, n)
        Design matrix, one row per sample.
    y : np.ndarray, shape (m,)
        True labels encoded as 0 / 1.

    Returns
    -------
    np.ndarray, shape (n,)
        ``X.T @ (h(X, theta) - y) / m``.

    Notes
    -----
    This only computes the gradient; the descent step is taken by the caller.
    The sample count comes from ``X`` itself rather than a notebook-level
    global, so the function is also correct for mini-batches or subsets.
    """
    residuals = h(X, theta) - y
    return X.T @ residuals / X.shape[0]

In [100]:
# Sanity check: evaluate the gradient at the current theta.
compute_gradient(theta,X,y)

array([ 0.0215226 , -0.00184768,  0.00234385, -0.00067933, -0.00029077,
       -0.00193813,  0.00026125, -0.00224882,  0.00043837, -0.00146965])

# Training loop

In [109]:
# Batch gradient descent. `hist` records the loss and training accuracy after every step.
hist = {'loss': [], 'acc': []}
alpha = 0.1  # learning rate

# NOTE: theta is updated in place, so re-running this cell continues training
# from the current parameters instead of restarting from the initial theta.
for i in range(100):
  gradient = compute_gradient(theta, X, y)
  theta -= alpha * gradient

  # loss
  preds = h(X, theta)
  loss = J(preds, y)
  hist['loss'].append(loss)

  # acc: threshold the predictions already computed above at 0.5 and compare
  # with the labels in one vectorised step (no per-sample Python loop).
  acc = float(np.mean((preds > .5) == y))
  hist['acc'].append(acc)

  # print stats
  if i % 10 == 0: print(loss, acc)

0.09950797199481756 0.9656652360515021
0.09936230933977888 0.9656652360515021
0.09921834488428373 0.9656652360515021
0.09907605103664555 0.9656652360515021
0.0989354007868958 0.9656652360515021
0.09879636769169092 0.9656652360515021
0.09865892585968324 0.9656652360515021
0.09852304993734008 0.9656652360515021
0.09838871509519487 0.9656652360515021
0.09825589701451504 0.9656652360515021
