In [1]:
#Numpy is used for handling the multi-dimensional array operation
import numpy as np
#Pandas is used for reading the data from csv
import pandas as pd
#For finding the p-value
import statsmodels.api as sm
#sklearn preprocessing is used for normalization
from sklearn.preprocessing import MinMaxScaler
#sklearn model selection is used for splitting dataset into train and test sets
from sklearn.model_selection import train_test_split as tts
#sklearn matrics used for calcualting the accuracy score, recall score and precision score
from sklearn.metrics import accuracy_score, recall_score, precision_score
#Sequence of shuffled copies of the collections will be import through sklearn
from sklearn.utils import shuffle

In [2]:
#FEATURE SELECTION

In [3]:
#Remove the highly correlated features
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped

In [4]:
#Remove the less significant features using the pvalues and elimination
def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped

In [5]:
#MODEL TRAINING

In [6]:
#Cost function
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    # equivalent to max(0, distance)
    distances[distances < 0] = 0  
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [7]:
#The Gradient of the Cost function
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        # gives multidimensional array
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

     # average
    dw = dw/len(Y_batch) 
    return dw

In [8]:
#Train model Using SGD
def sgd(features, outputs):
    #Running the loop 5000 times
    max_epochs = 5000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)

        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights



In [9]:
#Initilalize the algorithm

In [10]:
def init():
    # read data in pandas (pd) data frame
    #Import the Bread Cancer dataset
    data = pd.read_csv('data.csv')
    
    # unnecessary first column (Sample code numbe) will be removed
    data.drop(data.columns[[0]], axis=1, inplace=True)
  
    
    # convert the 4 and 2 class label to 1 and -1 respectively
    diag_map = {4: 1.0, 2: -1.0}
    data['Class'] = data['Class'].map(diag_map)

    # put features & output lables into different data frames
    Y = data.loc[:, 'Class']
    X = data.iloc[:, 0:9]

    # filter features
    remove_correlated_features(X)
    remove_less_significant_features(X, Y)

    # normalize data for better convergence and to prevent overflow
    #As many features are in different range of valaues, we are trying to get them into intervals like [-1,1] or[0,1].
    #This will be help for the speed of the learning of faster convergence in gradient descent
    X_normalized = MinMaxScaler().fit_transform(X.values)
    X = pd.DataFrame(X_normalized)

    # insert 1 in every row for intercept b as in the cost function the intercept will be missed
    X.insert(loc=len(X.columns), column='intercept', value=1)

    # split data into train and test set
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

    # train the model
    W = sgd(X_train.to_numpy(), y_train.to_numpy())
    #trained Finished
    print("training finished.")
    print("weights are: {}".format(W))

    # testing the model
    y_train_predicted = np.array([])
    for i in range(X_train.shape[0]):
        yp = np.sign(np.dot(X_train.to_numpy()[i], W))
        y_train_predicted = np.append(y_train_predicted, yp)

    y_test_predicted = np.array([])
    for i in range(X_test.shape[0]):
        yp = np.sign(np.dot(X_test.to_numpy()[i], W))
        y_test_predicted = np.append(y_test_predicted, yp)

    #Print the accuracy, recall and precision values of the dataset
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))


In [11]:
regularization_strength = 10000
learning_rate = 0.000001
init()


Epoch is: 1 and Cost is: 1503.4077000758928
Epoch is: 2 and Cost is: 1084.2156159874844
Epoch is: 4 and Cost is: 940.1947032477274
Epoch is: 8 and Cost is: 862.4734953439086
Epoch is: 16 and Cost is: 821.5589083708489
Epoch is: 32 and Cost is: 793.1776276129445
Epoch is: 64 and Cost is: 769.6905273160899
Epoch is: 128 and Cost is: 746.2380441486853
Epoch is: 256 and Cost is: 744.3168644026167
training finished.
weights are: [ 2.34499817  1.18195134  0.86477801  2.18043713  1.70429836  0.39438998
  1.67916619 -3.18116038]
accuracy on test dataset: 0.9642857142857143
recall on test dataset: 0.9111111111111111
precision on test dataset: 0.9111111111111111
