## Lab11 Logistic :Regression and k-Nearest Neighbors (kNN)


In [44]:
import pandas as pd
import numpy as np

In [45]:
def normalize_features(X):
    sum = np.sum(X,axis=0)
    sum = sum / X.shape[0]
    X = X - sum
    standard_deviation = np.sqrt(np.sum(np.square(X),axis=0)/X.shape[0])
    return X / standard_deviation


### Loading Dataset
Dataset is loaded and then equal percentage(80%) of both malignant and benign patients are added to the training set and the rest(20%) to the test set

In [46]:
data = pd.read_csv("wbdc.csv")
def replM(df): return df != 'M'
def replB(df): return df != 'B'

# Replace M and B with 1 and 0
data = data.where(replM,1)
data = data.where(replB,0)

gp = data.groupby(['diagnosis'])
data_malignant = gp.get_group(1)
data_benign = gp.get_group(0)

X = data_malignant.iloc[:,2:].to_numpy(np.float64)
Y = data_malignant.iloc[:,1].to_numpy(np.float64)

X_train,X_test = X[0:(X.shape[0]*8)//10],X[(X.shape[0]*8)//10:]
Y_train,Y_test = Y[0:(X.shape[0]*8)//10],Y[(X.shape[0]*8)//10:]

X = data_benign.iloc[:,2:].to_numpy(np.float64)
Y = data_benign.iloc[:,1].to_numpy(np.float64)

X_train = np.concatenate((X_train,X[0:(X.shape[0]*8)//10]),axis = 0)
X_test = np.concatenate((X_test,X[(X.shape[0]*8)//10:]),axis = 0)

Y_train = np.concatenate((Y_train,Y[0:(Y.shape[0]*8)//10]),axis = 0)
Y_test = np.concatenate((Y_test,Y[(Y.shape[0]*8)//10:]),axis = 0)

  data_malignant = gp.get_group(1)
  data_benign = gp.get_group(0)


### Logistic Regression

In [47]:
def sigmoid(z): # return numpy array with element wise operations
    return 1/(1+np.exp(-z))

def initialize_weights(n_features):
    weights = np.zeros(n_features)
    bias  = np.zeros(1)
    return weights,bias

def compute_cost(X,y,weights,bias):
    Y = sigmoid(X.dot(weights) + bias)
    SUM = np.sum(y * np.log10(Y) + (1-y) * np.log10(1-Y))
    return -SUM/y.shape[0]

def optimize_weights(X, y, weights, bias, learning_rate, num_iterations):
    for epoch in range(num_iterations):
        pred = sigmoid(X.dot(weights) + bias)
        grad = np.matmul(X.T,(y/pred - (1-y)/(1-pred)) * (pred * (1-pred))) * (-1/X.shape[0])
        #print(grad)
        weights = weights - grad * learning_rate
        bias_grad = np.sum(((y/pred - (1-y)/(1-pred)) * (pred * (1-pred))) * (-1/X.shape[0]))
        bias = bias - bias_grad * learning_rate

        
        print(f"epoch {epoch + 1} training loss : {compute_cost(X,y,weights,bias)}")
    return weights,bias

def train_logistic_regression(X, y, learning_rate, num_iterations):
    weights,bias = initialize_weights(X.shape[1])
    return optimize_weights(X, y,weights,bias,learning_rate, num_iterations)
    
def predict_logistic_regression(X, weights, bias):
    Y = sigmoid(X.dot(weights) + bias)
    Y = np.where(Y < 0.5,0.0,1.0)
    return Y

def accuracy(y_pred,y_true):
    correct = 0
    for i in range(y_true.shape[0]):
        if y_pred[i] == y_true[i]:
            correct+=1
    return correct/y_true.shape[0] 



In [48]:
weights,bias = train_logistic_regression(X_train,Y_train,0.0000082,100)
print("train accuracy Logistic Regression: ",accuracy(predict_logistic_regression(X_train,weights,bias),Y_train))
print("test accuracy Logistic Regression: ",accuracy(predict_logistic_regression(X_test,weights,bias),Y_test))

epoch 1 training loss : 0.31304369587060604
epoch 2 training loss : 0.34802431757763413
epoch 3 training loss : 0.44298018286057994
epoch 4 training loss : 0.3866376083594666
epoch 5 training loss : 0.48830663566081756
epoch 6 training loss : 0.3586702930990497
epoch 7 training loss : 0.4603060594788346
epoch 8 training loss : 0.35963017235303957
epoch 9 training loss : 0.4608668470705603
epoch 10 training loss : 0.3494025746180105
epoch 11 training loss : 0.4478627616763778
epoch 12 training loss : 0.34467197660628385
epoch 13 training loss : 0.4405698274067052
epoch 14 training loss : 0.3381181634628196
epoch 15 training loss : 0.4303376155460528
epoch 16 training loss : 0.3328806045253153
epoch 17 training loss : 0.4212911961115972
epoch 18 training loss : 0.32751550522328665
epoch 19 training loss : 0.4116904978191488
epoch 20 training loss : 0.32257083990735813
epoch 21 training loss : 0.40241480012999714
epoch 22 training loss : 0.31775683843522917
epoch 23 training loss : 0.3931

### KNN

In [49]:
def euclidean_distance(x1,x2):
    return np.sqrt(np.sum(np.square(x1-x2))).item()

def get_neighbors(X_train, X_test_instance, k):
    distances = np.array([euclidean_distance(X_test_instance,x2) for x2 in X_train])
    args = np.argsort(distances)
    return args[:k]

def predict_kNN(X_train,y_train,X_test,k):
    preds = []
    for X_test_instance in X_test:
        neighbors = get_neighbors(X_train,X_test_instance,k)
        ones = 0
        for idx in neighbors: 
            if y_train[idx] == 1: ones+=1
        if(ones > k//2): preds.append(1.0)
        else : preds.append(0.0)
    return np.array(preds)

In [50]:
for k in range(1,16,2):    
    Y_test_pred = predict_kNN(X_train,Y_train,X_test,k)
    print(f"test kNN accuracy for k  = {k}: ",accuracy(Y_test_pred,Y_test))

test kNN accuracy for k  = 1:  0.9043478260869565
test kNN accuracy for k  = 3:  0.9043478260869565
test kNN accuracy for k  = 5:  0.9217391304347826
test kNN accuracy for k  = 7:  0.9130434782608695
test kNN accuracy for k  = 9:  0.9304347826086956
test kNN accuracy for k  = 11:  0.9217391304347826
test kNN accuracy for k  = 13:  0.9391304347826087
test kNN accuracy for k  = 15:  0.9391304347826087


## Comparison and Analysis

Q1 â€“ Which model performs better and why?<br>
Ans - It can be seen that knn is able to achieve a higher accuracy of the test data.<br>
Which can be because a non-linear decision boundary might be more suitable for the data.<br>

Q2 - How does the choice of k affect the performance of kNN?<br>
Ans - The range of values of k from 5-15 produce very similiar result.<br>But values 1 and 3 produce slightly worse result.<br>Best result is seen for k = 13,15.<br>

Q3 - What are the strengths and limitations of Logistic Regression and kNN for this classification problem?<br>
Ans - <br>
Logistic Regression:<br>
Strengths: Efficient, good for high-dimensional data.<br>
Limitations: Assumes linear relationships, sensitive to outliers.<br>

kNN:<br>
Strengths: Simple to implement,can deal with non-linear decision boundaries.<br>
Limitations: Computationally expensive, sensitive to irrelevant features<br>

