In [12]:
# import libraries required
import numpy as np
import pandas as pd
from random import randrange
from random import seed

import warnings
warnings.filterwarnings( "ignore" )

**Task-1: Implement Logistic Regression from scratch**


In [3]:
# define a class Logistic_Regression with two methods fit and predict
# fit method will do gradient ascent and find parameter vector W
# predict method will predict the class labels as 1 or 0 for input X

In [4]:
class Logistic_Regression:
  """Binary logistic regression trained with batch gradient ascent.

  The model is P(y = 1 | x) = sigmoid(x . w + b).  The bias b is stored as
  the last entry of the parameter vector ``W`` and is handled by appending a
  column of ones to the input matrix.

  Parameters
  ----------
  learning_rate : float
      Step size applied to the gradient at every update.
  iterations : int
      Number of full-batch gradient ascent updates performed by ``fit``.

  Attributes set by ``fit``
  -------------------------
  W : ndarray of shape (n_features + 1, 1)
      Learned parameters, bias last.
  X : ndarray of shape (n_samples, n_features + 1)
      Training inputs with the bias column appended.
  y : the training labels exactly as passed in.
  """

  def __init__(self, learning_rate, iterations):
    self.learning_rate = learning_rate
    self.iterations = iterations

  @staticmethod
  def _sigmoid(z):
    """Return 1 / (1 + exp(-z)) element-wise."""
    # exp(-z) overflows to inf for very negative z; 1 / (1 + inf) still
    # evaluates to the correct limit 0.0, so only the warning is silenced.
    with np.errstate(over="ignore"):
      return 1.0 / (1.0 + np.exp(-z))

  # method for training
  def fit(self, X, y):
    """Estimate ``W`` from training data and return ``self``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of 0/1 labels, shape (n_samples,) or (n_samples, 1)

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or the number of labels differs from the
        number of rows in ``X``.
    """
    X = np.asarray(X, dtype=float)
    self.n_samples, self.n_features = X.shape

    # Always work with a float column vector.  Subtracting an (n, 1)
    # probability column from a 1-D label vector would silently broadcast to
    # an (n, n) matrix and corrupt the gradient.
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    if targets.shape[0] != self.n_samples:
      raise ValueError(
          "X has {} samples but y has {} labels".format(self.n_samples, targets.shape[0]))

    self.X = np.column_stack((X, np.ones(self.n_samples)))    # add a column of ones for the bias
    self.y = y
    self.W = np.zeros((self.n_features + 1, 1))               # parameter column vector, bias last

    # update parameter vector using gradient ascent on the log-likelihood
    for _ in range(self.iterations):
      probabilities = self._sigmoid(np.dot(self.X, self.W))
      dW = np.dot(self.X.T, targets - probabilities)          # gradient: X^T (y - p)
      self.W = self.W + self.learning_rate * dW               # ascent step

    return self

  # method for predicting the output labels
  def predict(self, X):
    """Return predicted class labels for ``X``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)

    Returns
    -------
    ndarray of shape (n_samples, 1)
        1 where sigmoid(XW) > 0.5, otherwise 0.
    """
    X = np.asarray(X, dtype=float)
    n_samples, n_features = X.shape
    X = np.column_stack((X, np.ones(n_samples)))              # add a column of ones for the bias
    probabilities = self._sigmoid(np.dot(X, self.W))
    return np.where(probabilities > 0.5, 1, 0)

**Task-2: Implement k-fold cross validation logic from scratch**

In [5]:
# define a class Kfold_cross_validation

In [6]:
class Kfold_cross_validation:
  """K-fold cross validation for a classifier exposing ``fit`` and ``predict``.

  Samples are assigned to folds by drawing indices with ``random.randrange``,
  so call ``random.seed`` beforehand for repeatable folds.  Every fold has
  ``len(X) // K`` samples; when ``len(X)`` is not a multiple of ``K`` the
  leftover ``len(X) % K`` samples are not placed in any fold.

  Parameters
  ----------
  K : int
      Number of folds.
  """

  def __init__(self, K):
    self.K = K

  # method for splitting the dataset into K parts
  def k_splits(self):
    """Randomly partition ``self.X`` / ``self.y`` into ``K`` equal-sized folds.

    ``self.X`` and ``self.y`` are set by ``validate``.

    Returns
    -------
    (X_splits, y_splits) : two lists of ``K`` lists holding the rows of each fold.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or ``K`` is not between 1 and
        the number of samples (which would give a zero division or empty folds).
    """
    n_samples = len(self.X)
    if len(self.y) != n_samples:
      raise ValueError(
          "X has {} samples but y has {} labels".format(n_samples, len(self.y)))
    if self.K < 1 or self.K > n_samples:
      raise ValueError(
          "K must be between 1 and the number of samples ({}), got {}".format(n_samples, self.K))

    X_splits = []
    y_splits = []

    X_pool = list(self.X)                             # rows not yet assigned to a fold
    y_pool = list(self.y)

    fold_size = n_samples // self.K                   # size of each fold

    for _ in range(self.K):
      X_fold = []
      y_fold = []

      for _ in range(fold_size):                      # draw fold_size rows without replacement
        index = randrange(len(X_pool))
        X_fold.append(X_pool.pop(index))
        y_fold.append(y_pool.pop(index))

      X_splits.append(X_fold)
      y_splits.append(y_fold)
    return X_splits, y_splits

  # method for calculating accuracy
  def accuracy(self, y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are flattened first, so (n,) and (n, 1) layouts may be mixed.

    Raises
    ------
    ValueError
        If there are no predictions or the two inputs differ in size.
    """
    actual = np.ravel(y_true)
    predicted = np.ravel(y_pred)
    if predicted.size == 0:
      raise ValueError("cannot compute accuracy of an empty prediction set")
    if actual.size != predicted.size:
      raise ValueError(
          "y_true has {} labels but y_pred has {}".format(actual.size, predicted.size))
    matches = np.asarray(actual == predicted, dtype=bool)
    return int(matches.sum()) / predicted.size

  # method for doing cross validation and finding the accuracy of each fold
  def validate(self, clf, X, y):
    """Cross-validate ``clf`` on ``X`` / ``y`` and return one accuracy per fold.

    For each fold in turn, ``clf`` is refitted on the remaining folds and
    scored on the held-out one.

    Raises
    ------
    ValueError
        If ``K`` is less than 2 (no training data would remain) or the
        conditions checked by ``k_splits`` are violated.
    """
    self.clf = clf
    self.X = X
    self.y = y

    if self.K < 2:
      raise ValueError(
          "K must be at least 2 so that every fold has training data, got {}".format(self.K))

    X_splits, y_splits = self.k_splits()                # get list of K folds
    scores = []                                         # accuracy of each fold
    for i in range(len(X_splits)):

      X_test = np.array(X_splits[i])                    # held-out fold
      y_test = np.array(y_splits[i])

      # Flatten the remaining folds in a single pass; sum(list_of_lists, [])
      # re-copies the accumulated list for every fold.
      X_train = np.array([row for j, fold in enumerate(X_splits) if j != i for row in fold])
      y_train = np.array([row for j, fold in enumerate(y_splits) if j != i for row in fold])

      self.clf.fit(X_train, y_train)                    # fit the classifier with train data
      y_pred = self.clf.predict(X_test)                 # predict the y values for test data

      scores.append(self.accuracy(y_test, y_pred))      # save the accuracy value

    return scores

**Task - 3**:
Train a binary classifier for the provided dataset using the implementation created in task 1. Use the k-fold cross validation implementation created in task 2 while training the model. Choose an appropriate value for ‘k’. Compute the mean and variance of classification accuracies of all ‘k’ trained models.

In [10]:
# define class for calculating sample mean and variance 
class Statistics:
  """Mean and variance of a sequence of numeric samples."""

  def __init__(self):
    pass

  # method for calculating mean
  def mean(self, samples):
    """Return the arithmetic mean of ``samples``.

    Raises
    ------
    ValueError
        If ``samples`` is empty.
    """
    if len(samples) == 0:
      raise ValueError("mean requires at least one sample")
    return sum(samples) / len(samples)

  # method for calculating variance
  def variance(self, samples, ddof=1):
    """Return the variance of ``samples``.

    Parameters
    ----------
    samples : sequence of numbers
    ddof : int, default 1
        Delta degrees of freedom; the divisor is ``len(samples) - ddof``.
        The default gives the unbiased sample variance, ``ddof=0`` the
        population variance.

    Raises
    ------
    ValueError
        If ``len(samples) - ddof`` is not positive.
    """
    divisor = len(samples) - ddof
    if divisor <= 0:
      raise ValueError(
          "variance with ddof={} requires more than {} samples, got {}".format(
              ddof, ddof, len(samples)))

    mean = self.mean(samples)
    squared_deviations = 0          # named so the builtin sum() is not shadowed
    for value in samples:
      squared_deviations += (value - mean) ** 2
    return squared_deviations / divisor

In [7]:
# Read the ionosphere data file into a DataFrame.
# header=None makes pandas number the columns instead of treating the first row as a header.
df = pd.read_csv(
    '/content/drive/MyDrive/datasets/ionosphere.data',
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,0.85243,-0.17755,0.59755,-0.44945,0.60536,-0.38223,0.84356,-0.38542,0.58212,-0.32192,0.56971,-0.29674,0.36946,-0.47357,0.56811,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,0.50874,-0.67743,0.34432,-0.69707,-0.51685,-0.97515,0.05499,-0.62237,0.33109,-1.0,-0.13151,-0.453,-0.18056,-0.35734,-0.20332,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,0.73082,0.05346,0.85443,0.00827,0.54591,0.00299,0.83775,-0.13644,0.75535,-0.0854,0.70887,-0.27502,0.43385,-0.12062,0.57528,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.14516,0.54094,-0.3933,-1.0,-0.54467,-0.69975,1.0,0.0,0.0,1.0,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,0.52798,-0.20275,0.56409,-0.00712,0.34395,-0.27457,0.5294,-0.2178,0.45107,-0.17813,0.05982,-0.35575,0.02309,-0.52879,0.03286,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [8]:
# split data into input X (every column except the last) and output class y (last column)
X = df.iloc[:, :-1].values

# Encode the output label: 'g' as 1 and everything else ('b') as 0.
# The comparison builds a new (n_samples, 1) integer array rather than rewriting the
# array returned by .values element by element, so re-running this cell gives the
# same y and the labels are numeric instead of object dtype.
y = (df.iloc[:, -1:].values == 'g').astype(int)

In [13]:
# seed Python's random module so the randrange-based fold assignment is repeatable
seed(10)

# logistic regression classifier with learning_rate=0.01 and iterations=1000
clf = Logistic_Regression(learning_rate=0.01, iterations=1000)

# 5-fold cross validation of clf on data X and y; one accuracy per fold
CV = Kfold_cross_validation(K=5)
scores = CV.validate(clf, X, y)

# report the per-fold accuracies together with their mean and variance
stat = Statistics()
print(
    'Accuracy of each validation:{}'.format(scores),
    'Mean accuracy:{}'.format(stat.mean(scores)),
    'Variance of accuracies:{}'.format(stat.variance(scores)),
    sep='\n',
)

Accuracy of each validation:[0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8571428571428571, 0.8857142857142857]
Mean accuracy:0.8628571428571428
Variance of accuracies:0.0001632653061224491


In [14]:
# reseed so the fold assignment starts from the same random state as the other runs
seed(10)

# repeat the cross validation with K = 3, reusing clf and stat from the K = 5 cell
CV = Kfold_cross_validation(K=3)
scores = CV.validate(clf, X, y)

print(
    'Accuracy of each validation:{}'.format(scores),
    'Mean accuracy:{}'.format(stat.mean(scores)),
    'Variance of accuracies:{}'.format(stat.variance(scores)),
    sep='\n',
)

Accuracy of each validation:[0.8547008547008547, 0.905982905982906, 0.8974358974358975]
Mean accuracy:0.886039886039886
Variance of accuracies:0.0007548640027272522


In [15]:
# reseed so the fold assignment starts from the same random state as the other runs
seed(10)

# repeat the cross validation with K = 10, reusing clf and stat from the K = 5 cell
CV = Kfold_cross_validation(K=10)
scores = CV.validate(clf, X, y)

print(
    'Accuracy of each validation:{}'.format(scores),
    'Mean accuracy:{}'.format(stat.mean(scores)),
    'Variance of accuracies:{}'.format(stat.variance(scores)),
    sep='\n',
)

Accuracy of each validation:[0.9142857142857143, 0.8285714285714286, 0.8857142857142857, 0.8857142857142857, 0.9428571428571428, 0.8285714285714286, 0.8857142857142857, 0.8571428571428571, 0.9142857142857143, 0.9142857142857143]
Mean accuracy:0.8857142857142856
Variance of accuracies:0.0014512471655328777
