In [1]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
import autograd.numpy as np
from autograd import grad
                                                                           
# Seed NumPy's global random generator so that random draws (e.g. the
# minibatch selection via np.random.randint in simple_SGD) repeat across runs.
np.random.seed(100)

ModuleNotFoundError: No module named 'autograd'

In [2]:
def cross_entropy_loss(X, y, beta):
  """Summed (not averaged) binary cross-entropy of a logistic model.

  With logits z = X @ beta and p = sigmoid(z), the loss is
  -sum(y*log(p) + (1-y)*log(1-p)), which simplifies algebraically to
  sum(log(1 + exp(z)) - y*z). The simplified form is evaluated here because
  it stays finite for large |z|, where exp(z)/(1+exp(z)) overflows to
  inf/inf = nan and log(p) or log(1-p) becomes -inf.

  No regularisation term is included in the returned value.

  Parameters
  ----------
  X : array with one row per datapoint; must be matrix-multipliable with
      beta reshaped to a column.
  y : array of 0/1 (or boolean) labels, one per row of X.
  beta : array of model coefficients; any shape that reshapes to a column.

  Returns
  -------
  Scalar loss summed over all datapoints.

  Raises
  ------
  TypeError
      If the number of rows of X differs from len(y).
  """
  if X.shape[0] != len(y):
    raise TypeError("The number of datapoints must match between X and y")

  y = y.reshape(-1, 1)
  beta = beta.reshape(-1, 1)
  z = X @ beta  # logits, one per datapoint

  # logaddexp(0, z) == log(1 + exp(z)) without overflow.
  # z is kept as the left operand of the product so that, under autograd,
  # the traced value (not the plain label array) drives the multiplication.
  cost = np.sum(np.logaddexp(0.0, z) - z * y)
  return cost


def design_matrix(x_train, x_test):
  """Standardise a train/test pair of feature matrices.

  A scikit-learn StandardScaler is fitted on the training matrix only and
  then applied to both matrices, so the test data are transformed with the
  training statistics and never influence them.

  Parameters
  ----------
  x_train : feature matrix used to fit the scaler.
  x_test : feature matrix transformed with the scaler fitted on x_train.

  Returns
  -------
  (x_train_scaled, x_test_scaled) : the two standardised matrices.
  """
  # Local import keeps the function usable even if the notebook's import
  # cell did not run to completion.
  from sklearn.preprocessing import StandardScaler

  scaler = StandardScaler()
  # Use the function's own arguments (not same-named notebook globals) and
  # return the scaled results rather than the raw inputs.
  x_train_scaled = scaler.fit_transform(x_train)
  x_test_scaled = scaler.transform(x_test)

  return x_train_scaled, x_test_scaled


def simple_SGD(X_train_scaled, y_train, beta , Minibatch_size = 5 ,lr_eta = 0.1, n_epochs = 1000):
  """Minimise cross_entropy_loss over beta with minibatch stochastic gradient descent.

  The row indices are cut once into m contiguous, nearly equal minibatches
  (the data are not shuffled). Each epoch performs m updates, and every
  update uses a minibatch drawn uniformly at random with replacement, so
  within one epoch some minibatches may be used repeatedly and others not
  at all. Gradients are obtained by automatic differentiation of
  cross_entropy_loss with respect to its third argument.

  Parameters
  ----------
  X_train_scaled : feature array with one row per datapoint.
  y_train : label array with one entry per row of X_train_scaled.
  beta : coefficient array. NOTE: it is updated IN PLACE and the same
      object is returned; pass a copy to keep the starting values.
  Minibatch_size : target number of datapoints per minibatch (default 5).
      Values larger than the dataset fall back to a single full batch.
  lr_eta : learning rate multiplying each gradient step (default 0.1).
  n_epochs : number of passes of m updates each (default 1000).

  Returns
  -------
  beta after the final update (the same array object that was passed in).

  Raises
  ------
  TypeError
      If the number of rows of X_train_scaled differs from len(y_train).
  ValueError
      If Minibatch_size is not positive.
  """
  if X_train_scaled.shape[0] != len(y_train):
    raise TypeError("The number of datapoints must match between X and y")
  if Minibatch_size <= 0:
    raise ValueError("Minibatch_size must be positive")

  n = len(y_train)  # number of datapoints
  # Number of minibatches; never below one so that Minibatch_size > n
  # degrades to full-batch descent instead of failing.
  m = max(1, int(n / Minibatch_size))

  # Contiguous index blocks; the first n % m blocks get one extra element.
  batch_indices = np.array_split(np.arange(n), m)

  # Build the gradient function once: index 2 selects beta as the argument
  # to differentiate with respect to.
  gradient_wrt_beta = grad(cross_entropy_loss, 2)

  for epoch in range(n_epochs):
    for _ in range(m):
      # Pick one minibatch at random (with replacement).
      chosen_batch = np.random.randint(m)
      rows = batch_indices[chosen_batch]

      # Gradient step on the chosen minibatch; modifies beta in place.
      beta -= lr_eta * gradient_wrt_beta(X_train_scaled[rows], y_train[rows], beta)

  return beta

# Example from lecture notes

In [None]:
# Build a toy dataset.
inputs = np.array([[0.52, 1.12, 0.77],
[0.88, -1.08, 0.15],
[0.52, 0.06, -1.30],
[0.74, -2.49, 1.39]])
targets = np.array([True, True, False, True])


# Optimize weights using gradient descent.
weights = np.array([0.0, 0.0, 0.0])
print("Initial loss:", cross_entropy_loss(inputs, targets,weights))

# Minibatch_size = len(targets) yields a single minibatch, i.e. plain
# full-batch gradient descent on the four datapoints.
weights_opt =  simple_SGD(inputs, targets, weights, Minibatch_size = len(targets),lr_eta = 0.01)

# Report through the returned coefficients. simple_SGD updates its beta
# argument with `-=`, so `weights` happens to alias the result, but reading
# the return value does not depend on that side effect.
print("Trained loss:", cross_entropy_loss(inputs, targets,weights_opt))
print("Weights :",weights_opt)


# Part g) Logistic Regression code (Wisconsin)

In [None]:
# Load the data
cancer = load_breast_cancer()
X, y =cancer.data,cancer.target

## Cross-validation based Logistic regression
ksplits = 5
kfold = KFold(n_splits = ksplits )
lr_eta = [1e-1]

# est_test_acc[i, k] = test accuracy for learning rate i on fold k
est_test_acc = np.zeros((len(lr_eta), ksplits ))

for n_lr,lr in  enumerate(lr_eta):

  ## split the data into 5 folds and evaluate performance 
  for fold, (train_ind, test_ind) in enumerate(kfold.split(X)):
    print('Fold', fold+1)
    X_train , X_test = X[train_ind], X[test_ind]
    y_train, y_test = y[train_ind], y[test_ind]

    ## Create design matrix with standard scaling
    X_train_scaled, X_test_scaled  = design_matrix(X_train, X_test)
    
    # Start every fold from fresh zero coefficients (simple_SGD updates the
    # array it is given, so it must not be shared between folds).
    beta = np.zeros((X_train_scaled.shape[1],1))
    
    print('Cross Entropy loss:',cross_entropy_loss(X_train_scaled, y_train, beta))
    
    # optimize betas using simple_sgd, on the same matrix the loss is reported for
    beta_opt =  simple_SGD(X_train_scaled, y_train, beta, Minibatch_size = len(y_train), lr_eta = lr)

    print('Cross Entropy loss:',cross_entropy_loss(X_train_scaled, y_train, beta_opt))

    # find accuracy on test set with beta_opt:
    # a positive logit corresponds to a predicted probability above 0.5,
    # i.e. class 1. ravel() turns the (n, 1) column into shape (n,) so the
    # comparison with y_test is element-wise rather than broadcast to (n, n).
    y_pred = ((X_test_scaled @ beta_opt).ravel() > 0).astype(int)
    
    est_test_acc[n_lr , fold] = np.mean(y_pred== y_test)



print(est_test_acc)

# Part g) Scikit Learn's Logistic Regression

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import  train_test_split 
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

# Fetch the breast-cancer data and hold out a test split
# (random_state fixed so the split is the same on every run).
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data,cancer.target,random_state=0)
print(X_train.shape)
print(X_test.shape)

# Baseline: logistic regression fitted on the raw, unscaled features.
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(X_train, y_train)
raw_accuracy = logreg.score(X_test,y_test)
print("Test set accuracy with Logistic Regression: {:.2f}".format(raw_accuracy))


# Standardise the features: the scaler is fitted on the training split only
# and the test split is transformed with those training statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Refit the same estimator on the scaled features and score it again.
logreg.fit(X_train_scaled, y_train)
scaled_accuracy = logreg.score(X_test_scaled,y_test)
print("Test set accuracy Logistic Regression with scaled data: {:.2f}".format(scaled_accuracy))