# Loading Data

In [None]:
# Authenticate the current Colab user before any data is loaded.
# NOTE(review): presumably required by the %%bigquery cells below — confirm.
from google.colab import auth
import google.colab
auth.authenticate_user()
# Visible confirmation that the authentication call returned.
print('Authenticated')

Authenticated


In [None]:
%%bigquery --project formal-branch-269704 X_full
SELECT * FROM `formal-branch-269704.home_credit_default.X_train`

In [None]:
%%bigquery --project formal-branch-269704 y_full
SELECT * FROM `formal-branch-269704.home_credit_default.y_train`

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.preprocessing import StandardScaler

# Seed for the balanced sub-sampling so that Restart & Run All reproduces the
# same training set (and therefore the same fitted coefficients).
SEED = 42
# Number of leading feature columns fed to every model.
N_FEATURES = 16

# Dropping first column. Work on an explicit copy so that adding the label
# column below never writes into a slice of X_full.
# NOTE(review): X_full and y_full come from two separate queries and are paired
# by row position here — confirm the row order of both results is guaranteed.
Xfull = X_full.iloc[:, 1:].copy()
Xfull['target'] = y_full['TARGET']
y_complete = y_full['TARGET']

# To account for unbalanced data we train on balanced sub-samples
class1 = Xfull[Xfull['target'] == 1].sample(10000, random_state=SEED)
class0 = Xfull[Xfull['target'] == -1].sample(10000, random_state=SEED)
train = pd.concat([class1, class0])

# Fit the scaler on the training sample only, then reuse the SAME fitted scaler
# for the complete data set. The coefficients learned on X are later applied to
# X_complete, so both matrices must live in the same standardized space.
scaler = StandardScaler()
X = scaler.fit_transform(train.iloc[:, 0:N_FEATURES])
X_complete = scaler.transform(X_full.iloc[:, 1:].iloc[:, 0:N_FEATURES])
y = train['target']

n, p = X.shape

# Setting regularization strength
lam = 0.10819565

# Target labels for logistic regression needs to be (0,1)
y_logr = (y + 1) / 2

# Logistic Regression w/ Ridge Penalty

**Objective Function for Logistic Regression with Ridge Penalty**
$$ p_i = \frac{e^{\alpha+{\bf x}_i\boldsymbol{\beta}}}{1+e^{\alpha+{\bf x}_i\boldsymbol{\beta}}}$$
$$ J(\alpha,\boldsymbol{\beta}) = \frac{1}{n}\sum_{i=1}^{n}\left[-y_i\log(p_i) - (1-y_i)\log(1-p_i)\right] + \lambda \sum_{j=1}^{p}\beta^2_j $$
$$ \frac{\partial J}{\partial \alpha} = -\frac{1}{n} \sum_{i=1}^n (y_i - p_i) $$
$$ \frac{\partial J}{\partial \boldsymbol{\beta}} = \frac{1}{n} \sum_{i=1}^n (-y_i {\bf x}_i^{T} + p_i  {\bf x}_i^T) + 2\lambda\boldsymbol{\beta} $$
$$ \frac{\partial J}{\partial \boldsymbol{\beta}} = \frac{1}{n} (-X^{T}{\bf y} + X^T{\bf p}) + 2\lambda\boldsymbol{\beta} $$

In [None]:
# Sklearn model for Logistic Regression with Ridge Penalty
from sklearn.linear_model import LogisticRegression

# Inverse regularization strength handed to sklearn, derived from lam and the
# number of training rows.
ridge_C = 1 / (2 * n * lam)
clf = LogisticRegression(C=ridge_C).fit(X, y_logr)
(clf.intercept_, clf.coef_)

(array([-0.03854883]),
 array([[-0.03752668,  0.01531697,  0.00656067, -0.03425379,  0.41324003,
         -0.69639037, -0.02014574, -0.01225417,  0.00671573, -0.19576206,
          0.01517515, -0.07876296,  0.0664365 ,  0.03174906, -0.01426948,
          0.1004592 ]]))

In [None]:
# Function for running Logistic Ridge Regression

def logistic_reg(X, y, lr = .01, lam = lam, n_iter = 5000):
  """Fit ridge-penalized logistic regression by batch gradient descent.

  Args:
    X: (n, p) feature matrix.
    y: length-n labels coded as 0/1.
    lr: gradient-descent step size.
    lam: ridge penalty strength (adds lam * sum(beta_j ** 2) to the loss).
    n_iter: number of gradient steps.

  Returns:
    Tuple (alpha, betas): the fitted intercept and the length-p coefficients.
  """
  X = np.asarray(X, dtype=float)
  y = np.asarray(y, dtype=float)
  n, p = X.shape

  # Starting point: zero intercept, every coefficient set to one
  alpha = 0.0
  betas = np.ones(p)

  # Gradient Descent with Ridge Penalty
  for _ in range(n_iter):

    # Sigmoid of the linear predictor, written as exp(-log(1 + exp(-z))) so
    # that very large |z| cannot overflow into inf/inf = nan.
    z = alpha + X @ betas
    p_i = np.exp(-np.logaddexp(0, -z))

    # Residuals are shared by both gradients
    residual = y - p_i

    # Alpha gradient
    grad_alpha = -residual.mean()
    # Beta gradient (data term plus derivative of the ridge penalty)
    grad_betas = -(1/n) * (X.T @ residual) + 2*lam*betas

    # Taking steps in the opposite direction of the gradient
    alpha = alpha - lr*grad_alpha
    betas = betas - lr*grad_betas

  return alpha, betas

logr_alpha, logr_betas = logistic_reg(X, y_logr)
logr_alpha, logr_betas

(-0.038523121844216676,
 array([-0.03752474,  0.01531806,  0.00656113, -0.03425316,  0.41324359,
        -0.69638376, -0.02014513, -0.01225363,  0.00671535, -0.19576242,
         0.01517557, -0.07876255,  0.0664364 ,  0.03174901, -0.01426961,
         0.10046109]))

# Support Vector Machines (SVM)

**Objective Function for SVM**

$$\displaystyle J(\alpha,\beta) = \frac{1}{n}\sum_{i=1}^n max(1-y_i(\alpha+\vec{x}_i\vec{\beta}),0) +\lambda\sum_{j=1}^p\beta_j^2$$

$$\frac{\partial J}{\partial \alpha} = \frac{1}{n}\sum_{i=1}^n\begin{cases} 
      0 & 1-y_i(\alpha+\vec{x}_i\vec{\beta}) < 0 \\
      -y_i & 1-y_i(\alpha+\vec{x}_i\vec{\beta}) >0 
   \end{cases}$$

$$\frac{\partial J}{\partial \beta} = (\frac{1}{n}\sum_{i=1}^n\begin{cases} 
      0 & 1-y_i(\alpha+\vec{x}_i\vec{\beta}) < 0 \\
      -y_i\vec{x}_i & 1-y_i(\alpha+\vec{x}_i\vec{\beta}) >0 
   \end{cases}) + 2\lambda \vec{\beta}$$

<br>

scikit-learn's implementation uses the objective function

$$\displaystyle J(\alpha,\beta) = C\sum_{i=1}^n max(1-y_i(\alpha+\vec{x}_i\vec{\beta}),0) + \frac{1}{2} \sum_{j=1}^p\beta_j^2$$

If we rescale our objective function by $\frac{1}{2\lambda}$ (which does not change the minimizer), we get

$$\displaystyle J(\alpha,\beta) = \frac{1}{2n\lambda}\sum_{i=1}^n max(1-y_i(\alpha+\vec{x}_i\vec{\beta}),0) + \frac{1}{2}\sum_{j=1}^p\beta_j^2$$

We can see that $\displaystyle C=\frac{1}{2n\lambda}$

After hyperparameter tuning, we found that the lambdas ended up being similar for Perceptron and Logistic Regression. In fact, the actual decision boundaries were also very similar. For SVM, however, the tuned lambda was completely different. This makes sense because the regularization term is different for SVM, so we needed a different lambda for SVM to make it converge.

In [None]:
# Sklearn model for SVM
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# C follows the 1 / (2 * n * lambda) mapping, with lambda fixed to 1 for SVM
svm_C = 1 / (2 * n * 1)
clf = SVC(kernel='linear', C=svm_C).fit(X, y)
(clf.intercept_, clf.coef_)

(array([0.02552482]),
 array([[-0.03089084,  0.01904228,  0.00250013, -0.02380001,  0.29342944,
         -0.3695291 , -0.0069959 , -0.00443586,  0.00275573, -0.11035444,
          0.01716461, -0.02613729,  0.05060491,  0.0214118 , -0.01206518,
          0.06225573]]))

In [None]:
# My SVM implementation
import numpy as np

def linear_svm(X, y, lam=1, lr=0.01, n_iter=5000):
  """Fit a linear soft-margin SVM (hinge loss + ridge penalty) by subgradient descent.

  Args:
    X: (n, p) feature matrix.
    y: length-n labels coded as -1/+1.
    lam: ridge penalty strength (adds lam * sum(beta_j ** 2) to the loss).
    lr: step size.
    n_iter: number of subgradient steps.

  Returns:
    Tuple (alpha, betas): the fitted intercept and the length-p coefficients.
  """
  X = np.asarray(X, dtype=float)
  y = np.asarray(y, dtype=float)
  n, p = X.shape

  # Starting point: zero intercept, every coefficient set to one
  alpha = 0.0
  betas = np.ones(p)

  for _ in range(n_iter):
    # Rows whose hinge term is active, i.e. 1 - y * (alpha + x @ betas) > 0
    incorrect_map = (1 - y * (alpha + X @ betas)) > 0
    # y_i for the active rows, 0 for rows safely outside the margin
    active_y = y * incorrect_map

    # alpha gradient is the sum of -y over the active rows divided by n
    d_alpha = -active_y.sum() / n
    # beta gradient is the sum of -y * x over the active rows divided by n,
    # plus the derivative of the coefficient penalty
    d_beta = -(active_y @ X) / n + 2 * lam * betas

    # Take steps in the opposite direction of the gradient
    alpha -= lr * d_alpha
    betas -= lr * d_beta

  return alpha, betas

svm_alphas, svm_betas = linear_svm(X, y)
svm_alphas, svm_betas

(0.02562250000000107,
 array([-0.03089075,  0.01902537,  0.00250013, -0.02378231,  0.29341174,
        -0.36951094, -0.00695738, -0.00442313,  0.00280566, -0.1103313 ,
         0.0171829 , -0.02613464,  0.05057741,  0.02140252, -0.01204823,
         0.06224347]))

# Perceptron



**Objective Function for Perceptron**

$$ J(\alpha, \boldsymbol \beta) = \frac{1}{n} \sum_{i=1}^{n} (y_i - \alpha -  \boldsymbol{x_i\beta})^2 + \lambda \sum_{j=1}^{p} \beta_j^2 $$

$$ \frac{\partial J}{\partial \alpha} = \frac{1}{n} \sum_{i=1}^{n} -2(y_i - \alpha - \boldsymbol{x_i\beta}) $$

$$ \alpha = \bar y - \bar X \boldsymbol{\beta} $$

$$ \frac{\partial J}{\partial \beta} = \frac{1}{n} \sum_{i=1}^{n} 2 (y_i - \alpha -  x_i \boldsymbol{\beta}) (-\boldsymbol{x_i^T}) + 2\lambda\boldsymbol{\beta}$$

$$ \beta = (\tilde X^T \tilde X + n\lambda I)^{-1} \tilde X ^T \boldsymbol{\tilde y}$$

Where $\tilde X$ and $\tilde y$ represent the centered $X$'s and centered $y$'s, respectively

In [None]:
# Sklearn ridge classifier implementation (Perceptron w/ Ridge penalty)
from sklearn.linear_model import RidgeClassifier

# Penalty strength passed to sklearn: lam multiplied by the number of rows
ridge_alpha = lam * n
clf = RidgeClassifier(alpha=ridge_alpha, solver='lsqr').fit(X, y)
(clf.intercept_, clf.coef_)

(array([1.04143196e-16]),
 array([[-1.93479563e-03, -3.10344687e-03,  2.19312877e-03,
         -1.04603033e-02,  1.14807609e-01, -5.98235352e-01,
         -5.05457379e-03, -3.63996704e-03,  2.24734384e-03,
         -1.02830702e-01, -4.78831926e-04, -4.69657928e-02,
          7.37622727e-03,  1.18709193e-02, -5.55994216e-04,
          3.11265812e-02]]))

In [None]:
# Function for running Perceptron with a ridge penalty

def perceptron(X, y, lam=lam, lr=0.1, n_iter=5000):
  """Fit the ridge-penalized squared-error classifier by batch gradient descent.

  Args:
    X: (n, p) feature matrix.
    y: length-n labels coded as -1/+1.
    lam: ridge penalty strength (adds lam * sum(beta_j ** 2) to the loss).
    lr: gradient-descent step size.
    n_iter: number of gradient steps.

  Returns:
    Tuple (alpha, betas): the fitted intercept and the length-p coefficients.
  """
  X = np.asarray(X, dtype=float)
  y = np.asarray(y, dtype=float)
  # Take the dimensions from the argument rather than from notebook globals,
  # so the function works for any X it is handed.
  n, p = X.shape

  # Starting point: zero intercept, every coefficient set to one
  alpha = 0.0
  betas = np.ones(p)

  # Gradient Descent with Ridge penalty
  for _ in range(n_iter):
    residual = y - alpha - X @ betas
    # Alpha Gradient: -2 * mean residual. Equivalent to
    # -2 * (y_mean - alpha - column_means @ betas), using per-column means.
    d_alpha = -2 * residual.mean()
    # Beta Gradient (data term plus derivative of the ridge penalty)
    d_beta = -2 / n * (X.T @ residual) + 2*lam*betas
    # Taking steps in the opposite direction of the gradient
    alpha -= lr * d_alpha
    betas -= lr * d_beta

  return alpha, betas

perc_alpha, perc_betas = perceptron(X, y)
perc_alpha, perc_betas

(1.2169513989826738e-17,
 array([-2.72954145e-03, -3.02641854e-03,  2.08634337e-03, -1.10701660e-02,
         1.14794997e-01, -5.98271718e-01, -4.67684345e-03, -3.74946156e-03,
         1.92311604e-03, -1.02747869e-01, -4.88957386e-04, -4.71597585e-02,
         7.10304012e-03,  1.14845264e-02, -5.60290871e-04,  3.08210968e-02]))

In [None]:
# Functions to make predictions for each model
def make_predictions(X, alpha, betas, logreg=False):
  """Classify every row of X with a fitted linear model.

  Args:
    X: 2-D array of features; only the first len(betas) columns are used.
    alpha: intercept of the fitted model.
    betas: coefficient vector of the fitted model.
    logreg: if True, labels are 1/0 (logistic regression coding);
      otherwise labels are 1/-1.

  Returns:
    A list of int labels, one per row of X.
  """
  if len(X) == 0:
    return []

  betas = np.asarray(betas, dtype=float)
  # Only the leading len(betas) columns take part in the linear predictor
  features = np.asarray(X, dtype=float)[:, :betas.shape[0]]
  scores = alpha + features @ betas

  # sigmoid(score) >= 0.5 exactly when score >= 0, so logistic regression can
  # be thresholded on the raw score too. This avoids exp() overflowing to nan
  # for large scores, which would otherwise fall through to the 0 class.
  negative_label = 0 if logreg else -1
  return np.where(scores >= 0, 1, negative_label).tolist()

In [None]:
# Score the complete data set with each fitted model. Only logistic
# regression uses the 0/1 label coding.
logr_preds = make_predictions(X=X_complete, alpha=logr_alpha, betas=logr_betas, logreg=True)
perc_preds = make_predictions(X=X_complete, alpha=perc_alpha, betas=perc_betas)
svm_preds = make_predictions(X=X_complete, alpha=svm_alphas, betas=svm_betas)

In [None]:
# Logistic-regression predictions are coded 0/1, so recode the -1/+1 truth to match
logr_y_complete = (y_complete+1)/2

from sklearn.metrics import f1_score
# F1 of the positive class for each model, each against truth in its own coding
logr_f1, svm_f1, perc_f1 = (
    f1_score(truth, guess, pos_label=1, average='binary')
    for truth, guess in [
        (logr_y_complete, logr_preds),
        (y_complete, svm_preds),
        (y_complete, perc_preds),
    ]
)

(logr_f1, svm_f1, perc_f1)

(0.3963236942389599, 0.3914907077896402, 0.4332157765312284)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy for each model, each against truth in its own label coding
logr_acc, svm_acc, perc_acc = (
    accuracy_score(truth, guess)
    for truth, guess in [
        (logr_y_complete, logr_preds),
        (y_complete, svm_preds),
        (y_complete, perc_preds),
    ]
)

(logr_acc, svm_acc, perc_acc)

(0.754033441848631, 0.7490034642258336, 0.788731153893829)

In [None]:
# Put all three prediction vectors on the -1/+1 coding
# (logistic regression predictions are 0/1, so map them with 2*p - 1).
preds1 = pd.Series(logr_preds).mul(2).sub(1)
preds2 = pd.Series(svm_preds)
preds3 = pd.Series(perc_preds)

# Wide frame of predictions plus one feature column, reshaped to long format
# with one row per (observation, model).
preds = (
    pd.DataFrame({'LogReg': preds1, 'SVM': preds2, 'Perceptron': preds3})
    .assign(EXT_SOURCE_2=Xfull['remainder__EXT_SOURCE_2'])
    .melt(id_vars=['EXT_SOURCE_2'], value_vars=['LogReg', 'SVM', 'Perceptron'])
)