In [11]:
import numpy as np
import numpy.linalg as LA
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [12]:
#Load the data
iris = datasets.load_iris()

#convert to dataframe
df = pd.DataFrame(iris.data, columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
df.insert(4,'Target',iris.target)
# target = [0,1,2] corresponds to Setosa, Versicolour, and Virginica, resp.

#check head of df
df.head()
#description of df
df.describe()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [13]:
#Extract features and labels

#feature data
X =  df[['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']].values

#remove the mean
X_mean = np.mean(X,axis=0)
X_no_mean = X - X_mean

#add columm of ones to compute intercept value
X_aug = np.concatenate( (X_no_mean,np.ones((len(X),1))) , axis=1)

#targets
y = df[['Target']].values

#one hot encoding of y_train
enc = OneHotEncoder()
enc.fit([[0],[1],[2]])
y_OH = enc.transform(y).toarray()

#split data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X_aug, y, test_size=0.2, random_state=0)

In [14]:
#softmax function (logits)
def softmax(z):
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z,axis=1).reshape(-1,1)

#objective function
def logprobs(probs, y_one_hot):
    return -np.mean(np.sum(y_one_hot * np.log(probs),axis=1))

#compute probabilities (normalized scores)
def compute_probs(X, theta):
    return softmax(np.dot(X,theta))

In [15]:
#solve with sklearn
#sklearn logistic regression
c = 1 #regularization coefficient (expressed as 1/c in sklearn)
softmax_reg = LogisticRegression(multi_class='multinomial',solver='lbfgs',fit_intercept=True,C=c)
softmax_reg.fit(X_no_mean,y.squeeze())
softmax_reg.get_params()
coefs = softmax_reg.coef_
intercept = softmax_reg.intercept_
print("feature weights:", coefs)
print("intercepts:", intercept)

theta_sklearn = np.concatenate((coefs, intercept.reshape(-1,1)), axis=1)
probs = compute_probs(X_aug, theta_sklearn.T)
loss_sklearn = logprobs(probs, y_OH)
print('sklearn loss: ', loss_sklearn)

sklearn_scores = np.round(probs)
sklearn_accuracy = (y_OH == sklearn_scores).sum().astype(float) / len(y_OH) / 3
print('skelarn accuracy: ', sklearn_accuracy)

feature weights: [[-0.42348033  0.96739605 -2.51712086 -1.07936828]
 [ 0.53444333 -0.32162603 -0.20642703 -0.94425181]
 [-0.110963   -0.64577002  2.7235479   2.02362009]]
intercepts: [-0.42151116  2.46887351 -2.04736235]
sklearn loss:  0.11963709827020219
skelarn accuracy:  0.9822222222222222


In [16]:
#Solve from scratch

max_iters = 20000
alpha = 1e-1
theta0 = np.ones([X_aug.shape[1], y_OH.shape[1]])
theta = []
theta.append(theta0)
k = 0

ll = []

while k < max_iters:
    
    #compute probability scores (normalize logits through softmax)
    probs = compute_probs(X_aug,theta[k])
    #compute prediction error
    e = y_OH - probs
    
    #compute gradient
    #don't penalize the bias term
    theta_regularization = np.copy(theta[k])
    theta_regularization[4,::]=0
    grad = 1/len(X_aug) * (np.dot(X_aug.T, e) - 1/c * theta_regularization)
    
    #update weights with gradient ascent
    theta.append(theta[k] + alpha * grad)
    
    #compute loss function
    loss = logprobs(probs, y_OH)
    ll.append(loss)
    if k%2000 == 0:
        print('iteration: ', k, '- loss:', loss)
    k+=1
    
print('final log liklihood loss: ', loss)
print('scratch coefficients: ', theta[k])

iteration:  0 - loss: 1.0986122886681096
iteration:  2000 - loss: 0.12653808506368472
iteration:  4000 - loss: 0.12051464992086894
iteration:  6000 - loss: 0.11980823400344635
iteration:  8000 - loss: 0.11968142996558431
iteration:  10000 - loss: 0.11965010104112833
iteration:  12000 - loss: 0.11964091896023404
iteration:  14000 - loss: 0.11963804155197351
iteration:  16000 - loss: 0.11963711898702958
iteration:  18000 - loss: 0.11963682091784501
final log liklihood loss:  0.11963672437977206
scratch coefficients:  [[-0.42351163  0.53446451 -0.11094804]
 [ 0.9673487  -0.32158537 -0.64575849]
 [-2.5171512  -0.20638836  2.7235444 ]
 [-1.07933591 -0.94429604  2.02363679]
 [ 0.57841986  3.46889278 -1.04731264]]


In [17]:
# compare scores
probs = compute_probs(X_aug,theta[k])
scratch_accuracy = (y_OH == np.round(probs)).sum().astype(float) / len(y_OH) / 3
print('scratch accuracy: ', scratch_accuracy)
print('sklearn accuracy: ', sklearn_accuracy)

scratch accuracy:  0.9822222222222222
sklearn accuracy:  0.9822222222222222
