In [1]:
import torch
import torchvision
import numpy as np
from torch import nn,optim

In [3]:
# Seed the global torch RNG before any random draw so the synthetic data (and
# every later draw from the same RNG, e.g. the torch.randn-based parameter
# initialisation) is repeatable under Restart Kernel -> Run All.
SEED=0
torch.manual_seed(SEED)

# Linear-regression setup with far fewer training samples (20) than inputs (200).
n_train,n_test,num_inputs=20,100,200
# Ground truth: every weight is 0.01 and the bias is 0.05.
true_w,true_b=torch.ones(num_inputs,1)*0.01 ,0.05

# Standard-normal features; labels follow the true linear model plus
# zero-mean Gaussian noise with standard deviation 0.01.
features=torch.randn((n_train+n_test,num_inputs))
labels= torch.matmul(features,true_w)+true_b
# Noise is drawn with torch (not NumPy) so a single seeded RNG covers the cell
# and no float64 -> float32 conversion is needed.
labels+=0.01*torch.randn(labels.size())

# The first n_train rows are the training split; the remainder is the test split.
train_features,test_features=features[:n_train],features[n_train:]
train_labels,test_labels=labels[:n_train],labels[n_train:]

In [4]:
def init_params():
    """Create fresh trainable parameters for the from-scratch linear model.

    Reads the module-level ``num_inputs``.

    Returns:
        list: ``[w, b]`` where ``w`` is a ``(num_inputs, 1)`` tensor drawn from
        a standard normal and ``b`` is a one-element zero tensor; both track
        gradients.
    """
    weight = torch.randn(num_inputs, 1).requires_grad_(True)
    bias = torch.zeros(1).requires_grad_(True)
    return [weight, bias]

In [5]:
def l2_penalty(w):
    """Return half the sum of squared entries of ``w`` as a scalar tensor."""
    sum_of_squares = w.pow(2).sum()
    return sum_of_squares / 2

In [6]:
def net(x,w,b):
    """Linear model: matrix product of ``x`` and ``w`` plus the bias ``b``.

    ``x`` and ``w`` must be 2-D because the product is computed with
    ``torch.mm``; ``b`` is added by broadcasting.
    """
    projected = x.mm(w)
    return projected + b

In [7]:
def squared_loss(y_hat,y):
    """Element-wise halved squared error between predictions and targets.

    ``y`` is viewed with the same size as ``y_hat`` before subtracting, so the
    result has the shape of ``y_hat`` (no reduction is applied).
    """
    residual = y_hat - y.view_as(y_hat)
    return residual.pow(2) / 2

In [11]:
def sgd(params,lr,batch_size):
    """Apply one in-place minibatch SGD step to every parameter.

    Each parameter is updated as ``p <- p - lr * p.grad / batch_size``.

    Args:
        params: iterable of tensors whose ``.grad`` has been populated by a
            backward pass.
        lr: learning rate.
        batch_size: divisor applied to the gradient, for use with a loss that
            was summed (not averaged) over the batch.

    Parameters whose ``.grad`` is ``None`` are skipped instead of raising.
    """
    # no_grad keeps the update out of the autograd graph without going through
    # the legacy ``.data`` attribute.
    with torch.no_grad():
        for p in params:
            if p.grad is None:
                continue
            p.sub_(lr*p.grad/batch_size)

In [8]:
# Training hyperparameters shared by both training routines:
# minibatch size, number of passes over the training set, and learning rate.
batch_size,num_epochs,lr=1,100,0.003
# The model function `net` defined earlier is used as-is (the former `net=net`
# self-assignment was a no-op and has been dropped).
# `loss` is the name the training loops call; bind it to the squared loss.
loss=squared_loss

In [9]:
# Pair each training feature row with its label, then iterate over the pairs in
# shuffled minibatches of `batch_size` samples.
dataset=torch.utils.data.TensorDataset(train_features,train_labels)
train_iter=torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [28]:
def fit_and_plot(ld):
    """Train the from-scratch linear model with an explicit L2 penalty.

    Uses the module-level ``init_params``, ``net``, ``loss``, ``l2_penalty``,
    ``sgd``, ``train_iter``, ``num_epochs``, ``lr``, ``batch_size`` and the
    train/test tensors. After training, prints the L2 norm of the learned
    weight and of ``true_w``.

    Args:
        ld: coefficient multiplying ``l2_penalty(w)`` in the training loss;
            0 disables the penalty.

    Returns:
        None.
    """
    w,b=init_params()
    # NOTE(review): the per-epoch losses are collected but this function
    # neither plots nor returns them, despite its name.
    train_ls,test_ls=[],[]
    for _ in range(num_epochs):
        for x,y in train_iter:
            # The scalar penalty is broadcast onto every per-sample loss before
            # the batch is summed.
            l=(loss(net(x,w,b),y)+ld*l2_penalty(w)).sum()

            # Clear gradients left over from the previous step; each parameter
            # is checked on its own rather than relying on w.grad alone.
            for p in (w,b):
                if p.grad is not None:
                    p.grad.zero_()
            l.backward()
            sgd([w,b],lr,batch_size)
        # Evaluation only needs values, so skip building autograd graphs.
        with torch.no_grad():
            train_ls.append(loss(net(train_features,w,b),train_labels).mean().item())
            test_ls.append(loss(net(test_features,w,b),test_labels).mean().item())
    print('L2 norm of w',w.norm().item())
    print('true_w norm of w',((true_w**2).sum())**(1/2))

In [29]:
# Baseline run: ld=0 zeroes the L2 penalty term, so training uses only the squared loss.
fit_and_plot(ld=0)

L2 norm of w 12.246352195739746
true_w norm of w tensor(0.1414)


In [30]:
# L2 norm of true_w computed by hand: square root of the sum of squared entries.
((true_w**2).sum())**(1/2)

tensor(0.1414)

In [27]:
# Built-in equivalent of the manual L2-norm computation.
true_w.norm()

tensor(0.1414)

Weight decay using PyTorch's built-in `weight_decay` optimizer option

In [33]:
def fit_and_plot_pytorch(wd):
    """Train an ``nn.Linear`` model with SGD, applying weight decay to the weight only.

    Uses the module-level ``num_inputs``, ``lr``, ``num_epochs``, ``loss``,
    ``train_iter`` and the train/test tensors. After training, prints the L2
    norm of the learned weight and of ``true_w``.

    Args:
        wd: ``weight_decay`` value passed to SGD for the weight; the bias uses
            the optimizer default (no decay).

    Returns:
        None.
    """
    # This local `net` shadows the module-level net() function inside this scope.
    net=nn.Linear(num_inputs,1)
    nn.init.normal_(net.weight,mean=0,std=1)
    nn.init.normal_(net.bias,mean=0,std=1)
    # One optimizer with two parameter groups replaces two separate optimizers:
    # only the weight group carries weight decay.
    optimizer=torch.optim.SGD(
        [
            {'params':[net.weight],'weight_decay':wd},
            {'params':[net.bias]},
        ],
        lr=lr,
    )

    # NOTE(review): the per-epoch losses are collected but this function
    # neither plots nor returns them, despite its name.
    train_ls,test_ls=[],[]
    for _ in range(num_epochs):
        for x,y in train_iter:
            l=loss(net(x),y).mean()

            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        # Evaluation only needs values, so skip building autograd graphs.
        with torch.no_grad():
            train_ls.append(loss(net(train_features),train_labels).mean().item())
            test_ls.append(loss(net(test_features),test_labels).mean().item())
    print('l2 norm of w ',net.weight.detach().norm().item())
    print('l2 norm of true_w ',true_w.norm())

In [34]:
# Baseline with the built-in optimizer: wd=0 applies no weight decay.
fit_and_plot_pytorch(0)

l2 norm of w  12.520109176635742
l2 norm of true_w  tensor(0.1414)


In [35]:
# Same training run with weight_decay=3 applied to the weight (the bias is not decayed).
fit_and_plot_pytorch(3)

l2 norm of w  0.10526750236749649
l2 norm of true_w  tensor(0.1414)
