In [184]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch import nn
from sklearn.model_selection import KFold

In [None]:
# a linear model using the torch nn module.
class RidgeRegressionModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(RidgeRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size, bias=False)
    
    def forward(self, x):
        return self.linear(x)

# wrapper for our data.
class DataSet(Dataset):
    def __init__(self, csv_file):
        data = pd.read_csv("train.csv")
        self.X = torch.as_tensor(data.iloc[:,1:].values).float()
        self.y = torch.as_tensor(data.iloc[:,0].values).float()

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]

def pred_loss_fn(outputs, labels, alpha, weights):
    return torch.sum(torch.square(outputs-labels)) + alpha*torch.sum(torch.square(weights))

def train(model, data_loader, optimizer, num_epochs, alpha):
    for epoch in range(num_epochs):
        for X,y in data_loader:
            model_outputs = model(X)
            model_loss = pred_loss_fn(model_outputs, y, alpha, model.linear.weight)
            optimizer.zero_grad()
            model_loss.backward()
            optimizer.step()
        
        X, y = next(iter(data_loader))

def test(model, data_loader, gen_loss_fn):
    loss = 0
    with torch.no_grad():
        for X,y in data_loader:
            output = model(X).flatten()
            loss+=gen_loss_fn(output,y).item()
    return loss
    
def kfcvalidation(model, data, batch_size=5, alpha=0.1, learning_rate=0.0000000001, num_epochs=100):
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 
    kfold = KFold(n_splits=10, shuffle=True)
    gen_loss_fn = nn.MSELoss()
    overall_loss = 0

    for fold, (train_indices, test_indices) in enumerate(kfold.split(data)):
        train_samples = SubsetRandomSampler(train_indices)
        test_samples = SubsetRandomSampler(test_indices)

        # the DataLoader allows us to iterate over batch_size pairs of X and y.
        train_data_loader = DataLoader(data, batch_size=batch_size,sampler=train_samples)
        test_data_loader = DataLoader(data, batch_size=len(test_indices),sampler=SubsetRandomSampler)

        print(f'Fold: {fold}, commencing training')
        print(f'-----------------------------------')

        train(model, train_data_loader, optimizer, num_epochs, alpha)
        
        fold_loss = test(model, train_data_loader, gen_loss_fn)
        print(f'Loss for this fold is: {fold_loss}')
        print(f'-----------------------------------')
        overall_loss+=fold_loss
    
    return overall_loss/10



In [187]:
data = DataSet("train.csv")
model = RidgeRegressionModel(13,1)

kfcvalidation(model=model, data=data)





Fold: 0, commencing training
-----------------------------------
Loss for this fold is: 3003.4609565734863
-----------------------------------
Fold: 1, commencing training
-----------------------------------
Loss for this fold is: 2466.7358798980713
-----------------------------------
Fold: 2, commencing training
-----------------------------------


KeyboardInterrupt: 