[![open in colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DSSHN_lRd0A_tPBwYBi6zlOd_9N1DBJ3#scrollTo=dpz7yKFTYXPZ)

## HW Requirement

• Implement the code for the 2-layer neural networks in CS231n 
2021 version with PyTorch (or TensorFlow). 

• Once you have the code (regardless of which framework you 
choose above), you will apply your own data.  The training and test 
dataset is 80%:20%.

• You need to run the code with the following hyperparameter 
settings:

✓ Activation function: tanh, ReLU

✓ Data preprocessing

✓ Initial weights: small random number, Xavier or Kaiming/MSRA 
Initialization

✓ Loss function: without or with the regularization term 
(L2), λ = 
0.001 or 0.0001
$$ E(w) = \frac{1}{N}\sum^{N}_{c=1}[𝑓(X^c, w) −y^c]^2 
 + \lambda[\sum^{p}_{i=0}(w^{o}_{i})^2
 + \sum_{i=1}^{p}\sum_{j=0}^{m}(w_{ij}^H)^2]
$$
✓ Optimizer: gradient descent, Momentum, Adam

✓ Learning epochs: 100, 200, 300

✓ Amount of hidden nodes: 5, 8, 11

✓ Learning rate decay schedule: none and cosine

✓ Ensembles: top 3 models

## Model

In [1]:
import torch
from torch import nn, optim, Generator
from torch.utils.data import DataLoader, Dataset, random_split
import sys
import pandas as pd
import cProfile
import pstats

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from typing import Iterable, Callable, Type
from operator import mul

def product(nums: Iterable[Type], func: Callable[[Type, Type], Type] = mul):
    def _product(nums):
        nonlocal func
        if len(nums) == 1:
            return nums[0]
        return func(nums[-1], _product(nums[:-1]))
    try:
        return _product(nums)
    except Exception as e:
        raise e

In [2]:
ACTIVES = {
    "relu": nn.ReLU,
    "tanh": nn.Tanh
}
INIT_FUNCS = {
    "small_random": lambda x: nn.init.normal_(tensor=x, mean=0, std=0.01),
    "xavier": lambda x: nn.init.xavier_uniform_(tensor=x) if len(x.shape) > 1 else None,
    "kaiming": lambda x: nn.init.kaiming_uniform_(tensor=x, nonlinearity='relu') if len(x.shape) > 1 else None
}
OPTIM_FUNCS = {
    "sgd": optim.SGD,
    "momentum": lambda param, lr, weight_decay: optim.SGD(params=param, lr=lr, momentum=0.9, weight_decay=weight_decay),
    "adam": optim.Adam
}
SCHEDULERS = {
    "None": None,
    "cos": lambda opt: torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=opt, T_max=200)
}


In [4]:
from collections.abc import Callable
class TwoLayerNetwork(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, num_classes: int, init_method:Callable, active_func:nn.modules.module.Module) -> None:
        super(TwoLayerNetwork, self).__init__()
        self.input_size = input_size
        self.hidden_size= hidden_size
        ## first layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        ## activation
        self.active_func = active_func()
        ## initialize
        for param in self.parameters():
            init_method(param)
        ## second layer
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.active_func(out)
        out = self.fc2(out)
        return out


In [5]:
def train(model: TwoLayerNetwork, opt: nn.Module, device: str, epochs: int, learning_rate: float, trainloader: DataLoader, valloader: DataLoader, criterion: nn.modules.loss._Loss, sched: optim.lr_scheduler._LRScheduler, weight_decay:float):
    if epochs < 1:
        raise ValueError("Invalid epoch!!")
    else:
        epochs = int(epochs)
    model.to(device)
    optimizer = opt(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = sched(optimizer) if sched else None
    history = []
    # Train the model
    for epoch in range(epochs):
        train_loss = 0.0
        train_correct = 0
        model.train()
        for X, y in trainloader:
            X = X.view(-1, model.input_size).to(device)
            y = y.to(device)
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X.size(0)
            _, predicted = torch.max(outputs.data, 1)
            train_correct += (predicted == y).sum().item()
        train_loss /= len(trainloader.dataset)
        train_accuracy = train_correct / len(trainloader.dataset)

        # Validate the model
        val_loss = 0.0
        val_correct = 0
        model.eval()
        with torch.no_grad():
            for X, y in valloader:
                X = X.view(-1, model.input_size).to(device)
                y = y.to(device)
                outputs = model(X)
                loss = criterion(outputs, y)
                val_loss += loss.item() * X.size(0)
                _, predicted = torch.max(outputs.data, 1)
                val_correct += (predicted == y).sum().item()
            val_loss /= len(valloader.dataset)
            val_accuracy = val_correct / len(valloader.dataset)
        if scheduler:
            scheduler.step()
        # Print epoch statistics
        history.append((train_loss, train_accuracy, val_loss, val_accuracy))
        # sys.stdout.write('Epoch [{}/{}], Train Loss: {:.4f}, Train Accuracy: {:.2f}%, Val Loss: {:.4f}, Val Accuracy: {:.2f}%\n'
        #       .format(epoch+1, epochs, train_loss, train_accuracy, val_loss, val_accuracy))
    return history


In [15]:
def test(model:nn.Module, device:str, testloader:DataLoader):
    val_correct = 0
    model.to(device)
    model.eval()
    with torch.no_grad():
        for X, y in testloader:
            X = X.view(-1, model.input_size).to(device)
            y = y.to(device)
            outputs = model(X)
            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == y).sum().item()
        val_accuracy = val_correct / len(testloader.dataset)
    return val_accuracy

# Dataset

### pytorch dataset

In [8]:
# load pytorch dataset

from torchvision import datasets, transforms

def getPytorchData(train: float = 0.8, remain: float = 0.1):
    """
    Args:
        train: train_amount / total_amount or 1 - valid_amount / total_amount
        remain: reduce data amount to save time
    """
    # preprocess: flatten, normalize, drop 90%, split
    transform = transforms.transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    if 0 >= train or train >= 1:
        raise ValueError()
    if 0 > remain or remain > 1:
        raise ValueError()
    # Split the training set into training and validation sets
    trainset = datasets.FashionMNIST(
        root="./data/", train=True, download=False, transform=transform)
    train_count = int(train * remain * len(trainset))
    valid_count = int((1-train) * remain * len(trainset))
    if train_count * valid_count == 0:
        raise ValueError()
    datum_size = product(trainset[0][0].size())
    class_amount = len(trainset.classes)
    testset = datasets.FashionMNIST(
        root="./data/", train=False, download=False, transform=transform)
    print(train_count, valid_count, len(testset))
    trainset, valset, _ = random_split(
        trainset, (train_count, valid_count, len(trainset)-train_count-valid_count), Generator().manual_seed(42))
    # Create dataloaders to load the data in batches
    trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
    valloader = DataLoader(valset, batch_size=32, shuffle=True)
    testloader = DataLoader(testset, batch_size=32, shuffle=True)
    return trainloader, valloader, testloader, datum_size, class_amount


### customized pytorch dataset

In [13]:
import pandas as pd
import numpy as np
class HotelReservationDataset(Dataset):
    """Hotel Reservation dataset."""

    def __init__(self, csv_path):
        """
        Args:
            csv_path (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        # 19
        reservations = pd.read_csv(csv_path)
        # 5
        for col in map(lambda x: x[0], filter(lambda x:x[1]=="O", reservations.dtypes.items())):
            d = dict((j, i) for i, j in enumerate(reservations[col].value_counts().index))
            setattr(self, f"labels_of_{col}", d.keys())
            reservations[col]=reservations[col].map(d.__getitem__)
        # 17(drop id)
        self.feature = torch.from_numpy(reservations.iloc[:, 1:-1].to_numpy(dtype=np.float32))
        # two status
        self.booking_status = torch.reshape(torch.tensor(reservations.iloc[:, -1:].to_numpy()), shape=(len(self.feature),))
        self.classes = list(getattr(self, f"labels_of_{reservations.columns[-1]}"))
    def __len__(self):
        return len(self.booking_status)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.feature[idx], self.booking_status[idx]

# kaggle: ahsan81/hotel-reservations-classification-dataset
def getCustomizedData():
    # preprocess
    dataset = HotelReservationDataset(
        csv_path=r"D:\dataset\archive\Hotel Reservations.csv")
    class_amount = len(dataset.classes)
    # train test split
    train_count = int(0.7 * len(dataset))
    valid_count = int(0.2 * len(dataset))
    test_count = len(dataset) - train_count - valid_count
    print(train_count, valid_count, test_count)
    trainset, valset, testset = random_split(
        dataset, (train_count, valid_count, test_count), Generator().manual_seed(42))
    datum_size = product(trainset[0][0].size())
    # set loaders
    trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
    valloader = DataLoader(valset, batch_size=32, shuffle=True)
    testloader = DataLoader(testset, batch_size=32, shuffle=True)
    return trainloader, valloader, testloader, datum_size, class_amount


### kaggle dataset

In [3]:
# download data(zipped csv) from kaggle with username and apikey
import os
import json
with open("kaggle.json", "r") as j:
    for (k, v) in json.load(j).items():
        os.environ[k] = v
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
# https://www.kaggle.com/datasets/uciml/iris/download?datasetVersionNumber=2
# owner/datasetname
api.dataset_download_files('uciml/iris', path="./data/")


## Training

In [8]:
def training_schedule():
    counter = 0
    # processor
    device = "cuda" if torch.cuda.is_available(
    ) else "mps" if torch.backends.mps.is_available() else "cpu"
    # hyper parameters
    trainloader, valloader, testloader, input_size, output_size = getPytorchData()
    learning_rate = 0.001
    criterion = nn.CrossEntropyLoss()
    # ✓ Amount of hidden nodes: 5, 8, 11
    for hidden_size in (5, 8, 11):
        # ✓ Learning epochs: 100, 200, 300
        for epochs in (100, 200, 300):
            # Create model, optimizer, scheduler
            for (init, method) in INIT_FUNCS.items():
                for (active, func) in ACTIVES.items():
                    # ✓ Activation function: tanh, ReLU
                    # ✓ Initial weights: small random number, Xavier or Kaiming/MSRA Initialization
                    model = TwoLayerNetwork(input_size, hidden_size, output_size,
                                            init_method=method, active_func=func).to(device)
                    # ✓ Optimizer: gradient descent, Momentum, Adam
                    for (optimize, optm) in OPTIM_FUNCS.items():
                        # ✓ Learning rate decay schedule: none and cosine
                        for (schedule, schd) in SCHEDULERS.items():
                            # ✓ Loss function: without or with L2, λ = 0.001 or 0.0001
                            for weight_decay in (0.0, 0.001, 0.0001):
                                with open(f"./data/{counter}.txt", "w") as f:
                                    sys.stdout = f
                                    f.write(f"{counter}: {hidden_size}, {epochs}, {init}, {active}, {optimize}, {schedule}, {weight_decay}\n")
                                    counter+=1
                                    test(model=model, device=device, testloader=testloader)
                                    break
                                    train(model=model, opt=optm, device=device, epochs=epochs, learning_rate=learning_rate,
                                        trainloader=trainloader, valloader=valloader, criterion=criterion, sched=schd, weight_decay=weight_decay)
                                    test(model=model, device=device, testloader=testloader)
                            break
                        break
                    break
                break
            break
        break


In [27]:
# schedule
def createSchedule(path: str):
    hidden_size = (5, 8, 11)
    epochs = (100, 200, 300)
    init = tuple(INIT_FUNCS.keys())
    active = tuple(ACTIVES.keys())
    optimize = tuple(OPTIM_FUNCS.keys())
    schd = tuple(SCHEDULERS.keys())
    weight_decay = (0.0, 0.001, 0.0001)
    total_count = (len(hidden_size) + len(epochs) + len(init) +
                   len(active) + len(optimize) + len(schd) + len(weight_decay))
    column_names = ["hidden_size", "epochs", "schd",
                    "weight_decay", "init", "active", "optimize"]
    column = [hidden_size, epochs, schd, weight_decay, init, active, optimize]
    pd.DataFrame(dict(zip(column_names,
                          map(lambda c: c + (c[0],)*(total_count-len(c)), column)))
                 ).to_csv(path_or_buf=path, index=False)


In [28]:
createSchedule(r"./data/schedule.csv")

In [8]:
def _training_schedule():
    def _training(counter, hidden_size, epochs, weight_decay, init, active, optimize, schd):
        nonlocal args
        (trainloader, valloader, testloader, input_size,
         output_size, learning_rate, criterion, device) = args
        model = TwoLayerNetwork(input_size, hidden_size,
                                output_size, INIT_FUNCS[init], ACTIVES[active])
        with open(f"./data/{counter}.txt", "w") as f:
            sys.stdout = f
            sys.stdout.write(
                f"{counter}: {hidden_size}, {epochs}, {init}, {active}, {optimize}, {schd}, {weight_decay}\n")
            test(model, device, testloader)
            train(model, OPTIM_FUNCS[optimize], device, epochs, learning_rate,
                  trainloader, valloader, criterion, SCHEDULERS[schd], weight_decay)
            test(model, device, testloader)

    # processor
    device = "cuda" if torch.cuda.is_available(
    ) else "mps" if torch.backends.mps.is_available() else "cpu"
    # hyper parameters
    trainloader, valloader, testloader, input_size, output_size = getPytorchData()
    learning_rate = 0.001
    criterion = nn.CrossEntropyLoss()
    args = (trainloader, valloader, testloader, input_size,
            output_size, learning_rate, criterion, device)
    df = pd.read_csv(r"./data/schedule.csv")
    df["counter"] = df.index
    return df, _training


In [9]:
df, _training = _training_schedule()
for index, row in df.iterrows():
    # cProfile.run("_training(**row)", "result.out")
    _training(**row)
    break

4800 1199 10000


In [None]:
p = pstats.Stats("result.out")

In [None]:
print(p.strip_dirs().sort_stats("tottime").print_stats())

### train

In [9]:
device = "cuda" if torch.cuda.is_available(
) else "mps" if torch.backends.mps.is_available() else "cpu"
# hyper parameters
trainloader, valloader, testloader, input_size, output_size = getPytorchData()
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()

4800 1199 10000


In [17]:
def _training(row):
    model = TwoLayerNetwork(input_size, row["hidden_size"],
                            output_size, INIT_FUNCS[row["init"]], ACTIVES[row["active"]])
    baseline = test(model, device, testloader)
    history = train(model, OPTIM_FUNCS[row["optimize"]], device, row["epochs"], learning_rate,
          trainloader, valloader, criterion, SCHEDULERS[row["schd"]], row["weight_decay"])
    result = test(model, device, testloader)
    return baseline, history, result

df = pd.read_csv(r"./data/schedule.csv")
df["counter"] = df.index
for index, row in df.iterrows():
    cProfile.run("baseline, history, result = _training(row)", "result.out")
    print(baseline, history, result)
    break


10.04 [(2.2543232107162474, 15.041666666666666, 2.2057615469852223, 18.34862385321101), (2.179066778818766, 18.395833333333332, 2.152984736659708, 16.09674728940784), (2.1329888979593914, 18.6875, 2.1124607562223408, 20.266889074228523), (2.0931432008743287, 21.75, 2.0746849438267216, 22.68557130942452), (2.0544890411694845, 23.895833333333332, 2.0372910644731688, 25.020850708924105), (2.015653196970622, 25.916666666666668, 1.9996665553116022, 25.938281901584652), (1.9763879760106404, 26.604166666666668, 1.9614586758553931, 28.10675562969141), (1.9368771759668986, 28.416666666666668, 1.9232837591099678, 29.608006672226857), (1.897013602256775, 29.333333333333332, 1.8852086180542984, 30.275229357798164), (1.8569814610481261, 29.729166666666668, 1.846724114585062, 32.11009174311926), (1.8174788411458334, 31.375, 1.8080830415951599, 34.528773978315265), (1.7780001473426819, 34.354166666666664, 1.7699393387930507, 36.44703919933278), (1.7397701334953308, 36.291666666666664, 1.7330190956443

# ranking

In [None]:
import re
import pandas as pd
results = []
for i in range(1, 19):
    with open(f"./data/{i}.txt", "r") as f:
        l = list(f.readlines())
        accuracy = float(l[len(l) - 1][14:-2])
        hidden_size, epochs, init_func, active_func, optimizer, lr_scheduler, weight_decay \
            = l[0][:-1].split(", ")
        hidden_size = int(hidden_size)
        epochs = int(epochs)
        weight_decay = float(weight_decay)
        results.append([hidden_size, epochs, init_func, active_func, optimizer, lr_scheduler, weight_decay, accuracy])
        continue
        history = pd.DataFrame(
            map(lambda r: r[:-1].split(", "), l[1:len(l)-1]))
        history.columns = ["epoch", "train_loss",
                           "train_acc", "val_loss", "val_acc"]
        history.epoch = history.epoch.map(
            lambda epoch: int(re.search(r"\[(.)*?/", epoch)[0][1:-1]))
        history.train_loss = history.train_loss.map(lambda value: float(
            re.search(r":\s(.)*", value)[0][2:]))
        history.val_loss = history.val_loss.map(
            lambda value: float(re.search(r":\s(.)*", value)[0][2:]))
        history.train_acc = history.train_acc.map(lambda value: float(
            re.search(r":\s(.)*", value)[0][2:-1])*0.01)
        history.val_acc = history.val_acc.map(lambda value: float(
            re.search(r":\s(.)*", value)[0][2:-1])*0.01)


In [None]:

results.sort(key=lambda x:x[-1])

In [None]:
results[-3:]

[[11, 200, 'small_random', 'relu', 'sgd', 'None', 0.0, 81.89],
 [11, 300, 'small_random', 'relu', 'sgd', 'None', 0.0, 82.41],
 [11, 300, 'small_random', 'relu', 'sgd', 'None', 0.0, 82.45]]