diff --git a/img/backprop.png b/img/backprop.png
new file mode 100644
index 0000000..ecff16a
Binary files /dev/null and b/img/backprop.png differ
diff --git a/img/loss.png b/img/loss.png
new file mode 100644
index 0000000..91c5a5b
Binary files /dev/null and b/img/loss.png differ
diff --git a/img/nonconvex.png b/img/nonconvex.png
new file mode 100644
index 0000000..0468f1e
Binary files /dev/null and b/img/nonconvex.png differ
diff --git a/img/sgd2d.png b/img/sgd2d.png
new file mode 100644
index 0000000..191149b
Binary files /dev/null and b/img/sgd2d.png differ
diff --git a/lab05-training.ipynb b/lab05-training.ipynb
index 2fd6442..cd55374 100644
--- a/lab05-training.ipynb
+++ b/lab05-training.ipynb
@@ -1,6 +1,529 @@
 {
- "cells": [],
- "metadata": {},
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "# Backpropagation\n",
+    "- Training a network (backpropagation) consists of:\n",
+    "  - Initializing the weights \"at random\".\n",
+    "  - Computing the network output (the forward pass).\n",
+    "  - Reducing the loss by updating the weights in the direction opposite to the gradient of the loss function.\n",
+    "  - Repeating the process until an optimized set of weights is found."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "![Backpropagation](img/backprop.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "# Gradient Descent\n",
+    "![Gradient descent](img/sgd2d.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "# Loss Function\n",
+    "![Loss function](img/loss.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "subslide"
+    }
+   },
+   "source": [
+    "# Non-Convex Optimization\n",
+    "![Non-convex optimization](img/nonconvex.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Training a network\n",
+    "- Define the network\n",
+    "- Initialize the network with random/pre-trained weights\n",
+    "- Choose a loss function\n",
+    "- Choose an optimizer\n",
+    "- Prepare the dataset\n",
+    "- Run the backpropagation algorithm\n",
+    "- Evaluate the output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import mxnet as mx\n",
+    "from mxnet import gluon, nd, autograd\n",
+    "import numpy as np\n",
+    "ctx = mx.gpu()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "## Define the network"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "net = gluon.nn.Sequential()\n",
+    "\n",
+    "with net.name_scope():  # gives child blocks and parameters hierarchical names\n",
+    "    net.add(gluon.nn.Dense(units=128, activation='relu'))\n",
+    "    net.add(gluon.nn.Dense(units=64, activation='relu'))\n",
+    "    net.add(gluon.nn.Dense(units=10))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "## Initialize the network"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "net.initialize(mx.init.Xavier(magnitude=2.24), force_reinit=True, ctx=ctx)"
+   ]
+  },
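+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "source": [
+    "Gluon defers the actual parameter allocation until data flows through the network, so the weight shapes are only fixed after a first forward pass. A minimal sanity-check sketch (the random batch below is a placeholder, not lab data):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "slideshow": {
+     "slide_type": "fragment"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Dense layers flatten trailing dims, so a (1, 28, 28, 1) batch becomes (1, 784)\n",
+    "dummy = nd.random.uniform(shape=(1, 28, 28, 1), ctx=ctx)\n",
+    "print(net(dummy).shape)  # (1, 10): one score per digit class\n",
+    "for name, param in net.collect_params().items():\n",
+    "    print(name, param.shape)"
+   ]
+  },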
"slide" + } + }, + "source": [ + "## Choose a loss function" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Choose an Optimizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "trainer = gluon.Trainer(params=net.collect_params(), \n", + " optimizer='sgd', \n", + " optimizer_params={\"learning_rate\":0.01, \"momentum\": .9, \"wd\":.1})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Prepare dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((28, 28, 1), 5.0)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch_size = 128\n", + "def transform(data, label):\n", + " return (data.astype(np.float32)/255, label.astype(np.float32))\n", + "\n", + "train_dataset = gluon.data.vision.MNIST(train=True, transform=transform)\n", + "val_dataset = gluon.data.vision.MNIST(train=False, transform=transform)\n", + "\n", + "train_data_loader = gluon.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", + "val_data_loader = gluon.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)\n", + "\n", + "(train_dataset[0][0].shape, train_dataset[0][1])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "text_labels = [\n", + " 'zero', 'one', 'two', 'three', 'four',\n", + " 'five', 'six', 'seven', 'eight', 'nine'\n", + "]\n", + "X, y = train_dataset[0:6]\n", + "_, figs = plt.subplots(1, X.shape[0], figsize=(15, 15))\n", + "for f,x,yi in zip(figs, X,y):\n", + " # 3D->2D by removing the last channel dim\n", + " f.imshow(x.reshape((28,28)).asnumpy())\n", + " ax = f.axes\n", + " ax.set_title(text_labels[int(yi)])\n", + " ax.title.set_fontsize(20)\n", + " ax.get_xaxis().set_visible(False)\n", + " ax.get_yaxis().set_visible(False)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(6, 28, 28, 1)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((28, 28, 1), 5.0)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(train_dataset[0][0].shape, train_dataset[0][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(128, 28, 28, 1) (128,)\n" + ] + } + ], + "source": [ + "for data, label in train_data_loader:\n", + " print(data.shape, label.shape)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Run back propogation algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EPOC: [0]: ; train loss: 1.0181736946105957\n" + ] + } + ], + "source": [ + "epochs = 1\n", + "for e in range(epochs):\n", + " for i, (data, label) in enumerate(train_data_loader):\n", + " with autograd.record():\n", + " data = data.as_in_context(ctx)\n", + " label = label.as_in_context(ctx)\n", + " outputs = net(data)\n", + " loss = loss_fn(outputs, label)\n", + " loss.backward()\n", + " trainer.step(batch_size)\n", + " print(\"EPOC: [{}]: ; train loss: {}\".format(e, loss.mean().asscalar()))\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Evaluate the Output\n", + "- In order to evaluate the putput we need to compare performance of the algorithm on training and evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "slideshow": { + "slide_type": "subslide" + } + }, + "outputs": [], + "source": [ + "def evaluate_results(data_loader, network):\n", + " acc = mx.metric.Accuracy()\n", + " acc.reset()\n", + " for i, (data, label) in enumerate(data_loader):\n", + " data = data.as_in_context(ctx)\n", + " label = label.as_in_context(ctx)\n", + " outputs = network(data)\n", + " predictions = nd.argmax(outputs, axis=1)\n", + " acc.update(preds=predictions, labels=label)\n", + " return acc\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "EPOC: [0]: ; train loss: 0.9534021019935608; train_acc: 0.7991333333333334; val_acc: 0.8055\n", + "EPOC: [1]: ; train loss: 0.9825796484947205; train_acc: 0.80535; val_acc: 0.8126\n", + "EPOC: [2]: ; train loss: 0.9855998158454895; train_acc: 0.8008833333333333; val_acc: 0.8102\n", + "EPOC: [3]: ; train loss: 1.0217260122299194; train_acc: 0.8086; val_acc: 0.8133\n", + "EPOC: [4]: ; train loss: 0.9908719062805176; train_acc: 0.8077; val_acc: 0.8151\n" + ] + } + ], + "source": [ + "epochs = 5\n", + "for e in range(epochs):\n", + " for i, (data, label) in enumerate(train_data_loader):\n", + " with autograd.record():\n", + " data = data.as_in_context(ctx)\n", + " label = label.as_in_context(ctx)\n", + " outputs = net(data)\n", + " loss = loss_fn(outputs, label)\n", + " loss.backward()\n", + " trainer.step(batch_size)\n", + " train_acc = evaluate_results(train_data_loader, net).get()[1]\n", + " val_acc = evaluate_results(val_data_loader, net).get()[1]\n", + " print(\"EPOCH: [{}]: ; train loss: {}; train_acc: {}; val_acc: {}\".format(e, loss.mean().asscalar(), train_acc, val_acc))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Challenge\n", + "- Try to rewrite the training code, using c conolutional network form the previous lab\n", + "- beware that `gluon.nn.Conv2d()` supports NCHW’ and ‘NHWC’ layout for now. ‘N’, ‘C’, ‘H’, ‘W’ stands for batch, channel, height, and width dimensions respectively. Convolution is applied on the ‘H’ and ‘W’ dimensions. \n", + "- We need to use `nd.transpose()` in order to change the layout" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(('NEW: ', (1, 28, 28), 5.0), ('OLD: ', (28, 28, 1), 5.0))" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def transform(data, label):\n", + " return nd.transpose(data.astype(np.float32), (2,0,1))/255, label.astype(np.float32)\n", + "\n", + "train_dataset_conv = gluon.data.vision.MNIST(train=True, transform=transform)\n", + "((\"NEW: \", train_dataset_conv[0][0].shape, train_dataset_conv[0][1]),\n", + "(\"OLD: \", train_dataset[0][0].shape, train_dataset[0][1]))" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "conda_mxnet_p36", + "language": "python", + "name": "conda_mxnet_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, "nbformat": 4, "nbformat_minor": 2 }