In [192]:
import numpy as np
import pandas as pd

## Loading MNIST Dataset

In [193]:
mnist=pd.read_csv('train_mnist.csv')

In [194]:
mnist.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Getting Target Values

In [195]:
y=mnist.label
y.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

## Target values into Categorical Feature

In [196]:
y_d=pd.get_dummies(y,prefix='Num_')

In [197]:
y_d.head()

Unnamed: 0,Num__0,Num__1,Num__2,Num__3,Num__4,Num__5,Num__6,Num__7,Num__8,Num__9
0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0


In [198]:
y_d=np.array(y_d)
y_d.shape

(42000, 10)

## Dropping Target values from input

In [199]:
X=mnist.drop('label',axis=1)

## Reshaping Input to apply convolution

In [200]:
X=np.array(X)
X=X.reshape(42000,28,28,1)
X.shape

(42000, 28, 28, 1)

## Function for Padding

In [201]:
def zero_pad(X, pad):
    X_pad = np.pad(X, ((0, 0), (pad, pad), (pad, pad), (0, 0)), 'constant', constant_values=0)
    return X_pad

## Single Convolution Step

In [202]:
def conv_single_step(a_slice_prev, W, b):
    s = np.multiply(a_slice_prev, W) + b
    Z = np.sum(s)
    return Z

## Function for Convolution

In [203]:
def conv_forward(A_prev, W, b, hparameters):
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters['stride']
    pad = hparameters['pad']
    n_H = int((n_H_prev - f + 2 * pad) / stride) + 1
    n_W = int((n_W_prev - f + 2 * pad) / stride) + 1
    Z = np.zeros((m, n_H, n_W, n_C))
    A_prev_pad = zero_pad(A_prev, pad)
    for i in range(m):                                 # loop over the batch of training examples
        a_prev_pad = A_prev_pad[i]                     # Select ith training example's padded activation
        for h in range(n_H):                           # loop over vertical axis of the output volume
            for w in range(n_W):                       # loop over horizontal axis of the output volume
                for c in range(n_C):
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_start + f
                    a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                    Z[i, h, w, c] = conv_single_step(a_slice_prev, W[...,c], b[...,c])
    assert(Z.shape == (m, n_H, n_W, n_C))
    cache = (A_prev, W, b, hparameters)
    return Z, cache 

## Defining ReLU Function

In [204]:
def ReLU(x):
    return np.maximum(0,x)

## Backprop in convolution

In [205]:
def conv_backward(dZ, cache):    
    (A_prev, W, b, hparameters) = cache
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    (m, n_H, n_W, n_C) = dZ.shape    
    # Initialize dA_prev, dW, db with the correct shapes
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))                           
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # Pad A_prev and dA_prev
    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = zero_pad(dA_prev, pad)
    
    for i in range(m):                      
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]
        
        for h in range(n_H):                   # loop over vertical axis of the output volume
            for w in range(n_W):               # loop over horizontal axis of the output volume
                for c in range(n_C):           # loop over the channels of the output volume
                    
                    # Find the corners of the current "slice"
                    vert_start = h
                    vert_end = vert_start + f
                    horiz_start = w
                    horiz_end = horiz_start + f
                    
                    # Use the corners to define the slice from a_prev_pad
                    a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                    # Update gradients for the window and the filter's parameters using the code formulas given above
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
                    dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
                    db[:,:,:,c] += dZ[i, h, w, c]
    assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))    
    return dA_prev, dW, db

## Defining softmax function

In [206]:
def softmax(x):
    s=np.exp(x)/np.sum(np.exp(x))
    return s

###  This function convert our input matrices to a matrices with 10 features (by multiplying with weight matrices and adding bias) so that we can apply softmax regression to it.

In [207]:
def ultimate_function(a,w1,b1,gamma1,beta1):
    
    Z2=np.dot(a,w1)+b1
    ep=0.00000001
    Z2_mean=np.mean(Z2)
    Z2_std=np.std(Z2)
    Z2_norm=(Z2-np.mean(Z2))/(np.sqrt((Z2_std**2)+ep))   
    Z2_tilde=gamma1*Z2_norm+beta1
    a2=np.zeros((500,10))
    for i in range(500):
        a2[i]=softmax(Z2_tilde[i])
    return a2,Z2_tilde,Z_norm

## Defining Cross Entropy Cost Function

In [208]:
def Cost(y,a):
    J=np.zeros((500,10))    #
    for i in range(500):
        J= - np.multiply(y[i], np.log(a[i])).sum()
        return J

## Differentiating softmax for backpropagation and getting result

In [209]:
def softmax_backprop(a):
    b=1-a
    return np.multiply(a+0.00000001,b)

## Differentiating Cost Function for backpropagation and getting result

In [210]:
def cost_back(y_d,a):
    return np.multiply(y_d,1/(a+0.00000001))

## Finding dL/dW

In [211]:
def wz_back(X,a):
    dzw=np.dot(X,a)
    return dzw

## Applying Batch_Norm (Forward Propagation and Backward Propagation)
### In this we initialize our weight, bias and hperparameters. Then we divide our input into mini batches of 500 images and apply convolution. After convolution batch normalization is done. Then we apply ReLU activation Function. Then we calculate our prediction values using softmax  regression. Cost Function is calculated with Cross Entropy. Then we apply backward Propagation.

In [212]:
W = np.random.randn(2, 2, 1, 1)                  # Initializing weight for covolution Layer 
b = np.random.randn(1, 1, 1, 1)                  # Initializing bias for convolution Layer
hparameters = {"pad" : 0,                        # Initializing hyperparameters
               "stride": 2}
gamma1=np.random.rand(1)                        # Intializing BETA and GAMMA for batch-norm
beta1=np.random.rand(1)
w1=np.random.rand(196,10)                       # Initializing weight for 2nd Layer 
b1=np.random.rand(1,10)                         # Initializing bias for 2nd Layer
alpha=0.05                                  # Learning Rate
for i in range(0,42000,500):
   
    Z, cache_conv = conv_forward(X[i:i+500,:,:,:], W, b, hparameters)  # Forward Convolution Layer
    print("Z's mean =", np.mean(Z))
    Z_mean=np.mean(Z)
    Z_std=np.std(Z)
    ep=0.00000001
    Z_norm=(Z-Z_mean)/(np.sqrt(Z_std**2+ep))          # Normalization of Z
    
    Z_tilde=gamma*Z_norm+beta                         # Changing mean and variance of Z
    a=ReLU(Z_tilde)                                   # Activation Function
    ac=a.reshape(500,196)                             # Reshaping a copy of Input Activation
    
    a2,Z2_tilde,Z2_norm=ultimate_function(ac,w1,b1,gamma1,beta1)  # Finding the softmax output
    Z2_norm=Z2_norm.reshape(500,196)
    print(a2)                                             # Printing the probabilities of each number
    
    cost_func=Cost(y_d[i:i+500,:],a2)                     # Finding cost function using cross entropy
    
    print(cost_func)  
    
    g=softmax_backprop(a2)                              # Softmax Backpropagation differentiation output matrix
    
    h=cost_back(y_d[i:i+500],a2)                        # Cost Function differentiation output matrix 
    
    gh=np.multiply(g,h)                                    
    
    dbeta1=np.sum(gh)                                  # Finding dBeta
    
    dw1=wz_back(ac.T,gh)                               # Finding dW of 2nd Layer
    
    w1=w1-alpha*dw1/500                                # Changing W
    
    dgamma1=np.sum(np.dot(gh.T,Z2_norm))                # Finding dGamma
    
    gamma1=gamma1-alpha*dgamma1/500                    # Changing Gamma
    
    beta1=beta1-alpha*dbeta1/500                       # Changing Beta
    
    dA, dW, db = conv_backward(a, cache_conv)           # Backprop for Convolution Layer
    
    W=W-alpha*dW/500                                    # Changing W for Convolution Layer
    
    b=b-alpha*db/500                                    # Changing b for Convolution Layer
    
    print("dW_mean =", np.mean(dW))
    print("dW1_mean =", np.mean(dw1))
    print("dbeta1_mean =", np.mean(dbeta1))
    print("dgamma1_mean =", np.mean(dgamma1))

Z's mean = -37.49711399940434
[[0.09377935 0.04835103 0.07110973 ... 0.11146721 0.06897036 0.13006882]
 [0.12404578 0.05848711 0.07430491 ... 0.09628747 0.10206963 0.11120751]
 [0.10794053 0.04431453 0.07524467 ... 0.08797547 0.06990366 0.1136533 ]
 ...
 [0.10669052 0.04076586 0.06879678 ... 0.08272253 0.07339761 0.1197935 ]
 [0.10411888 0.05496187 0.08233186 ... 0.10277271 0.0660283  0.10902087]
 [0.10925275 0.06679248 0.07898761 ... 0.10516832 0.07911077 0.12623971]]
3.029267745038895
dW_mean = 2856479.3594692918
dW1_mean = 44.290844288890085
dbeta1_mean = 451.2585284661873
dgamma1_mean = 80.14243073603222
Z's mean = -39936.990763249996
[[0.09827143 0.04507356 0.0785147  ... 0.10391494 0.0774785  0.10858761]
 [0.09805464 0.04371744 0.07528576 ... 0.09709838 0.07296455 0.12002611]
 [0.0992563  0.05302838 0.07288257 ... 0.0840867  0.09067856 0.11128652]
 ...
 [0.10981297 0.04024991 0.08633863 ... 0.0839184  0.08210762 0.12584271]
 [0.08794843 0.04366913 0.06979173 ... 0.08449755 0.0766

dW_mean = 2824317.792898051
dW1_mean = 44.13862052870389
dbeta1_mean = 452.16113085627245
dgamma1_mean = 111.96888405550686
Z's mean = -545528.9920585514
[[0.10461636 0.04601695 0.07896222 ... 0.08938373 0.08468201 0.10549899]
 [0.12048959 0.04652256 0.08494799 ... 0.07725287 0.08328859 0.12238335]
 [0.10799536 0.05597078 0.08159171 ... 0.10672907 0.07755057 0.11171529]
 ...
 [0.11462638 0.04419682 0.07565982 ... 0.0945049  0.07475902 0.12797578]
 [0.11862892 0.04139103 0.06647807 ... 0.08433355 0.08212654 0.11514051]
 [0.0882596  0.05363427 0.07962391 ... 0.08545342 0.09763942 0.11248802]]
2.104558019515307
dW_mean = 2931082.2085515866
dW1_mean = 44.102329974357836
dbeta1_mean = 451.88286269178934
dgamma1_mean = 97.25223790761936
Z's mean = -579580.028158509
[[0.11702011 0.03658405 0.06736607 ... 0.088309   0.07827805 0.11646752]
 [0.1008653  0.0505947  0.07707713 ... 0.07502177 0.10549482 0.10140227]
 [0.11554289 0.04478894 0.07141091 ... 0.09319874 0.08299643 0.10268765]
 ...
 [0.12

dW_mean = 2797516.334987683
dW1_mean = 44.293817484874026
dbeta1_mean = 453.6184969838285
dgamma1_mean = 102.74553765239398
Z's mean = -1085792.193735741
[[0.11790431 0.05089041 0.08720103 ... 0.10278007 0.08864618 0.10953421]
 [0.11494118 0.0476639  0.08886242 ... 0.08468837 0.09315525 0.11150416]
 [0.09821172 0.06093493 0.09025506 ... 0.08864525 0.10457273 0.10863898]
 ...
 [0.12013502 0.0479329  0.08727834 ... 0.09354345 0.08923819 0.10931093]
 [0.11528545 0.05313237 0.08217015 ... 0.08881059 0.08984001 0.11264729]
 [0.11616763 0.05034932 0.08883    ... 0.08869951 0.08846591 0.11349587]]
2.0998413248617545
dW_mean = 2928706.68753147
dW1_mean = 44.12106918069178
dbeta1_mean = 452.10775127109787
dgamma1_mean = 76.57768733549653
Z's mean = -1114004.9019441204
[[0.11324546 0.05238118 0.09485483 ... 0.0912681  0.09157387 0.10849297]
 [0.11847626 0.05021957 0.08246884 ... 0.08655826 0.09100606 0.11522948]
 [0.11101532 0.04825584 0.08688477 ... 0.09379993 0.09320447 0.11608684]
 ...
 [0.10

dW_mean = 2922992.7100151954
dW1_mean = 44.04515597325223
dbeta1_mean = 451.32390306630646
dgamma1_mean = 77.54974080604103
Z's mean = -1630562.8408390286
[[0.1139443  0.06237872 0.08802054 ... 0.09422241 0.09753905 0.10491943]
 [0.10460437 0.07112074 0.09606871 ... 0.10237986 0.10150102 0.10202936]
 [0.10953383 0.06919563 0.092519   ... 0.09529458 0.09229201 0.10748361]
 ...
 [0.11027048 0.06196586 0.08810249 ... 0.09566906 0.09901214 0.10773373]
 [0.10873141 0.06554898 0.095337   ... 0.08906603 0.09983113 0.10399791]
 [0.10524748 0.06802471 0.09143036 ... 0.08872523 0.09248568 0.1084576 ]]
2.219483695868503
dW_mean = 2980350.325500362
dW1_mean = 44.050754447087954
dbeta1_mean = 451.5666682368528
dgamma1_mean = 67.97191927093837
Z's mean = -1643180.5143890595
[[0.1074635  0.06774302 0.09733929 ... 0.09101986 0.10491682 0.10591723]
 [0.11572036 0.06182439 0.09230619 ... 0.09600727 0.09527066 0.10287167]
 [0.10135045 0.06727251 0.09731838 ... 0.08733986 0.09817138 0.1086635 ]
 ...
 [0.1

dW_mean = 2897925.55872636
dW1_mean = 44.04466532469715
dbeta1_mean = 451.46380201282943
dgamma1_mean = 57.72102409541901
Z's mean = -2101562.0362635264
[[0.10527609 0.08184218 0.09295689 ... 0.09834675 0.09977994 0.10440305]
 [0.10478812 0.07558298 0.0946607  ... 0.09880605 0.09779433 0.10408848]
 [0.10875064 0.07801722 0.09617004 ... 0.09685697 0.09950502 0.10410499]
 ...
 [0.10868967 0.07697063 0.09831083 ... 0.09512569 0.09999032 0.10235181]
 [0.104233   0.07928105 0.09729709 ... 0.09214638 0.09659219 0.10668828]
 [0.10557899 0.07634255 0.09302647 ... 0.09505191 0.10022624 0.10647089]]
2.329283575312051
dW_mean = 2852305.635489992
dW1_mean = 44.04072260649588
dbeta1_mean = 451.2278812620016
dgamma1_mean = 56.845600070036085
Z's mean = -2215180.242984419
[[0.10644331 0.0760844  0.09895268 ... 0.09270255 0.10068969 0.10142907]
 [0.09943127 0.08457489 0.09806765 ... 0.09414486 0.10377994 0.10363629]
 [0.10196175 0.08130267 0.09606259 ... 0.09877485 0.10381202 0.10368583]
 ...
 [0.1070

dW_mean = 2757118.3236906095
dW1_mean = 44.010146762528144
dbeta1_mean = 450.9454391307323
dgamma1_mean = 35.59408341668103
Z's mean = -2709047.2315771603
[[0.10258926 0.08865154 0.09714527 ... 0.09639814 0.10033729 0.10324676]
 [0.10178279 0.08984231 0.09758023 ... 0.09919467 0.09954116 0.10138382]
 [0.10218835 0.0879992  0.09929282 ... 0.09795941 0.10009968 0.10075472]
 ...
 [0.10439505 0.08705639 0.09980005 ... 0.09696802 0.10019094 0.10089717]
 [0.10401472 0.08998238 0.09812707 ... 0.09893082 0.09977705 0.10006341]
 [0.10243439 0.09017713 0.09607288 ... 0.09701233 0.09969455 0.10279037]]
2.3315477531843927
dW_mean = 2929315.9214616027
dW1_mean = 43.96393853253777
dbeta1_mean = 450.6503799707774
dgamma1_mean = 25.932028188548543
Z's mean = -2749948.101220915
[[0.10431445 0.08928723 0.09982845 ... 0.09448718 0.10018709 0.10094849]
 [0.10406155 0.08806538 0.09864647 ... 0.09514669 0.10070239 0.10137511]
 [0.10274661 0.08931517 0.09585828 ... 0.09720338 0.10037397 0.10259631]
 ...
 [0.

dW_mean = 2826090.1710658623
dW1_mean = 43.96712146046053
dbeta1_mean = 450.45759697860575
dgamma1_mean = 16.519101049590404
