In [86]:
import numpy as np
import pandas as pd

## Loading MNIST Dataset

In [87]:
mnist=pd.read_csv('train_mnist.csv')

In [88]:
mnist.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Getting Target Values

In [89]:
y=mnist.label
y.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

## Target values into Categorical Feature

In [90]:
y_d=pd.get_dummies(y,prefix='Num_')

In [91]:
y_d.head()

Unnamed: 0,Num__0,Num__1,Num__2,Num__3,Num__4,Num__5,Num__6,Num__7,Num__8,Num__9
0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0


In [92]:
y_d=np.array(y_d)
y_d.shape

(42000, 10)

## Dropping Target values from input

In [93]:
X=mnist.drop('label',axis=1)

## Reshaping Input to apply convolution

In [94]:
X=np.array(X)
X=X.reshape(42000,28,28,1)
X.shape

(42000, 28, 28, 1)

## Function for Padding

In [95]:
def zero_pad(X, pad):
    X_pad = np.pad(X, ((0, 0), (pad, pad), (pad, pad), (0, 0)), 'constant', constant_values=0)
    return X_pad

## Single Convolution Step

In [96]:
def conv_single_step(a_slice_prev, W, b):
    s = np.multiply(a_slice_prev, W) + b
    Z = np.sum(s)
    return Z

## Function for Convolution

In [97]:
def conv_forward(A_prev, W, b, hparameters):
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters['stride']
    pad = hparameters['pad']
    n_H = int((n_H_prev - f + 2 * pad) / stride) + 1
    n_W = int((n_W_prev - f + 2 * pad) / stride) + 1
    Z = np.zeros((m, n_H, n_W, n_C))
    A_prev_pad = zero_pad(A_prev, pad)
    for i in range(m):                                 # loop over the batch of training examples
        a_prev_pad = A_prev_pad[i]                     # Select ith training example's padded activation
        for h in range(n_H):                           # loop over vertical axis of the output volume
            for w in range(n_W):                       # loop over horizontal axis of the output volume
                for c in range(n_C):
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_start + f
                    a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                    Z[i, h, w, c] = conv_single_step(a_slice_prev, W[...,c], b[...,c])
    assert(Z.shape == (m, n_H, n_W, n_C))
    cache = (A_prev, W, b, hparameters)
    return Z, cache 

## Defining ReLU Function

In [98]:
def ReLU(x):
    return np.maximum(0,x)

## Backprop in convolution

In [99]:
def conv_backward(dZ, cache):    
    (A_prev, W, b, hparameters) = cache
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    (m, n_H, n_W, n_C) = dZ.shape    
    # Initialize dA_prev, dW, db with the correct shapes
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))                           
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # Pad A_prev and dA_prev
    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = zero_pad(dA_prev, pad)
    
    for i in range(m):                      
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]
        
        for h in range(n_H):                   # loop over vertical axis of the output volume
            for w in range(n_W):               # loop over horizontal axis of the output volume
                for c in range(n_C):           # loop over the channels of the output volume
                    
                    # Find the corners of the current "slice"
                    vert_start = h
                    vert_end = vert_start + f
                    horiz_start = w
                    horiz_end = horiz_start + f
                    
                    # Use the corners to define the slice from a_prev_pad
                    a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                    # Update gradients for the window and the filter's parameters using the code formulas given above
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
                    dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
                    db[:,:,:,c] += dZ[i, h, w, c]
    assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))    
    return dA_prev, dW, db

## Defining softmax function

In [100]:
def softmax(x):
    s=np.exp(x)/np.sum(np.exp(x))
    return s

###  This function convert our input matrices to a matrices with 10 features (by multiplying with weight matrices and adding bias) so that we can apply softmax regression to it.

In [101]:
def ultimate_function(a,w1,b1,gamma1,beta1):
    
    Z2=np.dot(a,w1)+b1
    ep=0.00000001
    Z2_mean=np.mean(Z2)
    Z2_std=np.std(Z2)
    Z2_norm=(Z2-np.mean(Z2))/(np.sqrt((Z2_std**2)+ep))   
    Z2_tilde=gamma1*Z2_norm+beta1
    a2=np.zeros((500,10))
    for i in range(500):
        a2[i]=softmax(Z2_tilde[i])
    return a2,Z2_tilde

## Defining Cross Entropy Cost Function

In [102]:
def Cost(y,a):
    J=np.zeros((500,10))    #
    for i in range(500):
        J= - np.multiply(y[i], np.log(a[i])).sum()
        return J

## Differentiating softmax for backpropagation and getting result

In [103]:
def softmax_backprop(a):
    b=1-a
    return np.multiply(a+0.00000001,b)

## Differentiating Cost Function for backpropagation and getting result

In [104]:
def cost_back(y_d,a):
    return np.multiply(y_d,1/(a+0.00000001))

## Finding dL/dW

In [105]:
def wz_back(X,a):
    dzw=np.dot(X,a)
    return dzw

## Applying Batch_Norm (Forward Propagation and Backward Propagation)
### In this we initialize our weight, bias and hperparameters. Then we divide our input into mini batches of 500 images and apply convolution. After convolution batch normalization is done. Then we apply ReLU activation Function. Then we calculate our prediction values using softmax  regression. Cost Function is calculated with Cross Entropy. Then we apply backward Propagation.

In [106]:
W = np.random.randn(2, 2, 1, 1)                  # Initializing weight for covolution Layer 
b = np.random.randn(1, 1, 1, 1)                  # Initializing bias for convolution Layer
hparameters = {"pad" : 0,                        # Initializing hyperparameters
               "stride": 2}
gamma1=np.random.rand(1)                        # Intializing BETA and GAMMA for batch-norm
beta1=np.random.rand(1)
w1=np.random.rand(196,10)                       # Initializing weight for 2nd Layer 
b1=np.random.rand(1,10)                         # Initializing bias for 2nd Layer
alpha=0.000005                                  # Learning Rate
for i in range(0,42000,500):
   
    Z, cache_conv = conv_forward(X[i:i+500,:,:,:], W, b, hparameters)  # Forward Convolution Layer
    print("Z's mean =", np.mean(Z))
    Z_mean=np.mean(Z)
    Z_std=np.std(Z)
    ep=0.00000001
    Z_norm=(Z-Z_mean)/(np.sqrt(Z_std**2+ep))          # Normalization of Z
    
    Z_tilde=gamma*Z_norm+beta                         # Changing mean and variance of Z
    a=ReLU(Z_tilde)                                   # Activation Function
    ac=a.reshape(500,196)                             # Reshaping a copy of Input Activation
    
    a2,Z2_tilde=ultimate_function(ac,w1,b1,gamma1,beta1)  # Finding the softmax output
    
    print(a2)                                             # Printing the probabilities of each number
    
    cost_func=Cost(y_d[i:i+500,:],a2)                     # Finding cost function using cross entropy
    
    print(cost_func)  
    
    g=softmax_backprop(a2)                              # Softmax Backpropagation differentiation output matrix
    
    h=cost_back(y_d[i:i+500],a2)                        # Cost Function differentiation output matrix 
    
    gh=np.multiply(g,h)                                    
    
    dbeta1=np.sum(gh)                                  # Finding dBeta
    
    dw1=wz_back(ac.T,gh)                               # Finding dW of 2nd Layer
    
    w1=w1-alpha*dw1/500                                # Changing W
    
    dgamma1=np.sum(dw1)                                # Finding dGamma
    
    gamma1=gamma1-alpha*dgamma1/500                    # Changing Gamma
    
    beta1=beta1-alpha*dbeta1/500                       # Changing Beta
    
    dA, dW, db = conv_backward(a, cache_conv)           # Backprop for Convolution Layer
    
    W=W-alpha*dW/500                                    # Changing W for Convolution Layer
    
    b=b-alpha*db/500                                    # Changing b for Convolution Layer
    
    print("dW_mean =", np.mean(dW))
    print("dW1_mean =", np.mean(dw1))
    print("dbeta1_mean =", np.mean(dbeta1))
    print("dgamma1_mean =", np.mean(dgamma1))

Z's mean = 22.76792867308532
[[0.09819632 0.09037209 0.11687867 ... 0.08026304 0.10786598 0.10488275]
 [0.10233775 0.09982876 0.10061598 ... 0.09547359 0.12025805 0.09582089]
 [0.09182155 0.1124685  0.11179781 ... 0.09461908 0.10399018 0.09909217]
 ...
 [0.09594417 0.10916993 0.10774578 ... 0.09375058 0.10264314 0.10505939]
 [0.08530978 0.10067067 0.07656782 ... 0.11118756 0.11171705 0.11613702]
 [0.10279768 0.0853607  0.10620139 ... 0.10594772 0.09283728 0.09096554]]
2.4038198107117252
dW_mean = 1061441.7059219794
dW1_mean = 14.18449592035357
dbeta1_mean = 449.86691759380136
dgamma1_mean = 27801.612003892995
Z's mean = -148254806.1522347
[[0.0873436  0.1091824  0.10276102 ... 0.08062356 0.14980786 0.10677989]
 [0.08041466 0.09400511 0.09581659 ... 0.07491491 0.17584214 0.12043925]
 [0.08035613 0.1147702  0.10792457 ... 0.07672314 0.15638525 0.11084175]
 ...
 [0.08296962 0.10234779 0.10865221 ... 0.07130378 0.16557711 0.1103228 ]
 [0.0796262  0.10110253 0.10431517 ... 0.07822794 0.1480

dW_mean = 1101328.7592625667
dW1_mean = 17.100297328001588
dbeta1_mean = 449.54880732116135
dgamma1_mean = 33516.58276288311
Z's mean = -3600946091.967984
[[0.07767106 0.10061202 0.11220828 ... 0.07728522 0.16399661 0.11459043]
 [0.08634922 0.1080398  0.0963465  ... 0.07108944 0.17208825 0.11500368]
 [0.08341692 0.11671168 0.09830848 ... 0.08973394 0.1475318  0.12330287]
 ...
 [0.08438718 0.10306236 0.10044681 ... 0.08039917 0.1721368  0.11217292]
 [0.08312522 0.09380461 0.10236344 ... 0.08414995 0.15634194 0.12017033]
 [0.08573577 0.1062333  0.10570922 ... 0.08763009 0.1481624  0.10449053]]
2.5226792590505713
dW_mean = 2450086.381003015
dW1_mean = 36.67983544896468
dbeta1_mean = 450.1604332254419
dgamma1_mean = 71892.47747997078
Z's mean = -3895184698.6550474
[[0.08076539 0.09085956 0.09681115 ... 0.08435022 0.17539908 0.11806027]
 [0.07637119 0.11751707 0.10402991 ... 0.07138033 0.15327504 0.11919943]
 [0.08263585 0.09557302 0.10860937 ... 0.08134676 0.15901455 0.11633169]
 ...
 [0.0

dW_mean = 699447.4164974364
dW1_mean = 11.005211894038565
dbeta1_mean = 448.98107151819886
dgamma1_mean = 21570.215312315588
Z's mean = -6818323734.259597
[[0.07077254 0.1201974  0.11396726 ... 0.08627813 0.15477043 0.1132071 ]
 [0.07456148 0.1061056  0.10935175 ... 0.07551048 0.16052575 0.11502729]
 [0.07287292 0.10887592 0.11566277 ... 0.0774487  0.14515196 0.11456321]
 ...
 [0.06912357 0.10693803 0.10948598 ... 0.07500467 0.16745234 0.1224818 ]
 [0.08046724 0.10379747 0.12059982 ... 0.08089432 0.16396715 0.10477241]
 [0.06852842 0.10446612 0.10378378 ... 0.07301866 0.16897226 0.11231656]]
2.427257398191677
dW_mean = 514361.78005955287
dW1_mean = 7.6816079775727095
dbeta1_mean = 448.86298752826895
dgamma1_mean = 15055.951636042511
Z's mean = -6818131188.729314
[[0.08547137 0.09936437 0.10354174 ... 0.08004848 0.16232222 0.11251911]
 [0.07730547 0.10712286 0.10103497 ... 0.07631908 0.16972808 0.11580214]
 [0.07139172 0.11315431 0.10083162 ... 0.08648074 0.14870168 0.12345218]
 ...
 [0

dW_mean = 1340690.6588718682
dW1_mean = 20.073263717731948
dbeta1_mean = 449.43214933903005
dgamma1_mean = 39343.59688675462
Z's mean = -9733684556.116898
[[0.07787551 0.10727684 0.09660113 ... 0.08751421 0.14810124 0.11310447]
 [0.08514691 0.10642799 0.11720596 ... 0.08055768 0.14649663 0.10495322]
 [0.07772348 0.09290129 0.11364548 ... 0.08595668 0.1746477  0.10125656]
 ...
 [0.08454353 0.09898922 0.09639128 ... 0.08072654 0.16415795 0.11318322]
 [0.08546504 0.09932871 0.12146821 ... 0.07749749 0.16757982 0.10552611]
 [0.0779748  0.10819704 0.10987122 ... 0.07970303 0.16902266 0.0991616 ]]
2.333881862179155
dW_mean = 1748748.8540531741
dW1_mean = 25.749946574358272
dbeta1_mean = 449.59121899397803
dgamma1_mean = 50469.89528574221
Z's mean = -9804938106.73507
[[0.06844923 0.10656395 0.11043874 ... 0.07109667 0.17334162 0.11633609]
 [0.07505937 0.10431579 0.10113563 ... 0.08113366 0.15919303 0.10900709]
 [0.06831989 0.11148698 0.10549189 ... 0.07070514 0.16205904 0.11316705]
 ...
 [0.0

dW_mean = 2726703.1822069865
dW1_mean = 41.18589588195987
dbeta1_mean = 449.01335442732415
dgamma1_mean = 80724.35592864135
Z's mean = -12615266075.906849
[[0.08626056 0.10215928 0.11747256 ... 0.09497043 0.14662305 0.09787565]
 [0.08158305 0.11863164 0.09142496 ... 0.07738329 0.15723627 0.1123391 ]
 [0.08720164 0.09605869 0.09867825 ... 0.08179289 0.16164136 0.12350635]
 ...
 [0.08595652 0.09861831 0.09588845 ... 0.07564047 0.17259206 0.11184888]
 [0.07885253 0.10695098 0.10540932 ... 0.07867098 0.159895   0.11211544]
 [0.07683513 0.10552634 0.1070048  ... 0.08074201 0.14676619 0.11568059]]
2.356202109558633
dW_mean = 2111767.5081314375
dW1_mean = 32.528106283128125
dbeta1_mean = 450.34739048154756
dgamma1_mean = 63755.08831493113
Z's mean = -13351515132.485905
[[0.06251231 0.11511986 0.09583731 ... 0.06276417 0.20052164 0.1338819 ]
 [0.06231795 0.11642509 0.09619071 ... 0.06329915 0.19841184 0.13450742]
 [0.06220071 0.11682039 0.09575601 ... 0.06325223 0.19760969 0.13471906]
 ...
 [0

dW_mean = 1697142.4999909606
dW1_mean = 27.17567288080394
dbeta1_mean = 450.6283810094694
dgamma1_mean = 53264.318846375725
Z's mean = -16078391705.391153
[[0.0778436  0.10469465 0.10071032 ... 0.08591608 0.15709118 0.10894268]
 [0.08559222 0.10431511 0.10789068 ... 0.08408104 0.14426223 0.10957776]
 [0.07896301 0.10956978 0.10211988 ... 0.08075686 0.13717124 0.11902212]
 ...
 [0.07828539 0.09276625 0.09825805 ... 0.07811939 0.17011064 0.11753221]
 [0.08129741 0.10022785 0.10978792 ... 0.0840631  0.15006777 0.12274586]
 [0.0809417  0.10504076 0.11452204 ... 0.07628087 0.15918069 0.10419362]]
2.2955069546793285
dW_mean = 1247476.4662077103
dW1_mean = 18.67425098995028
dbeta1_mean = 449.86495512036794
dgamma1_mean = 36601.53194030255
Z's mean = -16254820197.815092
[[0.07466025 0.11789281 0.10768929 ... 0.07824408 0.15075103 0.12120925]
 [0.07925593 0.10012455 0.10413917 ... 0.07237124 0.16211909 0.11319904]
 [0.07620264 0.09222647 0.11756905 ... 0.08204654 0.15346599 0.12004681]
 ...
 [0

dW_mean = 2826222.2854559477
dW1_mean = 43.87708702472908
dbeta1_mean = 449.51465922500023
dgamma1_mean = 85999.090568469
