In [151]:
import numpy as np
import pandas as pd

## Loading MNIST Dataset

In [152]:
mnist=pd.read_csv('train_mnist.csv')

In [153]:
mnist.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Getting Target Values

In [154]:
y=mnist.label
y.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

## Target values into Categorical Feature

In [155]:
y_d=pd.get_dummies(y,prefix='Num_')

In [156]:
y_d.head()

Unnamed: 0,Num__0,Num__1,Num__2,Num__3,Num__4,Num__5,Num__6,Num__7,Num__8,Num__9
0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0


In [157]:
y_d=np.array(y_d)
y_d.shape

(42000, 10)

## Dropping Target values from input

In [158]:
X=mnist.drop('label',axis=1)

## Reshaping Input to apply convolution

In [159]:
X=np.array(X)
X=X.reshape(42000,28,28,1)
X.shape

(42000, 28, 28, 1)

## Function for Padding

In [160]:
def zero_pad(X, pad):
    X_pad = np.pad(X, ((0, 0), (pad, pad), (pad, pad), (0, 0)), 'constant', constant_values=0)
    return X_pad

## Single Convolution Step

In [161]:
def conv_single_step(a_slice_prev, W, b):
    s = np.multiply(a_slice_prev, W) + b
    Z = np.sum(s)
    return Z

## Function for Convolution

In [162]:
def conv_forward(A_prev, W, b, hparameters):
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters['stride']
    pad = hparameters['pad']
    n_H = int((n_H_prev - f + 2 * pad) / stride) + 1
    n_W = int((n_W_prev - f + 2 * pad) / stride) + 1
    Z = np.zeros((m, n_H, n_W, n_C))
    A_prev_pad = zero_pad(A_prev, pad)
    for i in range(m):                                 # loop over the batch of training examples
        a_prev_pad = A_prev_pad[i]                     # Select ith training example's padded activation
        for h in range(n_H):                           # loop over vertical axis of the output volume
            for w in range(n_W):                       # loop over horizontal axis of the output volume
                for c in range(n_C):
                    vert_start = h * stride
                    vert_end = vert_start + f
                    horiz_start = w * stride
                    horiz_end = horiz_start + f
                    a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                    Z[i, h, w, c] = conv_single_step(a_slice_prev, W[...,c], b[...,c])
    assert(Z.shape == (m, n_H, n_W, n_C))
    cache = (A_prev, W, b, hparameters)
    return Z, cache 

## Defining ReLU Function

In [163]:
def ReLU(x):
    return np.maximum(0,x)

## Backprop in convolution

In [164]:
def conv_backward(dZ, cache):    
    (A_prev, W, b, hparameters) = cache
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    (m, n_H, n_W, n_C) = dZ.shape    
    # Initialize dA_prev, dW, db with the correct shapes
    dA_prev = np.zeros((m, n_H_prev, n_W_prev, n_C_prev))                           
    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # Pad A_prev and dA_prev
    A_prev_pad = zero_pad(A_prev, pad)
    dA_prev_pad = zero_pad(dA_prev, pad)
    
    for i in range(m):                      
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]
        
        for h in range(n_H):                   # loop over vertical axis of the output volume
            for w in range(n_W):               # loop over horizontal axis of the output volume
                for c in range(n_C):           # loop over the channels of the output volume
                    
                    # Find the corners of the current "slice"
                    vert_start = h
                    vert_end = vert_start + f
                    horiz_start = w
                    horiz_end = horiz_start + f
                    
                    # Use the corners to define the slice from a_prev_pad
                    a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                    # Update gradients for the window and the filter's parameters using the code formulas given above
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:,:,:,c] * dZ[i, h, w, c]
                    dW[:,:,:,c] += a_slice * dZ[i, h, w, c]
                    db[:,:,:,c] += dZ[i, h, w, c]
    assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))    
    return dA_prev, dW, db

## Defining softmax function

In [165]:
def softmax(x):
    s=np.exp(x)/np.sum(np.exp(x))
    return s

### This function takes our images after convolution and rehape it to a 2D matrices(No. of images,features). Then we convert this matrices to a matrices with 10 features (by multiplying with weight matrices and adding bias) so that we can apply softmax regression to it.

In [166]:
def ultimate_function(a):
    a=a.reshape(500,196)
    w1=np.random.rand(196,10)
    b1=np.random.rand(1,10)
    Z2=np.dot(a,w1)+b1
    ep=0.00000001
    Z2_mean=np.mean(Z2)
    Z2_std=np.std(Z2)
    Z2_norm=(Z2-np.mean(Z2))/(np.sqrt((Z2_std**2)+ep))
    gamma1=np.random.rand(1)
    beta1=np.random.rand(1)
    Z2_tilde=gamma1*Z2_norm+beta1
    a2=np.zeros((500,10))
    for i in range(500):
        a2[i]=softmax(Z2_tilde[i])
    return a2,Z2_norm

## Defining Cross Entropy Cost Function

In [167]:
def Cost(y,a):
    J=np.zeros((500,10))    #
    for i in range(500):
        J= - np.multiply(y[i], np.log(a[i])).sum()
        return J

In [168]:
def softmax_backprop(a):
    b=1-a
    return np.multiply(a+0.00000001,b)

In [169]:
def cost_back(y_d,a):
    return np.multiply(y_d,1/(a+0.00000001))

In [170]:
def wz_back(X):
    dzw=np.sum(X,axis=0)
    return dzw

## Applying Forward Propagation
### In this we initialize our weight, bias and hperparameters. Then we divide our input into mini batches of 500 images and apply convolution. After convolution batch normalization is done. Then we apply ReLU activation Function. Then we calculate our prediction values using softmax  regression. Cost Function is calculated with Cross Entropy. Then we apply backward Propagation.

In [171]:
W = np.random.randn(2, 2, 1, 1)
b = np.random.randn(1, 1, 1, 1)
hparameters = {"pad" : 0,
               "stride": 2}
for i in range(0,42000,500):
    Z, cache_conv = conv_forward(X[i:i+500,:,:,:], W, b, hparameters)
    print("Z's mean =", np.mean(Z))
    Z_mean=np.mean(Z)
    Z_std=np.std(Z)
    ep=0.00000001
    Z_norm=(Z-Z_mean)/(np.sqrt(Z_std**2+ep))
    gamma=np.random.rand(1)
    beta=np.random.rand(1)
    Z_tilde=gamma*Z_norm+beta
    a=ReLU(Z_tilde)    
    a2,Z2_norm=ultimate_function(a)
    print(a2)
    cost_func=Cost(y_d[i:i+500,:],a2)
    print(cost_func)  
    g=softmax_backprop(a2)
    h=cost_back(y_d[i:i+500],a2)
    gh=np.multiply(g,h)
    k=wz_back(Z2_norm)    
    dW2=np.multiply(gh,k)
    dA, dW, db = conv_backward(a, cache_conv)
    print("dW_mean =", np.mean(dW))
    print("dW2_mean =", np.mean(dW2))

Z's mean = 53.416851944754576
[[0.0922099  0.12572166 0.09146479 ... 0.11163668 0.07455732 0.08866839]
 [0.102226   0.11903097 0.07965086 ... 0.12943113 0.08148466 0.09574846]
 [0.09273306 0.09573683 0.1040541  ... 0.12255165 0.0825675  0.10581118]
 ...
 [0.09288351 0.09813341 0.10483337 ... 0.11696762 0.07894784 0.10652926]
 [0.1067603  0.11052085 0.09488496 ... 0.11614279 0.06951281 0.09628155]
 [0.08971386 0.1185841  0.10261994 ... 0.12877711 0.08135094 0.09649682]]
2.0736848523548157
dW_mean = 2120029.5115157976
dW2_mean = -0.9836949758309383
Z's mean = 55.788683006133056
[[0.08916571 0.09109705 0.09439007 ... 0.11212013 0.11922907 0.07187901]
 [0.08274049 0.09851488 0.10110023 ... 0.10504342 0.10798777 0.08879295]
 [0.08785957 0.09879071 0.11013633 ... 0.1033722  0.11953543 0.07682768]
 ...
 [0.08262084 0.10242916 0.1081012  ... 0.10400925 0.10298919 0.0831313 ]
 [0.09439761 0.08757166 0.1074601  ... 0.10444817 0.12086473 0.07917074]
 [0.08473929 0.09309969 0.10386228 ... 0.101737

dW_mean = 2575188.218681815
dW2_mean = 0.8686209248568645
Z's mean = 53.843109775082446
[[0.10083999 0.10234736 0.09223763 ... 0.10945517 0.09412513 0.09529288]
 [0.10256307 0.10283751 0.09427661 ... 0.09978642 0.09935695 0.08997836]
 [0.09932285 0.10011561 0.09577927 ... 0.10822712 0.09547901 0.0953342 ]
 ...
 [0.09811139 0.10073293 0.09047203 ... 0.11339724 0.09619137 0.08884273]
 [0.09617052 0.0988279  0.10151907 ... 0.10937383 0.09970837 0.09211573]
 [0.09641288 0.0989459  0.09132136 ... 0.10540744 0.09623707 0.09713194]]
2.2819467787577685
dW_mean = 1983327.586387075
dW2_mean = 0.970981381499675
Z's mean = 54.744605799442056
[[0.0574606  0.08780619 0.13135715 ... 0.0683736  0.10025045 0.11580078]
 [0.05287865 0.0948751  0.16099716 ... 0.0863534  0.10884701 0.11388514]
 [0.06413512 0.13781286 0.10902432 ... 0.0759221  0.11541509 0.07878447]
 ...
 [0.0420371  0.10657501 0.10840168 ... 0.08287646 0.18793318 0.1410916 ]
 [0.06424614 0.09707789 0.09717855 ... 0.09856651 0.13392115 0.11

dW_mean = 2712693.617350364
dW2_mean = -1.0302675477060217
Z's mean = 52.17917343231654
[[0.22883078 0.0488487  0.06090442 ... 0.03749759 0.17383423 0.08991195]
 [0.18559376 0.08001472 0.05998542 ... 0.02090655 0.11056721 0.06284201]
 [0.26708183 0.0654487  0.06260472 ... 0.02854747 0.12418568 0.05949321]
 ...
 [0.23152303 0.05228515 0.0678115  ... 0.02497875 0.12026926 0.06267168]
 [0.2642905  0.05506757 0.04119188 ... 0.0381914  0.09745631 0.09471308]
 [0.32193267 0.06441371 0.05830252 ... 0.02171685 0.10468285 0.06406212]]
3.019027440349101
dW_mean = 2401123.3817524062
dW2_mean = -2.3494790511727612
Z's mean = 53.63928916134866
[[0.09956715 0.10020725 0.09886799 ... 0.09926431 0.10043615 0.10090051]
 [0.0999864  0.10011523 0.09890863 ... 0.09912408 0.1005372  0.10102661]
 [0.09998071 0.10003508 0.09862064 ... 0.09926128 0.10032614 0.10061148]
 ...
 [0.10051615 0.10008209 0.09845972 ... 0.09919173 0.100277   0.10082016]
 [0.09951516 0.10028556 0.09877115 ... 0.09928613 0.10047491 0.1

dW_mean = 1129707.7390940944
dW2_mean = -0.36734612693438173
Z's mean = 52.13488230656429
[[0.10718835 0.11574996 0.09814558 ... 0.09091356 0.09914999 0.08192788]
 [0.09950146 0.12218315 0.11154842 ... 0.09378351 0.09435223 0.07916432]
 [0.10610151 0.11891242 0.09720778 ... 0.09852515 0.10003294 0.08503602]
 ...
 [0.09496214 0.1126377  0.11192177 ... 0.1016084  0.10208337 0.07767268]
 [0.10109372 0.10678786 0.10657252 ... 0.1000571  0.10311738 0.08265875]
 [0.09713168 0.12447665 0.11437271 ... 0.08760825 0.10201468 0.0858492 ]]
2.501915936665611
dW_mean = 584175.3241109016
dW2_mean = 1.8823339719887022
Z's mean = 52.483654569539866
[[0.12235333 0.13628467 0.06218569 ... 0.15531026 0.15112145 0.06980273]
 [0.13839047 0.13329382 0.05937392 ... 0.14604601 0.13634073 0.07609163]
 [0.1275306  0.13756152 0.06318181 ... 0.15283811 0.14295599 0.07440056]
 ...
 [0.12936116 0.13078762 0.0611455  ... 0.15462528 0.13947109 0.07259981]
 [0.13405637 0.12168166 0.06217771 ... 0.15361353 0.15522966 0.

dW_mean = 3181168.516050283
dW2_mean = 0.17205535743141576
Z's mean = 54.23602924167967
[[0.09858002 0.17227602 0.09074326 ... 0.0836342  0.08087072 0.06673782]
 [0.10055077 0.17840285 0.08726646 ... 0.08243763 0.07918318 0.06592918]
 [0.09563916 0.1792882  0.08669293 ... 0.0793557  0.08077813 0.05953833]
 ...
 [0.09363693 0.17348167 0.08684223 ... 0.08000152 0.08082973 0.06376313]
 [0.09047706 0.17076193 0.08557836 ... 0.08586873 0.07962364 0.06437856]
 [0.09218151 0.17468312 0.08545325 ... 0.0812069  0.07894071 0.06336233]]
1.9421117449405605
dW_mean = 159582.87396609495
dW2_mean = -0.737875979936814
Z's mean = 53.869921318889766
[[0.09972843 0.07115908 0.08336135 ... 0.11879821 0.10096756 0.0911778 ]
 [0.12049479 0.07051995 0.09116009 ... 0.10537786 0.08486194 0.10226875]
 [0.11011738 0.08476557 0.09737445 ... 0.10951122 0.08214442 0.09275183]
 ...
 [0.09205821 0.09876843 0.09353586 ... 0.08537069 0.10651794 0.09584062]
 [0.09115886 0.10273348 0.09056394 ... 0.09345271 0.10399452 0.

dW_mean = 2818002.7776514227
dW2_mean = 2.512905026218087
Z's mean = 53.449812115790834
[[0.07488807 0.10994696 0.01997658 ... 0.16271506 0.20556067 0.06413967]
 [0.07797973 0.11145755 0.01865545 ... 0.1848759  0.21102649 0.06866346]
 [0.09013254 0.11671983 0.03096194 ... 0.1600151  0.22047626 0.07414709]
 ...
 [0.08866998 0.11109271 0.03763415 ... 0.1108929  0.24381803 0.09262559]
 [0.07130262 0.13136633 0.02834793 ... 0.17236496 0.21777967 0.07521307]
 [0.07152319 0.15297471 0.02436207 ... 0.12293757 0.23069925 0.08054226]]
2.7466921788033076
dW_mean = 527818.9224742978
dW2_mean = -1.146469439640109
Z's mean = 53.59198027127459
[[0.10129756 0.09599802 0.10317456 ... 0.10579267 0.10243191 0.1020585 ]
 [0.10000479 0.09794838 0.09673013 ... 0.10236305 0.10446475 0.10593453]
 [0.09442452 0.10259193 0.09782161 ... 0.10178686 0.10593558 0.10224067]
 ...
 [0.09910234 0.09836697 0.09582558 ... 0.10371712 0.10321496 0.10271842]
 [0.09983348 0.09781323 0.09593026 ... 0.10664577 0.10701505 0.09