In [24]:
import numpy as np
import pandas as pd

## Loading MNIST Dataset

In [25]:
# Read the training CSV into a DataFrame; the path is relative, so it is resolved against the current working directory.
mnist=pd.read_csv('train_mnist.csv')

In [26]:
# Preview the first rows to check the column layout (a 'label' column followed by pixel0..pixel783).
mnist.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Getting Target Values

In [27]:
# Take the 'label' column as the target Series and preview it.
y=mnist.label
y.head()

0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64

## Converting Target Values into Categorical (One-Hot) Features

In [28]:
# One-hot encode the labels. With prefix 'Num_' and the default separator the
# resulting columns are named 'Num__0' ... 'Num__9' (note the double underscore).
y_d=pd.get_dummies(y,prefix='Num_')

In [29]:
# Preview the one-hot encoded targets.
y_d.head()

Unnamed: 0,Num__0,Num__1,Num__2,Num__3,Num__4,Num__5,Num__6,Num__7,Num__8,Num__9
0,0,1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0


In [30]:
# Convert the one-hot DataFrame into a plain NumPy array (column names are dropped)
# and display its shape.
y_d=np.array(y_d)
y_d.shape

(42000, 10)

## Dropping Target values from input

In [31]:
# Keep every column except 'label' as model input.
X=mnist.drop('label',axis=1)

## Reshaping Input to apply convolution

In [32]:
# Convert the pixel table to an array and reshape every 784-value row into a
# 28x28 single-channel image, giving the (m, height, width, channels) layout
# that conv_forward unpacks.
# -1 lets NumPy infer the number of images instead of hard-coding 42000, so the
# cell keeps working if the CSV has a different number of rows.
X=np.array(X)
X=X.reshape(-1,28,28,1)
X.shape

(42000, 28, 28, 1)

## Function for Padding

In [33]:
def zero_pad(X, pad):
    """Zero-pad the two middle axes of a 4-D array.

    Arguments:
    X -- array of shape (m, n_H, n_W, n_C)
    pad -- number of zeros added on each side of axes 1 and 2

    Returns:
    A new array of shape (m, n_H + 2*pad, n_W + 2*pad, n_C); axes 0 and 3 are
    left untouched.
    """
    untouched = (0, 0)
    border = (pad, pad)
    widths = (untouched, border, border, untouched)
    return np.pad(X, widths, 'constant', constant_values=0)

## Single Convolution Step

In [34]:
def conv_single_step(a_slice_prev, W, b):
    """Apply one filter to one input window and return a single value.

    Arguments:
    a_slice_prev -- input window, broadcast-compatible with W (conv_forward
                    passes a slice of shape (f, f, n_C_prev))
    W -- weights of one filter
    b -- bias of that filter; must contain exactly one value
         (conv_forward passes b[..., c], e.g. shape (1, 1, 1))

    Returns:
    Z -- sum(a_slice_prev * W) + b

    Raises:
    TypeError -- if b holds more than one element.
    """
    # The bias is added once, after the sum. Adding it before the sum
    # broadcasts it over every element of the window, so it gets counted
    # f*f*n_C_prev times instead of once.
    Z = np.sum(np.multiply(a_slice_prev, W)) + float(np.squeeze(b))
    return Z

## Function for Convolution

In [35]:
def conv_forward(A_prev, W, b, hparameters):
    """Forward pass of a convolution layer (naive nested-loop version).

    Arguments:
    A_prev -- input activations, shape (m, n_H_prev, n_W_prev, n_C_prev)
    W -- filters, shape (f, f, n_C_prev, n_C)
    b -- biases, indexed as b[..., c] for filter c
    hparameters -- dict with entries 'stride' and 'pad'

    Returns:
    Z -- convolution output, shape (m, n_H, n_W, n_C)
    cache -- the tuple (A_prev, W, b, hparameters), as unpacked by conv_backward
    """
    batch, in_h, in_w, _ = A_prev.shape
    # Filters are treated as square; the window size is read from W's second axis.
    _, f, _, n_filters = W.shape
    stride = hparameters['stride']
    pad = hparameters['pad']

    # Output spatial size for the given filter size, padding and stride.
    out_h = int((in_h - f + 2 * pad) / stride) + 1
    out_w = int((in_w - f + 2 * pad) / stride) + 1

    Z = np.zeros((batch, out_h, out_w, n_filters))
    padded = zero_pad(A_prev, pad)

    for idx in range(batch):                 # each example in the batch
        sample = padded[idx]
        for row in range(out_h):             # vertical position in the output
            top = row * stride
            for col in range(out_w):         # horizontal position in the output
                left = col * stride
                # The window is the same for every filter, so slice it once.
                window = sample[top:top + f, left:left + f, :]
                for ch in range(n_filters):  # each filter / output channel
                    Z[idx, row, col, ch] = conv_single_step(window, W[..., ch], b[..., ch])

    assert Z.shape == (batch, out_h, out_w, n_filters)
    cache = (A_prev, W, b, hparameters)
    return Z, cache

## Defining ReLU Function

In [36]:
def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and x."""
    floor = 0
    return np.maximum(floor, x)

## Backprop in convolution

In [37]:
def conv_backward(dZ, cache):
    """Backward pass of the convolution implemented in conv_forward.

    Arguments:
    dZ -- gradient of the cost with respect to the conv output Z,
          shape (m, n_H, n_W, n_C)
    cache -- the (A_prev, W, b, hparameters) tuple returned by conv_forward

    Returns:
    dA_prev -- gradient with respect to the layer input,
               shape (m, n_H_prev, n_W_prev, n_C_prev)
    dW -- gradient with respect to the filters, shape (f, f, n_C_prev, n_C)
    db -- gradient with respect to the biases, shape (1, 1, 1, n_C)
    """
    (A_prev, W, b, hparameters) = cache
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    (m, n_H, n_W, n_C) = dZ.shape

    dW = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # Zero-pad the spatial axes of the input exactly as the forward pass does
    # (same padding scheme as the zero_pad helper), and allocate a gradient
    # buffer of the padded shape to accumulate into.
    A_prev_pad = np.pad(A_prev, ((0, 0), (pad, pad), (pad, pad), (0, 0)),
                        'constant', constant_values=0)
    dA_prev_pad = np.zeros(A_prev_pad.shape)

    for i in range(m):
        a_prev_pad = A_prev_pad[i]
        da_prev_pad = dA_prev_pad[i]           # view: writes land in dA_prev_pad

        for h in range(n_H):                   # loop over vertical axis of the output volume
            for w in range(n_W):               # loop over horizontal axis of the output volume
                # The window corners must use the same stride as the forward
                # pass, otherwise gradients are routed to the wrong inputs.
                vert_start = h * stride
                vert_end = vert_start + f
                horiz_start = w * stride
                horiz_end = horiz_start + f

                a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]

                for c in range(n_C):           # loop over the channels of the output volume
                    grad = dZ[i, h, w, c]
                    da_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :] += W[:, :, :, c] * grad
                    dW[:, :, :, c] += a_slice * grad
                    db[:, :, :, c] += grad

    # Strip the padding to recover the gradient for the unpadded input.
    # Explicit end indices are used because a `pad:-pad` slice is empty when pad == 0.
    dA_prev = dA_prev_pad[:, pad:pad + n_H_prev, pad:pad + n_W_prev, :]

    assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))
    return dA_prev, dW, db

## Defining softmax function

In [38]:
def softmax(x):
    """Softmax taken over all elements of x.

    Arguments:
    x -- array-like of scores

    Returns:
    Array of the same shape as x whose entries are non-negative and sum to 1
    across the whole array (no axis is singled out).
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x  # nothing to normalise
    # Subtracting the maximum does not change the result mathematically, but
    # keeps exp() from overflowing to inf (inf/inf = nan) for large scores.
    e = np.exp(x - np.max(x))
    return e / np.sum(e)

### This function takes the images after convolution and reshapes them into a 2D matrix (number of images, features). It then maps this matrix to a matrix with 10 features (by multiplying with a weight matrix and adding a bias) so that softmax regression can be applied to it.

In [39]:
def ultimate_function(a):
    """Flatten activations, apply a randomly initialised dense layer, normalise
    the result and return per-row softmax outputs.

    Arguments:
    a -- activations whose first axis is the batch, e.g. (500, 14, 14, 1);
         all remaining axes are flattened into one feature axis

    Returns:
    a2 -- softmax outputs, shape (m, 10)
    Z2_tilde -- scaled and shifted normalised dense-layer outputs, shape (m, 10)

    Note: w1, b1, gamma1 and beta1 are drawn afresh from np.random.rand on
    every call and are not returned, so nothing persists between calls.
    """
    # Derive batch and feature sizes from the input instead of hard-coding
    # 500 and 196, so batches of any size are accepted.
    m = a.shape[0]
    a = a.reshape(m, -1)
    n_features = a.shape[1]

    # The order of the random draws is kept as w1, b1, gamma1, beta1.
    w1 = np.random.rand(n_features, 10)
    b1 = np.random.rand(1, 10)
    Z2 = np.dot(a, w1) + b1

    # Normalise with a single mean/std taken over the whole (m, 10) matrix.
    ep = 0.00000001
    Z2_mean = np.mean(Z2)
    Z2_std = np.std(Z2)
    Z2_norm = (Z2 - Z2_mean) / (np.sqrt((Z2_std ** 2) + ep))

    gamma1 = np.random.rand(1)
    beta1 = np.random.rand(1)
    Z2_tilde = gamma1 * Z2_norm + beta1

    # softmax() normalises over everything it is given, so apply it row by row.
    a2 = np.zeros((m, 10))
    for i in range(m):
        a2[i] = softmax(Z2_tilde[i])
    return a2, Z2_tilde

## Defining Cross Entropy Cost Function

In [40]:
def Cost(y,a):
    """Mean categorical cross-entropy over a batch.

    Arguments:
    y -- one-hot targets, shape (m, n_classes)
    a -- predicted probabilities, same shape as y

    Returns:
    J -- scalar: the per-example loss -sum(y * log(a)) averaged over the m rows
    """
    y = np.atleast_2d(np.asarray(y, dtype=float))
    a = np.atleast_2d(np.asarray(a, dtype=float))
    # Same small offset that cost_back uses; keeps log() finite when a
    # probability is exactly 0.
    eps = 0.00000001
    # One loss per example, then the batch average. Returning from inside a
    # per-example loop would report only the first example's loss.
    per_example = -np.sum(np.multiply(y, np.log(a + eps)), axis=1)
    J = np.mean(per_example)
    return J

In [41]:
def softmax_backprop(a):
    """Return (a + 1e-8) * (1 - a), computed element-wise.

    This matches the diagonal term a_i * (1 - a_i) of the softmax Jacobian
    (with a small offset on the first factor); off-diagonal terms are not
    computed here.
    """
    eps = 0.00000001
    complement = 1 - a
    shifted = a + eps
    return np.multiply(shifted, complement)

In [42]:
def cost_back(y_d,a):
    """Return y_d * 1/(a + 1e-8), computed element-wise.

    NOTE(review): the derivative of -y*log(a) with respect to a is -y/a; this
    helper returns the positive quantity -- confirm the sign is handled by the
    caller.
    """
    eps = 0.00000001
    reciprocal = 1 / (a + eps)
    return np.multiply(y_d, reciprocal)

In [43]:
def wz_back(X):
    """Sum X along its first axis (column sums for a 2-D input)."""
    return np.sum(X, axis=0)

## Applying Forward Propagation
### Here we initialize the weights, biases and hyperparameters. Then we divide the input into mini-batches of 500 images and apply convolution. After the convolution, batch normalization is applied, followed by the ReLU activation function. Predictions are then computed using softmax regression, and the cost is calculated with cross-entropy. Finally, we apply backward propagation.

In [44]:
# One 2x2 filter with a single input and a single output channel, plus its
# bias, both drawn from a standard normal distribution.
W = np.random.randn(2, 2, 1, 1)
b = np.random.randn(1, 1, 1, 1)
# No padding and stride 2: a 28x28 input gives a 14x14 output per image.
hparameters = {"pad" : 0,
               "stride": 2}
# Walk over the 42000 images in mini-batches of 500 (both sizes are hard-coded
# here and must match the 500 used inside ultimate_function and Cost).
for i in range(0,42000,500):
    # Convolution forward pass on the current mini-batch.
    Z, cache_conv = conv_forward(X[i:i+500,:,:,:], W, b, hparameters)
    print("Z's mean =", np.mean(Z))
    # Normalise Z using one scalar mean and one scalar std taken over the whole
    # batch tensor; ep guards against division by zero.
    Z_mean=np.mean(Z)
    Z_std=np.std(Z)
    ep=0.00000001
    Z_norm=(Z-Z_mean)/(np.sqrt(Z_std**2+ep))
    # NOTE(review): gamma and beta are redrawn at random for every batch, so
    # the scale/shift is not a learned parameter -- confirm this is intended.
    gamma=np.random.rand(1)
    beta=np.random.rand(1)
    Z_tilde=gamma*Z_norm+beta
    a=ReLU(Z_tilde)    
    # Dense layer + normalisation + per-row softmax; returns the softmax
    # outputs and the pre-softmax values.
    a2,Z2_tilde=ultimate_function(a)
    print(a2)
    # Cross-entropy between this batch's one-hot targets and the predictions.
    cost_func=Cost(y_d[i:i+500,:],a2)
    print(cost_func)  
    # Element-wise product of the softmax term and the cost term.
    g=softmax_backprop(a2)
    h=cost_back(y_d[i:i+500],a2)
    gh=np.multiply(g,h)
    dbeta2=np.sum(gh)
    # k is the sum of Z2_tilde over the batch axis (shape (10,)); it is
    # broadcast against gh (500, 10) in the product below.
    k=wz_back(Z2_tilde)    
    dW2=np.multiply(gh,k)
    dgamma2=np.sum(dW2)
    # NOTE(review): the ReLU activation `a` is passed as dZ here, not a
    # gradient propagated back from the cost -- confirm this is intended.
    # NOTE(review): no update of W or b from these gradients is visible in
    # this loop; the values are only printed.
    dA, dW, db = conv_backward(a, cache_conv)
    print("dW_mean =", np.mean(dW))
    print("dW2_mean =", np.mean(dW2))
    print("dbeta2_mean =", np.mean(dbeta2))
    print("dgamma2_mean =", np.mean(dgamma2))

Z's mean = 78.47236359146893
[[0.10502581 0.07183839 0.09584435 ... 0.07118828 0.09004855 0.10378371]
 [0.05666067 0.09088748 0.07805167 ... 0.09787025 0.09442361 0.1233466 ]
 [0.0937565  0.08590126 0.10574528 ... 0.09244159 0.07589612 0.10336874]
 ...
 [0.09317032 0.07328391 0.11378171 ... 0.09071478 0.07630867 0.10240715]
 [0.08670371 0.08400083 0.09041736 ... 0.102565   0.08284212 0.11753713]
 [0.04790104 0.11257273 0.07727807 ... 0.09226926 0.09809524 0.13524156]]
2.6333362387769683
dW_mean = 1835979.9095747336
dW2_mean = 20.866162015195975
dbeta2_mean = 452.0863768983447
Z's mean = 81.87716759370196
[[0.05347142 0.11982812 0.07351406 ... 0.11064238 0.11504557 0.09848264]
 [0.07290332 0.1171029  0.09207952 ... 0.08050479 0.1028367  0.09529827]
 [0.0915042  0.11122905 0.05594764 ... 0.0945005  0.11093946 0.10212286]
 ...
 [0.0565909  0.10651733 0.10338801 ... 0.07557402 0.12886733 0.08837886]
 [0.07683235 0.13802845 0.07569188 ... 0.11070791 0.09825937 0.10477067]
 [0.10337918 0.105

dW_mean = 1326942.4061790062
dW2_mean = 6.911241227808542
dbeta2_mean = 450.03481751884385
Z's mean = 78.99501369890925
[[0.10182364 0.08613369 0.10588335 ... 0.10440075 0.09091009 0.10459486]
 [0.14373764 0.12159571 0.09032359 ... 0.11159942 0.10487977 0.084227  ]
 [0.10031287 0.12936915 0.09375119 ... 0.0742754  0.09614874 0.07521894]
 ...
 [0.10815538 0.08535847 0.12202529 ... 0.11154755 0.11568921 0.08408841]
 [0.10543397 0.10826308 0.10717372 ... 0.09947926 0.11243791 0.07647436]
 [0.1045094  0.11027488 0.10235282 ... 0.07560645 0.10845223 0.09178208]]
2.4903966372875703
dW_mean = 1325893.236307495
dW2_mean = 1.2037930311869824
dbeta2_mean = 448.38105300952105
Z's mean = 80.24435452057095
[[0.09293451 0.06942677 0.10311827 ... 0.0944411  0.11118531 0.07648351]
 [0.06172341 0.11868933 0.08678058 ... 0.11950662 0.1339822  0.06291909]
 [0.05472318 0.08300617 0.13514233 ... 0.09802057 0.09451365 0.07087472]
 ...
 [0.09424761 0.08877842 0.12276301 ... 0.06219392 0.12624954 0.05167462]


Z's mean = 78.461418483353
[[0.11424756 0.08839902 0.12503406 ... 0.09632477 0.0829956  0.09376396]
 [0.11299693 0.08955791 0.12405614 ... 0.09930113 0.08412149 0.09637612]
 [0.11335201 0.09464237 0.12348807 ... 0.09315119 0.0850954  0.09442405]
 ...
 [0.12073443 0.08968267 0.11783067 ... 0.09582856 0.09301292 0.0910009 ]
 [0.1170748  0.08628544 0.12395956 ... 0.095771   0.08066599 0.09767081]
 [0.11521838 0.08923006 0.11894608 ... 0.10291124 0.08785866 0.09070829]]
2.3669747649803816
dW_mean = 627029.0778839217
dW2_mean = 41.257652687014506
dbeta2_mean = 450.0864374755081
Z's mean = 76.18386139405727
[[0.09386685 0.10287745 0.09743389 ... 0.09960092 0.11167075 0.10546083]
 [0.08174335 0.12202027 0.11689763 ... 0.11050025 0.08690018 0.10108153]
 [0.0910443  0.11128746 0.0946186  ... 0.0905219  0.08856974 0.1246237 ]
 ...
 [0.08244088 0.10104898 0.10515064 ... 0.10097396 0.09669544 0.09910853]
 [0.10229022 0.09958464 0.11587542 ... 0.10321381 0.08866187 0.10098352]
 [0.10329103 0.116385

dW_mean = 418043.22256298194
dW2_mean = 44.676202925332994
dbeta2_mean = 451.29890105987624
Z's mean = 78.53499030243455
[[0.13616264 0.06957739 0.12708335 ... 0.05365714 0.10960974 0.12782307]
 [0.15860165 0.05585857 0.12174487 ... 0.07588074 0.16168063 0.11362447]
 [0.13374518 0.05996679 0.13651879 ... 0.05880813 0.12568636 0.12237031]
 ...
 [0.11310898 0.08205624 0.11970307 ... 0.05499638 0.17262434 0.10113567]
 [0.13476787 0.08021502 0.09277613 ... 0.05462992 0.16091773 0.11199963]
 [0.12551609 0.07972728 0.11950515 ... 0.05712416 0.16282557 0.09880843]]
2.9251408034444326
dW_mean = 1292374.0192632005
dW2_mean = 42.16716412847614
dbeta2_mean = 449.8682421941799
Z's mean = 76.42685764753435
[[0.10144259 0.10109864 0.09344577 ... 0.10337029 0.0917669  0.09543426]
 [0.09964406 0.10331251 0.09309374 ... 0.10583343 0.09789653 0.09149647]
 [0.10155823 0.10279296 0.09443461 ... 0.10194113 0.09619595 0.09503746]
 ...
 [0.10005739 0.10589257 0.09482976 ... 0.10450176 0.10150181 0.09058103]


Z's mean = 78.94078944636125
[[0.09738741 0.0973504  0.10201335 ... 0.11108604 0.09690681 0.09777339]
 [0.09580045 0.08843855 0.11096511 ... 0.10878267 0.1090197  0.09868244]
 [0.09838046 0.08872682 0.1014445  ... 0.11094986 0.11154993 0.09483957]
 ...
 [0.10467647 0.09199426 0.10923674 ... 0.09828297 0.10228604 0.09311131]
 [0.10159517 0.09758702 0.10251009 ... 0.1031904  0.09471525 0.0960029 ]
 [0.09404432 0.09486994 0.10295765 ... 0.10725288 0.09456877 0.10014366]]
2.329438402247109
dW_mean = 688240.9417280453
dW2_mean = 19.129024920697482
dbeta2_mean = 450.3573624761749
Z's mean = 79.77937161452033
[[0.09736675 0.11180658 0.10055604 ... 0.0994085  0.0967501  0.10257397]
 [0.098528   0.1004602  0.09462809 ... 0.09641489 0.100312   0.10267128]
 [0.09879128 0.10407575 0.09460095 ... 0.09777881 0.10018529 0.10405472]
 ...
 [0.09760062 0.10343373 0.09549646 ... 0.10138121 0.09862067 0.10178496]
 [0.10345687 0.10057627 0.09661407 ... 0.10580089 0.09726016 0.09744205]
 [0.09436089 0.10300

dW_mean = 2780711.1761939027
dW2_mean = 32.89407159915343
dbeta2_mean = 449.4073836832025
Z's mean = 77.00159668233891
[[0.10881042 0.09509668 0.08879567 ... 0.08759387 0.10134523 0.10747426]
 [0.09502186 0.09709423 0.08752208 ... 0.09615373 0.09288494 0.11602775]
 [0.09733627 0.09173606 0.08951296 ... 0.09454362 0.09792775 0.1177959 ]
 ...
 [0.09243489 0.09195438 0.09282099 ... 0.09446834 0.09630472 0.1180945 ]
 [0.09872847 0.10156413 0.08583677 ... 0.10019393 0.10817567 0.11061796]
 [0.10585437 0.09395823 0.08449957 ... 0.09184058 0.10146805 0.10981703]]
2.2181482227768767
dW_mean = 2233116.098482105
dW2_mean = 27.653788762334234
dbeta2_mean = 449.51673640901856
Z's mean = 77.18044524889471
[[0.11522708 0.10074929 0.09739638 ... 0.1036512  0.08059777 0.10905913]
 [0.09758891 0.08551367 0.10091615 ... 0.11128518 0.11218685 0.09728426]
 [0.1311385  0.04412195 0.09591425 ... 0.15300364 0.1113147  0.09693104]
 ...
 [0.09879266 0.07187838 0.07512229 ... 0.08826076 0.12278266 0.11221638]
 