## Mini-Batch and Modified Mini-Batch
### Imports

In [1]:
import numpy as np
import matplotlib as plt
import math
from random import*
from numpy.testing import assert_almost_equal

In [2]:
def GA(n,d):
    """Generate a Gaussian linear-regression data set.

    Rows of the design matrix are drawn from a multivariate normal whose
    mean is 1 in every coordinate and whose covariance is
    ``C[i, j] = 2 * 0.5**|i - j|``. The response is ``X.dot(beta) + noise``
    where ``beta`` is ten ones, ``d - 20`` entries of 0.1 and ten more ones,
    and the noise is normal with mean 0 and standard deviation 9.

    Args:
        n: number of rows to draw.
        d: number of columns; the coefficient layout needs ``d >= 20``.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape ``(n, d)`` and ``y`` of shape
        ``(n,)``.
    """
    positions = np.arange(d)
    # One covariance value per lag |i - j|, then laid out as a (d, d) matrix.
    value_by_lag = np.array([2 * (0.5 ** lag) for lag in range(d)])
    covariance = value_by_lag[np.abs(positions[:, None] - positions[None, :])]

    # Draw order (design rows first, noise second) is kept so that a seeded
    # run reproduces the same sample.
    design = np.random.multivariate_normal(np.ones(d), covariance, n)  # this is A
    coefficients = np.concatenate(
        (np.ones(10), np.full(d - 20, 0.1), np.ones(10))
    )
    errors = np.random.normal(loc=0.0, scale=9, size=n)
    response = design.dot(coefficients) + errors  # this is b
    return design, response

In [3]:
def T3(n,d):
    """Generate a heavier-tailed regression data set using 3 degrees of freedom.

    Gaussian rows with mean 1 and covariance ``C[i, j] = 2 * 0.5**|i - j|``
    are each divided by ``sqrt(w)``, where ``w`` is a chi-square draw with
    3 degrees of freedom divided by 3. The response is ``X.dot(beta) + noise``
    with ``beta`` made of ten ones, ``d - 20`` entries of 0.1 and ten more
    ones, and normal noise with mean 0 and standard deviation 9.

    Args:
        n: number of rows to draw.
        d: number of columns; the coefficient layout needs ``d >= 20``.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape ``(n, d)`` and ``y`` of shape
        ``(n,)``.
    """
    dof = 3
    positions = np.arange(d)
    # One covariance value per lag |i - j|, then laid out as a (d, d) matrix.
    value_by_lag = np.array([2 * (0.5 ** lag) for lag in range(d)])
    covariance = value_by_lag[np.abs(positions[:, None] - positions[None, :])]

    # Draw order (chi-square, Gaussian rows, noise) is kept so that a seeded
    # run reproduces the same sample.
    row_weights = np.random.chisquare(dof, n) / dof
    gaussian_rows = np.random.multivariate_normal(np.ones(d), covariance, n)
    design = gaussian_rows / np.sqrt(row_weights).reshape(-1, 1)

    coefficients = np.concatenate(
        (np.ones(10), np.full(d - 20, 0.1), np.ones(10))
    )
    errors = np.random.normal(loc=0.0, scale=9, size=n)
    response = design.dot(coefficients) + errors  # this is b
    return design, response

In [4]:
def T1(n,d):
    """Generate a very heavy-tailed regression data set using 1 degree of freedom.

    Gaussian rows with mean 1 and covariance ``C[i, j] = 2 * 0.5**|i - j|``
    are each divided by ``sqrt(w)``, where ``w`` is a chi-square draw with
    1 degree of freedom. The response is ``X.dot(beta) + noise`` with
    ``beta`` made of ten ones, ``d - 20`` entries of 0.1 and ten more ones,
    and normal noise with mean 0 and standard deviation 9.

    Args:
        n: number of rows to draw.
        d: number of columns; the coefficient layout needs ``d >= 20``.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape ``(n, d)`` and ``y`` of shape
        ``(n,)``.
    """
    dof = 1
    # Covariance depends only on the lag |i - j|, so tabulate it per lag.
    lag_table = np.array([2 * (0.5 ** lag) for lag in range(d)])
    column_ids = np.arange(d)
    lag_matrix = np.abs(column_ids.reshape(-1, 1) - column_ids.reshape(1, -1))
    covariance = lag_table[lag_matrix]

    # Draw order (chi-square, Gaussian rows, noise) is kept so that a seeded
    # run reproduces the same sample.
    chi_scaled = np.random.chisquare(dof, n) / dof
    normal_rows = np.random.multivariate_normal(np.ones(d), covariance, n)
    design = normal_rows / np.sqrt(chi_scaled).reshape(-1, 1)

    head_and_tail = np.ones(10)
    middle = np.full(d - 20, 0.1)
    coefficients = np.concatenate((head_and_tail, middle, head_and_tail))
    errors = np.random.normal(loc=0.0, scale=9, size=n)
    response = design.dot(coefficients) + errors  # this is b

    return design, response

# Given Inputs:
### b: batch size
### k: number of clusters
### t: number of iterations

In [5]:
# Run parameters, read interactively; each answer is parsed with int(), so a
# non-integer answer raises ValueError.
batch_size = int(input('enter the batch size: '))  # rows sampled from Matrix per iteration
k = int(input('enter number of cluster required: '))  # number of cluster centres
t = int(input('enter number of iterations: '))  # requested number of iterations

In [6]:
# Size of the synthetic data set handed to GA / T3 / T1.
n = int(input('# rows = '))  # number of rows (samples) to generate
# The generators build a coefficient vector with d-20 middle entries, so d
# needs to be at least 20.
d = int(input('# columns = '))  # number of columns (features) to generate

In [7]:
def mini_batch(iterations=None):
    """Run mini-batch k-means on the global ``Matrix`` and print the cost.

    Each iteration samples ``batch_size`` distinct rows of ``Matrix``
    uniformly at random, assigns every sampled row to its nearest centre
    (squared Euclidean distance, centres frozen for the assignment step),
    and then moves each centre towards its assigned rows with a per-centre
    learning rate of ``1 / (number of rows assigned to it so far)``.
    After the update, the sum over all rows of ``Matrix`` of the squared
    distance to the nearest centre is printed.

    The global ``Cluster_centre`` array is updated in place.

    Args:
        iterations: number of iterations to run. When omitted, the
            module-level ``t`` entered by the user is used (previously the
            loop was hard-coded to 10 and ``t`` was ignored).

    Returns:
        None. Progress is reported with ``print``.
    """
    global Cluster_centre,Matrix,k
    if iterations is None:
        iterations = t

    # Per-centre count of assigned rows; drives the 1/count learning rate.
    counts = np.zeros(Cluster_centre.shape[0])
    n_rows = Matrix.shape[0]

    for i in range(iterations):
        index = np.random.choice(n_rows,batch_size,replace=False)
        M = Matrix[index,:]

        # Assignment step: nearest centre for each sampled row. Strict '<'
        # keeps the lowest-numbered centre on ties.
        nearest = np.zeros(M.shape[0], dtype=int)
        best_dist = np.full(M.shape[0], np.inf)
        for cc in range(k):
            offset = M - Cluster_centre[cc]
            sq_dist = np.sum(offset * offset, axis=1)
            closer = sq_dist < best_dist
            nearest[closer] = cc
            best_dist[closer] = sq_dist[closer]

        # Update step: sequential on purpose, since each row changes the
        # learning rate seen by the next row assigned to the same centre.
        for row, cc in zip(M, nearest):
            counts[cc] += 1
            rate = 1 / counts[cc]
            Cluster_centre[cc] = Cluster_centre[cc] * (1 - rate) + row * rate

        # Cost over the full data set with the updated centres.
        nearest_sq = np.full(n_rows, np.inf)
        for centre in Cluster_centre:
            offset = Matrix - centre
            np.minimum(nearest_sq, np.sum(offset * offset, axis=1), out=nearest_sq)
        totalCost_mini_batch = nearest_sq.sum()

        print("Iteration",i)
        print("Total Cost----",totalCost_mini_batch)

## Implementing modified Mini-Batch

In [8]:
def modified_MB(iterations=None):
    """Run mini-batch k-means with distance-weighted sampling and print the cost.

    Works like plain mini-batch k-means, except that the rows of each batch
    are drawn (without replacement) with the probabilities returned by
    ``computeProb(Cluster_centre, Matrix)`` instead of uniformly. Sampled
    rows are assigned to their nearest centre (squared Euclidean distance)
    and each centre is moved towards its rows with a learning rate of
    ``1 / (number of rows assigned to it so far)``. After the update, the
    sum over all rows of ``Matrix`` of the squared distance to the nearest
    centre is printed.

    The global ``Cluster_centre`` array is updated in place.

    Args:
        iterations: number of iterations to run. When omitted, the
            module-level ``t`` entered by the user is used (previously the
            loop was hard-coded to 10 and ``t`` was ignored).

    Returns:
        None. Progress is reported with ``print``.
    """
    global Cluster_centre,Matrix,k
    if iterations is None:
        iterations = t

    # Per-centre count of assigned rows; drives the 1/count learning rate.
    counts = np.zeros(Cluster_centre.shape[0])
    n_rows = Matrix.shape[0]

    # NOTE(review): the sampling probabilities are computed once from the
    # centres on entry and are not refreshed as the centres move - confirm
    # that this is the intended behaviour.
    prob = computeProb(Cluster_centre,Matrix)

    for i in range(iterations):
        index = np.random.choice(n_rows,batch_size,replace=False,p=prob)
        M = Matrix[index,:]

        # Assignment step: nearest centre for each sampled row. Strict '<'
        # keeps the lowest-numbered centre on ties.
        nearest = np.zeros(M.shape[0], dtype=int)
        best_dist = np.full(M.shape[0], np.inf)
        for cc in range(k):
            offset = M - Cluster_centre[cc]
            sq_dist = np.sum(offset * offset, axis=1)
            closer = sq_dist < best_dist
            nearest[closer] = cc
            best_dist[closer] = sq_dist[closer]

        # Update step: sequential on purpose, since each row changes the
        # learning rate seen by the next row assigned to the same centre.
        for row, cc in zip(M, nearest):
            counts[cc] += 1
            rate = 1 / counts[cc]
            Cluster_centre[cc] = Cluster_centre[cc] * (1 - rate) + row * rate

        # Cost over the full data set with the updated centres.
        nearest_sq = np.full(n_rows, np.inf)
        for centre in Cluster_centre:
            offset = Matrix - centre
            np.minimum(nearest_sq, np.sum(offset * offset, axis=1), out=nearest_sq)
        totalCost_modified_mini_batch = nearest_sq.sum()

        print("Iteration",i)
        print("Total Cost----",totalCost_modified_mini_batch)

In [9]:
def computeProb(CC,Data):
    """Return sampling probabilities proportional to squared distance to the nearest centre.

    For every row of ``Data`` the squared Euclidean distance to its closest
    row of ``CC`` is computed; the result is those distances divided by
    their sum, so the entries add up to 1.

    Args:
        CC: 2-D array of cluster centres, one centre per row.
        Data: 2-D array of points, one point per row, with the same number
            of columns as ``CC``.

    Returns:
        1-D array of length ``Data.shape[0]``. If every point coincides with
        a centre (total cost 0) a uniform distribution is returned instead
        of the NaN values a 0/0 division would produce, so the result stays
        usable as ``p`` for ``np.random.choice``.
    """
    n_points = Data.shape[0]
    if n_points == 0:
        return np.zeros(0)

    # Running minimum over centres keeps memory at O(points * columns).
    nearest_sq = np.full(n_points, np.inf)
    for centre in CC:
        offset = Data - centre
        np.minimum(nearest_sq, np.sum(offset * offset, axis=1), out=nearest_sq)

    total_cost = nearest_sq.sum()
    if total_cost == 0:
        # No point is farther from a centre than any other: sample uniformly.
        return np.full(n_points, 1.0 / n_points)
    return nearest_sq / total_cost

### Data Near Uniform: GA

In [10]:
# Build the GA data set and stack the response as an extra last column, so
# clustering runs on the rows of [A | b].
A,b = GA(n,d)
Matrix = np.c_[A,b]
# Pick k distinct rows as initial centres. replace=False is required here:
# np.random.choice samples with replacement by default, which can select the
# same row twice and start with duplicate centres.
indexes = np.random.choice(Matrix.shape[0],k,replace=False)
# Indexing with an index array yields a copy, so updating the centres in
# place does not modify Matrix.
Cluster_centre = Matrix[indexes,:]

In [11]:
# Plain mini-batch k-means on the GA data; updates Cluster_centre in place
# and prints the total cost after every iteration.
mini_batch()

Iteration 0
Total Cost---- 343588.9394863283
Iteration 1
Total Cost---- 313596.2467425782
Iteration 2
Total Cost---- 296297.40185575275
Iteration 3
Total Cost---- 283502.03865349694
Iteration 4
Total Cost---- 277689.6736670269
Iteration 5
Total Cost---- 273946.1467253812
Iteration 6
Total Cost---- 271051.5043327612
Iteration 7
Total Cost---- 268566.24246084224
Iteration 8
Total Cost---- 264400.6908516976
Iteration 9
Total Cost---- 264055.787280249


In [12]:
# Distance-weighted mini-batch k-means on the GA data.
# NOTE(review): mini_batch() assigns into Cluster_centre in place, so when the
# cells run in order this call starts from those already-updated centres
# rather than from the freshly sampled ones - confirm this is intended.
modified_MB()

Iteration 0
Total Cost---- 335501.99659586645
Iteration 1
Total Cost---- 305912.5924311203
Iteration 2
Total Cost---- 292039.34515037783
Iteration 3
Total Cost---- 282175.7262333858
Iteration 4
Total Cost---- 276864.6886378493
Iteration 5
Total Cost---- 271155.10921418783
Iteration 6
Total Cost---- 268863.7053346334
Iteration 7
Total Cost---- 266324.6095860981
Iteration 8
Total Cost---- 262912.1099116412
Iteration 9
Total Cost---- 261697.96633287575


### Data Moderately Uniform

In [13]:
# Build the T3 data set and stack the response as an extra last column, so
# clustering runs on the rows of [A | b].
A,b = T3(n,d)
Matrix = np.c_[A,b]
# Pick k distinct rows as initial centres. replace=False is required here:
# np.random.choice samples with replacement by default, which can select the
# same row twice and start with duplicate centres.
indexes = np.random.choice(Matrix.shape[0],k,replace=False)
# Indexing with an index array yields a copy, so updating the centres in
# place does not modify Matrix.
Cluster_centre = Matrix[indexes,:]

In [14]:
# Plain mini-batch k-means on the T3 data; updates Cluster_centre in place
# and prints the total cost after every iteration.
mini_batch()

Iteration 0
Total Cost---- 1601173.3263358143
Iteration 1
Total Cost---- 1537109.2402090544
Iteration 2
Total Cost---- 1510503.324110093
Iteration 3
Total Cost---- 1379258.2554521752
Iteration 4
Total Cost---- 1368540.503347455
Iteration 5
Total Cost---- 1307832.047341842
Iteration 6
Total Cost---- 1305440.6482441593
Iteration 7
Total Cost---- 1261757.2792936054
Iteration 8
Total Cost---- 1256401.84173533
Iteration 9
Total Cost---- 1254930.3505263075


In [15]:
# Distance-weighted mini-batch k-means on the T3 data.
# NOTE(review): mini_batch() assigns into Cluster_centre in place, so when the
# cells run in order this call starts from those already-updated centres
# rather than from the freshly sampled ones - confirm this is intended.
modified_MB()

Iteration 0
Total Cost---- 1347414.66617503
Iteration 1
Total Cost---- 1257636.9137802008
Iteration 2
Total Cost---- 1133643.1847305747
Iteration 3
Total Cost---- 1169675.8668203435
Iteration 4
Total Cost---- 1210898.2041265373
Iteration 5
Total Cost---- 1167368.2893259248
Iteration 6
Total Cost---- 1112523.1326488748
Iteration 7
Total Cost---- 1062384.4661968246
Iteration 8
Total Cost---- 1053769.0515050392
Iteration 9
Total Cost---- 1009887.7230271802


### Data Very Non-Uniform

In [16]:
# Build the T1 data set and stack the response as an extra last column, so
# clustering runs on the rows of [A | b].
A,b = T1(n,d)
Matrix = np.c_[A,b]
# Pick k distinct rows as initial centres. replace=False is required here:
# np.random.choice samples with replacement by default, which can select the
# same row twice and start with duplicate centres.
indexes = np.random.choice(Matrix.shape[0],k,replace=False)
# Indexing with an index array yields a copy, so updating the centres in
# place does not modify Matrix.
Cluster_centre = Matrix[indexes,:]

In [17]:
# Plain mini-batch k-means on the T1 data; updates Cluster_centre in place
# and prints the total cost after every iteration.
mini_batch()

Iteration 0
Total Cost---- 54744240675.55991
Iteration 1
Total Cost---- 54716722255.32324
Iteration 2
Total Cost---- 54716590604.33299
Iteration 3
Total Cost---- 54712412054.69331
Iteration 4
Total Cost---- 54646671240.64488
Iteration 5
Total Cost---- 54646223155.11331
Iteration 6
Total Cost---- 54645597694.54866
Iteration 7
Total Cost---- 53544495658.775345
Iteration 8
Total Cost---- 53543742420.78437
Iteration 9
Total Cost---- 53543748102.16358


In [18]:
# Distance-weighted mini-batch k-means on the T1 data.
# NOTE(review): mini_batch() assigns into Cluster_centre in place, so when the
# cells run in order this call starts from those already-updated centres
# rather than from the freshly sampled ones - confirm this is intended.
modified_MB()

Iteration 0
Total Cost---- 28223615618.19861
Iteration 1
Total Cost---- 23719946847.087116
Iteration 2
Total Cost---- 22229734667.199688
Iteration 3
Total Cost---- 21581166728.90958
Iteration 4
Total Cost---- 21223389293.24581
Iteration 5
Total Cost---- 20833905022.297077
Iteration 6
Total Cost---- 20619883290.74041
Iteration 7
Total Cost---- 20459060387.54444
Iteration 8
Total Cost---- 20354173538.964417
Iteration 9
Total Cost---- 20277616034.985195
