# Exercise 2
### Deep neural networks with Keras

Look at the data in the file “secretkeys_exe.csv”, which is placed in the Google folder.
1. Assuming that there is an invariance of the results with respect to shift of the digits
in a data sample (e.g. 1234567 and 7123456 give the same result y=0 or y=1), try
to improve the accuracy of the model over the validation data set by “augmenting”
the data. In practice, for every sample there are L-1 equivalent ones.
2. Implement a “grid search” as shown in NB11 to improve one or more of the aspects
or parameters of the model. Possible tests include: different activation units
(sigmoid, relu, elu, etc.), different minimization algorithms (adam, rmsprop, sgd with
momentum, etc.), different dropouts, etc.
3. See if any rescaling of data (x: the sequences of 0 and 1 used to feed the network)
may improve the results. 

In [89]:
import math
import csv
import numpy as np
import numpy.random as npr

from keras.models import Sequential
from keras.layers import Dense, Dropout

## Reading the data

In [23]:
# Load the comma-separated dataset as a table of integers
fname = 'secretkeys_exe.csv'
data = np.loadtxt(fname, delimiter=',', dtype=int)

# Alphabet size: the keys are built from the digits 1...9
D = 9
# Number of digits per key, measured on the first key of the file
L = len(str(data[0, 0]))
print(f'Length of each string: {L}')
# Number of samples (rows) in the file
N = data.shape[0]
print(f'Length of the dataset: {N} \n')

# First column holds the numeric keys, last column holds the labels
s, y = data[:, 0], data[:, -1]

# Print the first ten (key, label) pairs as a sanity check
for row in range(10):
    print(s[row], y[row])

Length of each string: 7
Length of the dataset: 3000 

4573627 0
9393629 0
9334246 0
1861971 0
5527227 1
9614284 0
5793135 0
6265761 1
7731649 1
1862956 1


## Augmentation

In [81]:
# Function that returns all the permutations of the given number of L = 7 digits

def cyclic(n, n_digits=None):
    """Return the distinct cyclic rotations of the decimal digits of ``n``.

    Each step moves the last digit to the front. The first entry is ``n``
    itself, and the loop stops as soon as the rotation comes back to ``n``,
    so a number with a repeating pattern (e.g. 1111111 or 1212) yields
    fewer than ``n_digits`` entries.

    Parameters
    ----------
    n : int
        Non-negative number to rotate; it must fit in ``n_digits`` digits.
    n_digits : int, optional
        Width of the rotation window. Defaults to the module-level ``L``.

    Returns
    -------
    numpy.ndarray
        Integer array holding the rotations, starting with ``n``.

    Raises
    ------
    ValueError
        If ``n`` is negative or has more than ``n_digits`` digits; in those
        cases the rotation never returns to ``n`` and the loop would not end.
    """
    if n_digits is None:
        n_digits = L  # key length shared by the whole notebook

    start = int(n)  # plain Python int: exact arithmetic, no float round-off
    if not 0 <= start < 10 ** n_digits:
        raise ValueError(
            f'{n} is not a non-negative number of at most {n_digits} digits'
        )

    front_weight = 10 ** (n_digits - 1)  # place value of the leading digit
    rotations = []
    num = start
    while True:
        rotations.append(num)

        # Move the last digit to the front using integer arithmetic only
        rest, last = divmod(num, 10)
        num = front_weight * last + rest

        # Back at the starting number: every rotation has been collected
        if num == start:
            break
    return np.array(rotations, dtype=int)
        
# Quick check of cyclic on a 7-digit example:
cyclic(1234567)

# Reference for the cyclic function, from which part of the algorithm was taken: https://www.geeksforgeeks.org/generate-cyclic-permutations-number/

array([1234567, 7123456, 6712345, 5671234, 4567123, 3456712, 2345671])

In [82]:
# Build the augmented dataset: one row per original key, one column per rotation

n_keys = len(s)
aug_s = np.empty((n_keys, L), dtype=int)  # row i: the rotations of s[i]
aug_y = np.empty((n_keys, L), dtype=int)  # row i: label y[i] repeated for each rotation
for rotations, labels, key, label in zip(aug_s, aug_y, s, y):
    # The rows are views, so writing through them fills aug_s / aug_y in place
    rotations[:] = cyclic(key)
    labels[:] = label

# Sanity check on the first 5 rows
print(aug_s[:5])
print(aug_y[:5], '\n')

# Flatten both tables so that every rotation becomes a sample of its own
aug_s = aug_s.reshape(-1)
aug_y = aug_y.reshape(-1)

# Sanity check on the first 30 flattened entries
print(aug_s[:30])
print(aug_y[:30], '\n')

[[4573627 7457362 2745736 6274573 3627457 7362745 5736274]
 [9393629 9939362 2993936 6299393 3629939 9362993 3936299]
 [9334246 6933424 4693342 2469334 4246933 3424693 3342469]
 [1861971 1186197 7118619 9711861 1971186 6197118 8619711]
 [5527227 7552722 2755272 2275527 7227552 2722755 5272275]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1]] 

[4573627 7457362 2745736 6274573 3627457 7362745 5736274 9393629 9939362
 2993936 6299393 3629939 9362993 3936299 9334246 6933424 4693342 2469334
 4246933 3424693 3342469 1861971 1186197 7118619 9711861 1971186 6197118
 8619711 5527227 7552722]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1] 



In [91]:
# Pair every augmented key with its label in a two-column table, so that
# shuffling the rows keeps each key attached to its own label
aug_data = np.empty((len(s) * L, 2), dtype=int)
aug_data[:, 0] = aug_s
aug_data[:, 1] = aug_y

# Shuffle in place: only the order of the rows changes, which is what we want
npr.shuffle(aug_data)
print(aug_data)

[[7677348       0]
 [5821559       0]
 [3869821       0]
 ...
 [9414168       0]
 [1521389       0]
 [8491721       0]]


In [112]:
# Split the shuffled table back into keys (first column) and labels (second column)
s, y = aug_data[:, 0], aug_data[:, 1]

### Using Categories: expand function

In [105]:
# Encoding the keys as categories with the function "expand", as in lecture 2 by Baiesi.
# It is the same function as Baiesi's, except that it returns a np.array.
# Each digit becomes a block of D entries containing a single 1, e.g. with D = 9:
# 1 -----> 100000000, 2 -----> 010000000, etc...
LD = L*D # true dimension of a single data sample that is fed to the NN

def expand(S, n_digits=None, n_symbols=None):
    """One-hot encode the decimal digits of ``S``.

    The digit ``q`` found at position ``j`` (counted from the left, starting
    at 0) sets entry ``j * n_symbols + (q - 1)`` to 1; every other entry is 0.

    Parameters
    ----------
    S : int
        Number to encode.
    n_digits : int, optional
        Expected number of digits of ``S``. Defaults to the module-level ``L``.
    n_symbols : int, optional
        Number of admissible digit values (1..n_symbols). Defaults to the
        module-level ``D``.

    Returns
    -------
    numpy.ndarray or list
        Integer array of length ``n_digits * n_symbols``. If ``S`` does not
        have exactly ``n_digits`` characters, 'Mismatch!' is printed and an
        empty list is returned.

    Raises
    ------
    ValueError
        If a character of ``S`` is not a digit in 1..n_symbols. Such a value
        has no slot of its own and would otherwise overwrite a neighbouring
        one.
    """
    if n_digits is None:
        n_digits = L
    if n_symbols is None:
        n_symbols = D

    digits = str(S)
    # Wrong length: report it and hand back an empty result
    if len(digits) != n_digits:
        print('Mismatch!')
        return []

    x = np.zeros(n_digits * n_symbols, dtype=int)
    # Reading the decimal string avoids float division on the number itself
    for j, ch in enumerate(digits):
        if not ch.isdigit() or not 1 <= int(ch) <= n_symbols:
            raise ValueError(
                f'{ch!r} in {S} is not a digit between 1 and {n_symbols}'
            )
        x[j * n_symbols + (int(ch) - 1)] = 1
    return x

# Show the encoding produced by expand for the first key in s
print(s[0], '---->', expand(s[0]))

7677348 ----> [0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0]


## Split training and test data

In [120]:
# Size of the augmented dataset
N = len(s)
print(f'Length of the augmented dataset: {N} \n')

# Share of the samples used for training; the remainder serves as a single
# held-out set (validation and test are merged here)
perc_train = 0.8
N_train = int(perc_train * N)
print(f'data: {N} \n#train: {N_train} \n')

# Encode every key with expand, one row of the (N, L*D) float matrix per key
x_all = np.empty((N, L*D))
for encoded_row, key in zip(x_all, s):
    # Each row is a view, so this writes straight into x_all
    encoded_row[:] = expand(key)
print('x_all[0] ---> ',x_all[0], '\n')

# NOTE(review): the rows were shuffled after the augmentation, so rotations of
# the same original key can land on both sides of this split -- confirm that
# this is acceptable for the validation score.
x_train, x_test = x_all[:N_train], x_all[N_train:]
y_train, y_test = y[:N_train], y[N_train:]

print('#train = ',len (x_train), '\n#test =', len(x_test))

Length of the augmented dataset: 21000 

data: 21000 
#train: 16800 

x_all[0] --->  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] 

#train =  16800 
#test = 4200
