In [1]:
# Implementing an Echo State Network for Artificial Grammar Learning (Reber grammar)

# Importing required modules

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import linalg
from ipywidgets import *
from IPython.display import *

# Generating Reber Strings

import random as rnd

class ReberGrammarLexicon(object):
    """Set of distinct random words generated by walking the Reber grammar graph.

    Every word starts with 'B' and ends with 'E'. Words are produced by a
    random walk over `graph`; walks that would exceed `maxSize` symbols are
    discarded.

    Attributes:
        lexicon: set of generated words, owned by the instance.
        maxSize: maximum allowed word length (including 'B' and 'E').
    """

    # Kept as a class attribute for backward compatibility only. Each instance
    # shadows it with its own set in __init__, so generators no longer share
    # (and silently inherit) each other's words.
    lexicon = set()

    # Transition table: graph[state] lists the (next_state, symbol) edges that
    # leave `state`. State 6 is terminal and is only reached by emitting 'E'.
    graph = [[(1, 'T'), (2, 'P')],
             [(1, 'S'), (3, 'X')],
             [(2, 'T'), (4, 'V')],
             [(2, 'X'), (5, 'S')],
             [(3, 'P'), (5, 'V')],
             [(6, 'E')]]

    def __init__(self, num, maxSize = 1000):
        """Fill the lexicon with `num` distinct words of at most `maxSize` symbols.

        Args:
            num: number of distinct words to collect. The loop only stops once
                that many distinct words exist, so `num` must not exceed the
                number of distinct words that fit in `maxSize` symbols.
            maxSize: maximum word length; must be at least 5.

        Raises:
            NameError: if `maxSize` is smaller than 5.
        """
        self.maxSize = maxSize

        if maxSize < 5:
            raise NameError('maxSize too small, require maxSize > 4')

        # Per-instance storage (see the note on the class attribute above).
        self.lexicon = set()

        while len(self.lexicon) < num:
            word = self.generateWord()
            if word is not None:
                self.lexicon.add(word)

    def generateWord(self):
        """Generate one word by a random walk, or None if it grows too long.

        Returns:
            The generated word (str), or None when the walk did not reach the
            terminal state within `maxSize` symbols.
        """
        c = 2  # length the word will have after the next symbol is appended
        currentEdge = 0
        word = 'B'

        while c <= self.maxSize:

            # In states 3 and 4, while the word is still shorter than
            # maxSize/9, always take the first edge (the one leading back into
            # the graph) instead of drawing at random.
            if currentEdge in (3, 4) and c < (self.maxSize / 9):
                inc = 0
            else:
                inc = rnd.randint(0, len(self.graph[currentEdge]) - 1)

            nextEdge, symbol = self.graph[currentEdge][inc]
            word += symbol
            currentEdge = nextEdge
            if currentEdge == 6:
                return word
            c += 1

        # The loop ran out of room before reaching the terminal state.
        return None

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Alphabet of the Reber grammar: begin/end markers plus the five inner symbols.
data = ['B', 'E', 'P', 'S', 'T', 'V', 'X']
values = np.array(data)

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

# binary encode
# The dense-output switch was renamed from `sparse` to `sparse_output` in newer
# scikit-learn releases and the old name was later removed; try the new
# keyword first and fall back to the old one so both generations work.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# NOTE: despite its name, `onehot_encoded` is the *fitted encoder*, not encoded
# data; the name is kept because the rest of the notebook refers to it.
onehot_encoded = onehot_encoder.fit(integer_encoded)

def func(word,start=0):
    """One-hot encode the symbols of `word`, beginning at index `start`.

    Args:
        word: string made of symbols known to `label_encoder`.
        start: index of the first symbol to encode (default 0, i.e. all).

    Returns:
        The output of the fitted one-hot encoder for the selected symbols,
        one row per symbol.
    """
    symbols = np.array(list(word[start:]))
    # Symbol -> integer code -> column vector expected by the one-hot encoder.
    codes = label_encoder.transform(symbols)
    code_column = codes.reshape(-1, 1)
    return onehot_encoded.transform(code_column)

def preprocessing(dictionary,start=0):
    """One-hot encode every word of `dictionary` and stack the rows in order.

    Args:
        dictionary: iterable of words (strings over the grammar alphabet).
        start: index of the first symbol of each word to encode. It is
            forwarded to `func`; with the default 0 every symbol is encoded.
            Words must be longer than `start`.

    Returns:
        2-D array with one one-hot row per encoded symbol, words concatenated
        in iteration order. An empty `dictionary` yields an array with zero
        rows and one column per known symbol.
    """
    words = list(dictionary)
    if not words:
        return np.empty((0, len(label_encoder.classes_)))
    # Encode each word once and stack a single time, instead of re-copying the
    # growing result on every iteration.
    return np.vstack([func(word, start) for word in words])

# Build a lexicon of 2000 distinct Reber words, each at most 20 symbols long.
generator = ReberGrammarLexicon(2000,maxSize=20)

# Bare expression: it is only rendered when it ends a notebook cell.
generator.lexicon

# One-hot encode the words and stack them into one long symbol sequence.
# NOTE(review): `lexicon` is a set, so the word order (and therefore the
# sequence) presumably differs between kernel sessions - confirm if
# reproducibility matters.
training_set = preprocessing(list(generator.lexicon)[:2000])

training_set.shape

# Setting random seed

def set_seed(seed=None):
    """Seed NumPy's global random generator and return the seed that was used.

    Only `np.random` is seeded; Python's own `random` module is not touched.

    Args:
        seed: value passed to `np.random.seed`. When None, a seed is derived
            from the current time in microseconds and printed.

    Returns:
        The seed value (the given one, or the time-derived one).
    """
    if seed is None:
        import time
        # Fold the microsecond clock into the range accepted by np.random.seed.
        seed = int((time.time()*10**6) % 4294967295)
        print(seed)
    try:
        np.random.seed(seed)
        print("Seed used for random values:", seed)
    except (TypeError, ValueError):
        # An unusable seed is reported but not fatal. Only the errors NumPy
        # raises for bad seeds are caught, so KeyboardInterrupt and other
        # unrelated exceptions still propagate.
        print("!!! WARNING !!!: Seed was not set correctly.")
    return seed

# Creating Network Class

class Network(object):
    """Hyper-parameters, data and (later) state of the echo state network.

    The instance is a plain attribute container: the pipeline functions below
    read these settings and attach the weights, the collected states and the
    results to the same object.
    """

    def __init__(self, trainLen=2000, testLen=2000, initLen=100, data=None, seed=None) :
        """Store the settings and seed NumPy's random generator.

        Args:
            trainLen: number of time steps fed to the reservoir for training.
            testLen: number of time steps run in the test phase.
            initLen: number of initial training steps whose states are not
                collected.
            data: input sequence, one row per time step. Defaults to the
                notebook-level `training_set`.
            seed: seed handed to `set_seed`; None draws a time-based seed.
        """
        self.initLen = initLen
        self.trainLen = trainLen
        self.testLen = testLen
        self.data = training_set if data is None else data
        self.inSize = self.outSize = 7 #Input/Output dimensions
        self.resSize = 400 #Reservoir size (prediction)
        #self.resSize = 1000 #Reservoir size (generation)
        self.a = 1 #Leak rate alpha
        self.spectral_radius = 1.25 #Spectral radius
        self.input_scaling = 1. #Input scaling
        self.reg =  1e-8 #None #Regularization factor - if None,
        #we'd use pseudo-inverse rather than ridge regression

        self.mode = 'prediction'
        #self.mode = 'generative'

        #Change the seed, reservoir performances should be averaged across
        #at least 20 random instances (with the same set of parameters)
        #The seed actually used is kept so that a run can be reproduced.
        self.seed = set_seed(seed)

nw = Network()

# Generating Win,W randomly and then generating X,Ytarget 

from scipy.sparse import rand

def initialization(nw) :
    """Create the random weights and allocate the training matrices on `nw`.

    Sets on `nw`:
        Win: input weights, shape (resSize, 1+inSize), multiplied by
            `input_scaling`.
        W: recurrent reservoir weights, shape (resSize, resSize).
        X: zero-filled matrix of collected states,
            shape (1+inSize+resSize, trainLen-initLen).
        Ytarget: view of the data shifted by one step,
            shape (1, trainLen-initLen, number of data columns).
        x: zero reservoir state, shape (resSize, 1).

    Args:
        nw: network object providing resSize, inSize, trainLen, initLen,
            input_scaling and data.

    Returns:
        The same `nw` object.

    Raises:
        ValueError: if `nw.data` has fewer than trainLen+1 rows, because the
            one-step-ahead targets would otherwise be silently truncated.
    """
    if len(nw.data) < nw.trainLen + 1:
        raise ValueError(
            "data has %d rows but trainLen=%d needs at least %d"
            % (len(nw.data), nw.trainLen, nw.trainLen + 1))

    #Weights: sparse (25% non-zero) matrices with uniformly distributed values.
    #NOTE(review): both draws use the fixed random_state=42, so the weights do
    #not depend on the seed set through set_seed - confirm this is intended.
    nw.Win = np.array(rand(nw.resSize,1+nw.inSize, density=0.25, format="csr", random_state=42).todense()) * nw.input_scaling
    nw.W = np.array(rand(nw.resSize,nw.resSize, density=0.25, format="csr", random_state=42).todense())

    #Matrices
    #Allocated memory for the design (collected states) matrix
    nw.X = np.zeros((1+nw.inSize+nw.resSize,nw.trainLen-nw.initLen))
    #Set the corresponding target matrix directly: the target of step t is the
    #input of step t+1
    nw.Ytarget = nw.data[None,nw.initLen+1:nw.trainLen+1]

    #Initial reservoir state, updated while the data is fed in
    nw.x = np.zeros((nw.resSize,1))

    return nw

# Computing the spectral radius (largest absolute eigenvalue of W) and rescaling W to the requested spectral radius

def compute_spectral_radius(nw):
    """Rescale `nw.W` in place so its spectral radius equals `nw.spectral_radius`.

    Args:
        nw: network object providing the square matrix `W` and the target
            `spectral_radius`.

    Returns:
        The same `nw` object.

    Raises:
        ValueError: if the current spectral radius of `W` is zero, in which
            case no rescaling can reach the target.
    """
    print('Computing spectral radius...',end=" ")
    # Only the eigenvalues are needed, so skip computing the eigenvectors.
    rhoW = np.max(np.abs(linalg.eigvals(nw.W)))
    print('Done.')
    if rhoW == 0:
        raise ValueError("W has spectral radius 0 and cannot be rescaled")
    nw.W *= nw.spectral_radius / rhoW

    return nw

# Learning phase

# x_n = (1 − α)·x_{n−1} + α·tanh(W_in·[1; u_n] + W·x_{n−1})

def learning_phase(nw) :
    """Drive the reservoir with the training data and collect its states.

    For each of the first `trainLen` rows of `nw.data` the leaky-tanh update
    is applied to `nw.x`. Once `initLen` steps have passed, the column
    [1; input; state] is written into `nw.X`.

    Args:
        nw: network object providing data, trainLen, initLen, inSize, resSize,
            a (leak rate), Win, W, the current state x and the preallocated X.

    Returns:
        The same `nw` object, with `x`, `u` and `X` updated.
    """
    for t in range(nw.trainLen):
        #Input data, as a column vector sized from the network settings
        nw.u = nw.data[t]
        u_col = nw.u.reshape(nw.inSize, 1)

        pre_activation = np.dot(nw.Win, np.vstack((1, u_col))) + np.dot(nw.W, nw.x)
        nw.x = (1-nw.a)*nw.x + nw.a*np.tanh(pre_activation)

        #After the initialization, we start modifying X
        if t >= nw.initLen:
            nw.X[:,t-nw.initLen] = np.vstack((1, u_col, nw.x.reshape(nw.resSize,1)))[:,0]

    return nw

# Training output weights using ridge regression
# 𝑊𝑜𝑢𝑡 = (𝑌𝑡.𝑋𝑇) . (𝑋.𝑋𝑇+𝑟𝑒𝑔.𝐼)^-1

def train_output(nw) :
    """Fit the linear read-out `nw.Wout` from the collected states.

    With `nw.reg` set, ridge regression is used:
    Wout = (Y.X^T).(X.X^T + reg.I)^-1. With `nw.reg` None, the pseudo-inverse
    of X is used instead.

    Args:
        nw: network object providing X (features x time), Ytarget
            (1 x time x outputs), reg, inSize and resSize.

    Returns:
        The same `nw` object, with `X_T` and `Wout` (outputs x features) set.
    """
    nw.X_T = nw.X.T
    # Ytarget carries a leading singleton axis; both branches need the plain
    # (outputs x time) matrix.
    targets = nw.Ytarget[0].T
    if nw.reg is not None:
        # Ridge regression (linear regression with regularization)
        nw.Wout = np.dot(np.dot(targets,nw.X_T), linalg.inv(np.dot(nw.X,nw.X_T) + \
            nw.reg*np.eye(1+nw.inSize+nw.resSize) ) )
    else:
        # Pseudo-inverse
        nw.Wout = np.dot(targets, linalg.pinv(nw.X) )

    return nw

# Testing in a particular mode

def test(nw) :
    #Run the trained ESN in a generative mode. no need to initialize here, 
    #because x is initialized with training data and we continue from there.
    nw.Y = np.zeros((nw.testLen,nw.outSize))
    nw.u = nw.data[nw.trainLen]
    nw.reservoir = np.zeros((nw.testLen,nw.resSize))
    for t in range(nw.testLen):
#        if (nw.u == np.array((0, 1, 0, 0, 0, 0, 0))).all(): 
#            nw.x = np.zeros((nw.resSize,1))
#        else:
        nw.x = (1-nw.a)*nw.x + nw.a*np.tanh( np.dot(nw.Win, np.vstack((1,nw.u.reshape(7,1))) ) + np.dot( nw.W, nw.x ) )
        
        nw.reservoir[t] = nw.x.reshape(nw.resSize,)
        nw.y = np.dot(nw.Wout, np.vstack((1,nw.u.reshape(7,1),nw.x.reshape(nw.resSize,1))) )
        nw.Y[t][:] = nw.y.reshape(1,7)
        if nw.mode == 'generative':
            #Generative mode:
            nw.u = nw.y
        elif nw.mode == 'prediction':
            #Predictive mode:
            nw.u = nw.data[nw.trainLen+t+1] 
        else:
            raise(Exception, "ERROR: 'mode' was not set correctly.")
    
    return(nw)

def compute_error(nw) :
    """Print the per-output mean squared error of the first test predictions.

    Row t of `nw.Y` is compared with `nw.data[trainLen+1+t]`, the symbol that
    follows the input of test step t. At most 500 steps are used, and never
    more than `testLen` or the available data.

    Args:
        nw: network object providing data, trainLen, testLen and Y.

    Returns:
        The same `nw` object, with `mse` set to the array of per-output mean
        squared errors.

    Raises:
        ValueError: if there is no test step with a known target.
    """
    # Computing MSE for the first errorLen iterations
    errorLen = min(500, nw.testLen)
    targets = nw.data[nw.trainLen+1:nw.trainLen+errorLen+1]
    n_steps = len(targets)
    if n_steps == 0:
        raise ValueError("no test targets available to compute the error")
    # Compare step by step. The rows of Y are the time steps, so the first
    # n_steps predictions are Y[:n_steps].
    predictions = nw.Y[:n_steps]
    mse = np.sum(np.square(targets - predictions), axis=0) / n_steps
    nw.mse = mse
    print('MSE = ' + str( mse ))

    return nw

def compute_network(nw) :
    """Run the full pipeline on `nw`: initialise, rescale W, collect states,
    train the read-out, test, and report the error.

    Each stage receives the network object and returns it for the next one.

    Returns:
        The network object returned by the last stage.
    """
    stages = (
        initialization,
        compute_spectral_radius,
        learning_phase,
        train_output,
        test,
        compute_error,
    )
    for stage in stages:
        nw = stage(nw)
    return nw

# Definition of the network parameters

# Interactive controls. Integer-valued settings use integer sliders whose
# minimum keeps the network well-defined (at least one reservoir unit, one
# training step and one test step).
select_mode = ToggleButtons(description='Mode:',
    options=['prediction', 'generative'])
var1 = IntSlider(value=300, min=1, max=1000, step=1, description='resSize')
var2 = IntSlider(value=100, min=0, max=2000, step=1, description='initLen')
var3 = IntSlider(value=2000, min=1, max=30000, step=1, description='trainLen')
var4 = IntSlider(value=2000, min=1, max=8000, step=1, description='testLen')
var5 = FloatSlider(value=1.25, min=0, max=10, step=0.05, description='spectral radius')
var6 = FloatSlider(value=0.3, min=0, max=1, step=0.01, description='leak rate')
valid = Button(description='Validate')

def record_values(_) :
    """Button callback: copy the widget values onto `nw` and rerun the network.

    Args:
        _: the clicked button (unused).

    Returns:
        The global network object `nw`.
    """
    clear_output()
    init_len = int(var2.value)
    train_len = int(var3.value)
    if init_len >= train_len:
        # No state would be collected for training; leave `nw` untouched.
        print("initLen must be smaller than trainLen; nothing was computed.")
        return(nw)
    nw.mode=select_mode.value
    nw.resSize=int(var1.value)
    nw.initLen=init_len
    nw.trainLen=train_len
    nw.testLen=int(var4.value)
    nw.spectral_radius=float(var5.value)
    nw.a=float(var6.value)
    print("InitLen:", nw.initLen, "TrainLen:", nw.trainLen, "TestLen:", nw.testLen) 
    print("ResSize:", nw.resSize, "Spectral Radius:", nw.spectral_radius, "Leak Rate:", nw.a)
    compute_network(nw)
    return(nw)

display(select_mode, var1, var2, var3, var4, var5, var6, valid)

valid.on_click(record_values)

# The network is normally run by the 'Validate' button callback. On a fresh
# "Restart & Run All" nobody has clicked it yet and `nw.Y` does not exist, so
# run the pipeline once with the current settings of `nw`.
if not hasattr(nw, 'Y'):
    compute_network(nw)

y_pred = nw.Y

# Targets aligned with y_pred: the symbol following each test input.
y_test = training_set[nw.trainLen+1:nw.trainLen+nw.testLen+1]

def top_2_accuracy(y_test,y_pred):
    """Print and return the fraction of rows whose true class is in the top two.

    A row counts as a hit when the index of the largest value in `y_test`
    equals the index of the largest or second-largest value in `y_pred`.

    Args:
        y_test: 2-D array of targets, one row per sample.
        y_pred: 2-D array of scores with the same number of rows.

    Returns:
        The top-2 accuracy as a float in [0, 1].

    Raises:
        ValueError: if `y_test` is empty or the row counts differ.
    """
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    # Use the arguments' own length rather than a global network setting.
    n_samples = len(y_test)
    if n_samples == 0:
        raise ValueError("y_test is empty")
    if len(y_pred) != n_samples:
        raise ValueError("y_test and y_pred must have the same number of rows")

    true_class = y_test.argmax(axis=1)
    top_two = np.argsort(y_pred, axis=1)[:, -2:]
    hits = int((top_two == true_class[:, None]).any(axis=1).sum())
    accuracy = hits / n_samples
    print(accuracy)
    return accuracy

# Share of test steps whose true next symbol is among the two highest outputs.
top_2_accuracy(y_test,y_pred)

# The seven grammar symbols.
# NOTE(review): presumably listed in the column order of the one-hot
# encoding - confirm against the fitted label encoder.
chars='BEPSTVX'

# Preview of the first 20 targets and the matching (rounded) predictions.
y_test[:20]

y_pred[:20].round(2)

from plotly import __version__
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Set up plotly and cufflinks for rendering inside the notebook.
init_notebook_mode(connected=True)
cf.go_offline()
from sklearn.decomposition import PCA

# Project the reservoir states visited during the test run onto their first
# ten principal components.
pca = PCA(n_components=10).fit(nw.reservoir)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# The component index must have as many entries as there are fitted
# components; sizing it by the reservoir instead makes the two columns
# unequal in length.
df = pd.DataFrame({'Number of Components':list(range(1,len(cumulative_variance)+1)),'Cummulative Variance Explained':list(cumulative_variance)})

# `iplot` draws its own chart, so no matplotlib figure is opened beforehand.
df.iplot(kind='scatter',x='Number of Components',y='Cummulative Variance Explained')

cumulative_variance[:20]

reduced_internal_representations = pca.transform(nw.reservoir)

reduced_internal_representations[0].round(2)

# The generated Reber words; this is a reference to the set, not a copy.
corpus = generator.lexicon

def label_generator(corpus):
    """List every non-empty prefix of every word, in corpus order.

    For each word the prefixes are emitted from shortest to longest, so the
    result has one entry per symbol of the corpus.

    Args:
        corpus: iterable of words (strings).

    Returns:
        List of prefix strings.
    """
    return [
        word[:end]
        for word in corpus
        for end in range(1, len(word) + 1)
    ]

# One prefix label per symbol of the corpus.
labels = label_generator(corpus)
# Labels for the test window, using the same slice as the test targets.
# NOTE(review): the reservoir state of test step t is computed from input row
# trainLen+t, while this slice starts at trainLen+1, so each state is labelled
# with a prefix that already contains the next symbol - confirm whether the
# one-step shift is intended.
test_labels = labels[nw.trainLen+1:nw.trainLen+nw.testLen+1]

len(test_labels)

import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import sklearn.metrics as sm

# Ward-linkage dendrogram of the first 40 PCA-reduced test states, each leaf
# labelled with its entry of test_labels.
plt.figure(figsize=(12,12))
dendrogram = sch.dendrogram(sch.linkage(reduced_internal_representations[:40],method='ward'),orientation='right',labels=test_labels[:40])

# Shape of the trained read-out matrix.
nw.Wout.shape

from sklearn.decomposition import TruncatedSVD

# Truncated SVD of the read-out weights, keeping five components.
svd = TruncatedSVD(n_components=5)

# Rows of nw.Wout projected onto the five components.
lsa = svd.fit_transform(nw.Wout)

lsa

print(svd.explained_variance_ratio_)

print(svd.singular_values_)

from numpy.linalg import svd as SVD

# Reduced singular value decomposition of the read-out weights.
U, S, VT = SVD(nw.Wout,full_matrices=False)

print("Left Singular Vectors:")
print(U)
print()
print("Singular Values:") 
print(np.diag(S))
print()
print("Right Singular Vectors:") 
print(VT)

# Print the product of the three factors; it should reproduce nw.Wout
# (compare by eye - nothing is asserted here).
# @ is used for matrix multiplication in Py3, use np.matmul with Py2
print(U @ np.diag(S) @ VT)

# Symbols followed by the first 20 rounded predictions.
chars='BEPSTVX'
print(list(chars))
print(np.round(y_pred,decimals=2)[:20])

y_test[:20]

# Graph 1: Plotting neurons activations (total)

# The default number of steps is capped by the number of collected states so
# that the slider's initial value never exceeds its maximum.
var10 = IntSlider(value=min(2000, nw.trainLen-nw.initLen),min=10,max=nw.trainLen-nw.initLen,step=10,description='time steps')
var11 = IntSlider(value=10, min=1, max=nw.resSize, step=1, description='number of neurons')
valid = Button(description='Validate')

def trace_graph3(_) :
    """Button callback: plot the activations of the first reservoir neurons.

    Args:
        _: the clicked button (unused).
    """
    clear_output()
    f=int(var10.value)
    nb=int(var11.value)
    # Rows of nw.X are [bias, inSize input rows, resSize reservoir rows], so
    # the reservoir neurons start after the bias and the inputs.
    first_neuron = 1 + nw.inSize
    plt.figure(figsize=(10,7))
    plt.plot( nw.X[first_neuron:first_neuron+nb,0:f].T )
    print(nw.X.shape)
    plt.ylim([-1.1,1.1])
    plt.title(r'Activations $\mathbf{x}(n)$ from Reservoir Neurons ID 0 to '+str(nb-1)+' for '+str(f)+' time steps')
    
valid.on_click(trace_graph3)
    
display(var10)
display(var11)
display(valid)

# Graph 2: Plotting single neuron activation

# The default number of steps is capped by the number of collected states so
# that the slider's initial value never exceeds its maximum.
var12 = IntSlider(value=min(2000, nw.trainLen-nw.initLen),min=10,max=nw.trainLen-nw.initLen,step=10,description='time steps')
var13 = IntSlider(value=2, min=0, max=nw.resSize-1, step=1, description='neuron ID')
valid = Button(description='Validate')

def trace_graph4(_) :
    """Button callback: plot the activation of one reservoir neuron over time.

    Args:
        _: the clicked button (unused).
    """
    clear_output()
    f=int(var12.value)
    num=int(var13.value)
    # Rows of nw.X are [bias, inSize input rows, resSize reservoir rows], so
    # neuron `num` sits after the bias and the inputs.
    neuron_row = 1 + nw.inSize + num
    plt.figure(figsize=(10,5))
    plt.plot( nw.X[neuron_row,:f].T )
    plt.ylim([-1.1,1.1])
    plt.title(r'Activations $\mathbf{x}(n)$ from Reservoir Neuron ID '+str(num)+' for '+str(f)+' time steps')

valid.on_click(trace_graph4)

display(var12)
display(var13)
display(valid)

# Graph 3: Output weights at the end of the simulation

valid = Button(description='Show')

def trace_graph5(_) :
    """Button callback: show the trained output weights, one panel per output.

    `nw.Wout` has one row per output unit, so a single bar series cannot hold
    it; each row gets its own bar chart on a shared x axis.

    Args:
        _: the clicked button (unused).
    """
    clear_output()
    weights = np.atleast_2d(nw.Wout)
    n_outputs, n_features = weights.shape
    fig, axes = plt.subplots(n_outputs, 1, figsize=(12,7), sharex=True, squeeze=False)
    positions = range(n_features)
    for k, ax in enumerate(axes[:, 0]):
        ax.bar(positions, weights[k])
        ax.set_ylabel('output ' + str(k))
    axes[0, 0].set_title(r'Output weights $\mathbf{W}^{out}$')
    axes[-1, 0].set_xlabel('feature index (bias, inputs, reservoir)')

valid.on_click(trace_graph5)

display(valid)