In [1]:
import keras
import numpy as np
import tensorflow as tf
from tqdm.auto import tqdm, trange


# Prepare data

In [2]:
import unicodedata
import regex as re

def normalize_text(text):
    clean_text=text.lower()
    clean_text=re.sub(r"[^a-zA-Z0-9\säåö]"," ",clean_text)
    #clean_text=re.sub(r"[^\S\n]+"," ",clean_text)
    clean_text=re.sub(r"\s+"," ",clean_text)
    clean_text=re.sub(r"^\s","",clean_text)
    clean_text=re.sub(r"\s$","",clean_text)
    return clean_text

normalize_text("äåöjkjh jhjk j1hkj"),normalize_text("jFjh.jhjk,.,.ef jhkj "),normalize_text(" jkjh jhjk jhkj")

('äåöjkjh jhjk j1hkj', 'jfjh jhjk ef jhkj', 'jkjh jhjk jhkj')

In [None]:
import csv
finnish_company_names=[]
with open('fullprhdata.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=';')
    for row in csv_reader:
        finnish_company_names.append(row[0])
json.dump(finnish_company_names,open("finnish_registry_companies.json","w+"))

In [21]:
import json 
import random
finnish_companies = json.load(open("finnish_registry_companies.json","r+"))

Tx=13

# finnish 
finnish_names_arr=[normalize_text(company) for company in finnish_companies if len(company)<Tx]
random.shuffle(finnish_names_arr)
finnish_names_text="".join(finnish_names_arr)

characters=sorted(list(set(finnish_names_text)))
characters=[".","E"]+characters # . is none, E is end of line
char_indices = dict((c, i) for i, c in enumerate(characters))
indices_char = dict((i, c) for i, c in enumerate(characters))

print("\n".join(random.choices(finnish_names_arr,k=5)))
print()
print(f"Found names:", len(finnish_names_arr))
print("Characters:","".join(characters))
print("Chars: ",len(characters))
print("Tx: ",Tx)

provendix oy
trakai oy
zerochaos oy
sanmet oy
oy el ho ab

Found names: 72385
Characters: .E 0123456789abcdefghijklmnopqrstuvwxyzäåö
Chars:  42
Tx:  13


In [147]:
def str_to_vec(word, start_with_null=False):
    """
    Converts word to vec
    
    word -- string
    
    returns array of shape (Tx, len(chars))
    """
    if start_with_null:
        word="."+word
    x = np.zeros((len(word), len(characters)), dtype=np.bool)
    for t, char in enumerate(word):
        x[t, char_indices[char]] = 1
    return x
    
def vec_to_str(vec, debug=False):
    """
    Converts vec to word
    
    vec -- array of shape (Tx, len(chars))
    
    """
    word=""
    for i in range(vec.shape[0]):
        word+=indices_char[np.argmax(vec[i])]
    
    if debug==False:
        word=word.replace("E","").replace(".","")
    return word

a=str_to_vec("hello world",start_with_null=True)
print(vec_to_str(a))

hello world


In [27]:
def vectorization(words, n_x, Tx=None):
    """
    Convert X and Y (lists) into arrays to be given to a recurrent neural network.
    
    Arguments:
    X -- 
    Y -- 
    Tx -- integer, sequence length
    
    Returns:
    x -- array of shape (m, Tx, len(chars))
    y -- array of shape (m, Tx, len(chars))
    """
    if Tx is None:
        Tx=len(max(words,key=len))
        print(Tx)
    
    m = len(words)
    x = np.zeros((m, Tx, n_x), dtype=np.bool)
    y = np.zeros((m, Tx, n_x), dtype=np.bool)
    
    for w, word in enumerate(tqdm(words)):
        word=word[:Tx]
        x[w, 0:len(word)+1, :] = str_to_vec(word,start_with_null=True)
        x[w, len(word)+1:, char_indices["E"]] = 1
        
        y[w, 0:len(word),:] = str_to_vec(word)
        y[w, len(word):, char_indices["E"]] = 1
        
    return x, y 

X,Y=vectorization(finnish_names_arr, n_x=len(characters), Tx=Tx)
X.shape, Y.shape 

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=72385.0), HTML(value='')))




((72385, 13, 42), (72385, 13, 42))

In [34]:
id_=3430
for id_ in random.choices(range(X.shape[0]),k=5):
    print(vec_to_str(X[id_]))
    print(vec_to_str(np.array(Y[id_])))
    print()

.minimer oyEE
minimer oyEEE

.lhmed oyEEEE
lhmed oyEEEEE

.aher oyEEEEE
aher oyEEEEEE

.destaplan oy
destaplan oyE

.b1g lahti oy
b1g lahti oyE



In [54]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    out = np.random.choice(range(len(characters)), p = probas.ravel())
    out=characters[out]
    return out

def generate_output(model, text_start, length=5):
    
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print(f"Diversity: {diversity}")
        for l in range(length):
            generated = '\n'
            sentence = ('{0:\n>' + str(Tx) + '}').format(text_start).lower()
            generated += text_start 
            gens=0

            generated = ""
            inp=text_start
            sentence = inp

            x_pred = np.zeros((1, Tx, len(characters)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.0
            preds = model.predict(x_pred, verbose=0)[0]
            for pred in preds:
                next_index = sample(pred)
                next_char = next_index
                sentence = sentence + next_char
                generated += next_char
            print(f"{inp}{generated}")
        
generate_output(model,"")

Diversity: 0.2
ghai tEEEEEEE
uclaeeEEEEEEE
msjio EEEEEEE
arlmatEEEEEEE
mmatieEEEEEEE
Diversity: 0.5
eulua EEEEEEE
wcia  EEEEEEE
jkse aEEEEEEE
mdt ttEEEEEEE
nn ou EEEEEEE
Diversity: 1.0
cmzta EEEEEEE
kart yEEEEEEE
mnro  EEEEEEE
tmie aEEEEEEE
tjsoeaEEEEEEE
Diversity: 1.2
krewtuEEEEEEE
bta a EEEEEEE
yaroo EEEEEEE
1eeie EEEEEEE
it e eEEEEEEE


# Model

In [146]:
def make_name(model, beginning=""):
    name = beginning
    x = np.zeros((1, Tx, len(characters)))
    x[0,0:len(beginning),:]=str_to_vec(beginning,start_with_null=False)

    for i in range(len(beginning),Tx-1):
        prediction=model.predict(x)[0]
        probs = list(prediction[i])
        probs = probs / np.sum(probs)
        #index = np.random.choice(range(len(characters)), p=probs)
        index = np.argmax(prediction[i])
        character = indices_char[index]
        #print(f"{vec_to_str(x[0])} -> {vec_to_str(prediction)} -> {character}")
        if character=="E":
            break
        name+=character
        x[0, i+1, index] = 1

        i += 1
    
    print(name)

make_name(model)

one oy


In [38]:
from keras import layers, models
from keras.layers import Dense, Input, LSTM,GRU

drp=0
model=models.Sequential()
model.add(LSTM(64, input_shape=(Tx, len(characters)), return_sequences=True))
model.add(Dense(len(characters),activation="softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 13, 64)            27392     
_________________________________________________________________
dense (Dense)                (None, 13, 42)            2730      
Total params: 30,122
Trainable params: 30,122
Non-trainable params: 0
_________________________________________________________________


In [45]:
from keras.callbacks import LambdaCallback

def on_epoch_end(epoch, logs):
    print()
    for i in range(3):
        make_name(model,"")
        
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)


In [None]:
opt=keras.optimizers.Adam(learning_rate=0.0001)

model.compile(loss='categorical_crossentropy', optimizer="adam")

history=model.fit(Xf, Yf, batch_size=64, validation_split=0.1,  epochs=1000, shuffle=True, callbacks=[print_callback],verbose=1)

In [None]:
import matplotlib
from matplotlib import pyplot as plt

results=history

plt.figure(figsize=(8, 16))
plt.title("Learning curve")
plt.plot(results.history["loss"], label="loss")
plt.plot(results.history["val_loss"], label="val_loss")
plt.plot( np.argmin(results.history["val_loss"]), np.min(results.history["val_loss"]), marker="x", color="r", label="best model")
plt.xlabel("Epochs")
plt.ylabel("log_loss")
plt.legend();


In [None]:
from keras.preprocessing import sequence

empty="."
eos="E"
eos_index=char_indices[eos]
xn=len(characters)


def predict_tree(x, position, k):
    prediction=model.predict(x.astype(bool).astype(float))[0]
    pred=prediction[position]
    
    indices = pred.argsort()[-k:]
    #print(f"{vec_to_str(x[0])} -> {vec_to_str(prediction)} -> {indices}")
    results=np.zeros((0, Tx, xn))
    for index in indices:
        res=np.array(x, copy=True)
        res[0,position+1,index]=pred[index]
        if index==eos_index:
            results=np.append(results,res,axis=0)
            break
        if position==Tx-2:
            results=np.append(results,res,axis=0)
            break
        results=np.append(results,predict_tree(res,position+1,k), axis=0)
    return results


def beamsearch(k, cnt):
    x = np.zeros((1, Tx, xn)) # starting with 0
    res=predict_tree(x, 0, k)
    probs=[]
    for sample in res:
        sample_prob=1
        for char in sample:
            ch_prob=char.max()            
            if ch_prob>0:
                sample_prob*=ch_prob
        probs.append(sample_prob)
    samples=[vec_to_str(a) for a in np.take(res,np.argsort(probs)[-cnt:],axis=0)]
    probs=np.take(probs,np.argsort(probs))[-cnt:]
    return list(zip(probs,samples))
    
    
res=beamsearch(k=2,cnt=10)
for prob, sample in res:
    print(f"{prob:0.5f}: {sample}")

In [131]:
len(res)

1

In [89]:
for prob, sample in res:
    print(f"{prob:0.2f}: {sample}")

0.48: oi a b5dddE..
0.48: oi a bbddvE..
1.48: oi a bbdddE..


In [None]:
# beam search
# train language model for the beam search
# end of line character should be zeros
# dropout
# try GRU
# save best model



In [None]:
from keras import layers, models
from keras.layers import Dense, Input, LSTM,GRU

drp=0
model=models.Sequential()
model.add(LSTM(64, input_shape=(Tx, len(characters)),return_sequences=True,activation="softmax"))
model.add(Dense(len(characters),activation="softmax"))
model.summary()

In [None]:
np.sum(model.predict(Xf[2:3,:,:]), axis=2)

In [None]:
model.predict(Xf[:1,:,:])