In [1]:
import keras
import numpy as np
import tensorflow as tf
from tqdm import tqdm


# Prepare data

In [3]:
import unicodedata
import regex as re

def normalize_text(text):
    """Lower-case ``text`` and reduce it to ASCII letters, digits and single spaces.

    Every character that is not a-z, 0-9 or whitespace is replaced by a space,
    runs of whitespace collapse into one space, and the result carries no
    leading or trailing space.
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    # After collapsing, at most one plain space can remain on either end.
    return single_spaced.strip(" ")

# Quick look at how punctuation, case and edge whitespace are handled.
tuple(
    normalize_text(raw)
    for raw in ("jkjh jhjk j1hkj", "jFjh.jhjk,.,.ef jhkj ", " jkjh jhjk jhkj")
)

('jkjh jhjk j1hkj', 'jfjh jhjk ef jhkj', 'jkjh jhjk jhkj')

In [7]:
import json 
import random
# Read-only access is all that is needed; the context managers close the
# handles that the bare open(..., "r+") calls used to leak.
with open("finnish_kaggle_companies.json", "r") as companies_file:
    finnish_companies = json.load(companies_file)
with open("all_kaggle_companies.json", "r") as companies_file:
    all_companies = json.load(companies_file)

# Normalise every company name, shuffle, and join into one newline-separated
# corpus so that "\n" acts as the end-of-name character.
names_arr = [normalize_text(company) for company in tqdm(all_companies)]
random.shuffle(names_arr)
names_text = "\n".join(names_arr)

# Vocabulary: every distinct character of the corpus, in sorted order.
characters = sorted(set(names_text))

# Lookup tables between characters and their one-hot column.
char_indices = {c: i for i, c in enumerate(characters)}
indices_char = {i: c for i, c in enumerate(characters)}

# Number of characters the model sees per training window.
Tx = 20


100%|██████████| 7173427/7173427 [01:25<00:00, 84151.57it/s]


In [11]:
# Summarise the vocabulary and the window length.
for label, value in (
    ("Characters: ", "".join(characters)),
    ("Chars: ", len(characters)),
    ("Tx: ", Tx),
):
    print(label, value)

Characters:  
 0123456789abcdefghijklmnopqrstuvwxyz
Chars:  38
Tx:  20


In [12]:
def str_to_vec(word):
    """
    One-hot encode a string, one row per character.

    word -- string whose characters are all keys of ``char_indices``

    returns boolean array of shape (len(word), len(characters)); row t holds a
    single True at column ``char_indices[word[t]]``

    raises KeyError if ``word`` contains a character outside the vocabulary
    """
    # np.bool was only an alias of the builtin and was removed in NumPy 1.24;
    # the builtin ``bool`` gives the same dtype on every NumPy version.
    x = np.zeros((len(word), len(characters)), dtype=bool)
    for t, char in enumerate(word):
        x[t, char_indices[char]] = True
    return x
    
def vec_to_str(vec):
    """
    Decode a sequence of per-step score rows back into a string.

    vec -- 2-D array with one row per character position; each row is mapped
           to the character of its highest-valued column via ``indices_char``

    returns the decoded string (empty when ``vec`` has no rows)
    """
    n_steps = vec.shape[0]
    return "".join(indices_char[np.argmax(vec[step])] for step in range(n_steps))

# Round-trip check: encoding and then decoding should print the input back.
a = str_to_vec("hello world")
print(vec_to_str(a))

hello world


In [13]:
def vectorization(text, stride, n_x, Tx):
    """
    Cut ``text`` into overlapping windows and one-hot encode them for
    next-character prediction.

    Arguments:
    text -- string to slice; every character must be a key of ``char_indices``
    stride -- positive integer, distance between the starts of consecutive windows
    n_x -- integer, size of the one-hot dimension (vocabulary size)
    Tx -- integer, sequence length

    Returns:
    x -- boolean array of shape (m, Tx, n_x), the encoded windows
    y -- boolean array of shape (m, n_x), the encoded character that directly
         follows each window
    where m = max(0, (len(text) - Tx - 1) // stride).
    """
    # Integer floor division stays exact for very long texts, and clamping at
    # zero yields empty arrays (instead of a negative dimension) when the text
    # is shorter than one window plus its target.
    m = max(0, (len(text) - Tx - 1) // stride)
    # The builtin bool replaces np.bool, which NumPy 1.24 removed.
    x = np.zeros((m, Tx, n_x), dtype=bool)
    y = np.zeros((m, n_x), dtype=bool)

    for t, i in enumerate(tqdm(range(0, m * stride, stride))):
        # Write the one-hot bits straight into the output; this avoids
        # allocating a temporary array for every window.
        for pos, char in enumerate(text[i:i + Tx]):
            x[t, pos, char_indices[char]] = True
        # The target is the character immediately after the window. The former
        # index i + Tx + 1 skipped one character, so the model was trained to
        # predict the character after next.
        y[t, char_indices[text[i + Tx]]] = True
    return x, y


# names_text is the newline-joined corpus built when the data was loaded;
# `names` is never defined in this notebook and fails on a fresh kernel.
X, Y = vectorization(names_text, 10, n_x=len(characters), Tx=Tx)
X.shape, Y.shape

100%|██████████| 15347887/15347887 [01:44<00:00, 146752.55it/s]


((15347887, 20, 38), (15347887, 38))

In [14]:
id_ = 2000000
# Decode one training window next to the target character paired with it.
vec_to_str(X[id_]), vec_to_str(Y[id_][np.newaxis, :])

('ino uav\ndesch\nsmall ', 'o')

# Model

In [18]:
from keras import layers, models
from keras.layers import Dense, Input, LSTM,GRU


drp = 0

# Two stacked GRU layers read the (Tx, vocabulary) one-hot window; the second
# returns only its final state, which a dense head maps to a softmax over the
# vocabulary.
model = models.Sequential([
    GRU(128, input_shape=(Tx, len(characters)), return_sequences=True, dropout=drp),
    GRU(128, return_sequences=False, dropout=drp),
    Dense(256, activation="relu"),
    Dense(len(characters), activation="softmax"),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_3 (GRU)                  (None, 20, 128)           64512     
_________________________________________________________________
gru_4 (GRU)                  (None, 128)               99072     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_3 (Dense)              (None, 38)                9766      
Total params: 206,374
Trainable params: 206,374
Non-trainable params: 0
_________________________________________________________________


In [19]:
from keras.callbacks import LambdaCallback

def on_epoch_end(epoch, logs):
    """Print a generated continuation of the seed "acc" after each epoch.

    The ``epoch`` and ``logs`` arguments supplied by Keras are not used.
    Relies on the notebook-level ``model`` and ``generate_output``, both of
    which must exist by the time training invokes this hook.
    """
    generate_output(model, text_start="acc")


print_callback = LambdaCallback(on_epoch_end=on_epoch_end)


In [None]:
model.compile(optimizer="adam", loss='categorical_crossentropy')

# Train with a tenth of the samples held out for validation; print_callback
# emits a generated sample after every epoch.
history = model.fit(
    X,
    Y,
    epochs=20,
    batch_size=128,
    shuffle=True,
    validation_split=0.1,
    callbacks=[print_callback],
)

Epoch 1/20
 16594/107915 [===>..........................] - ETA: 1:05:30 - loss: 2.2954

In [191]:
# One-hot encoded probe strings (currently just "ap").
tests = np.array([str_to_vec(probe) for probe in ["ap"]])

def sample(preds, temperature=1.0):
    """Draw one character from a temperature-adjusted probability vector.

    preds -- 1-D probabilities over ``characters`` (same length and order)
    temperature -- positive float; values below 1 sharpen the distribution,
                   values above 1 flatten it

    returns the sampled character (an element of ``characters``), not its index

    raises ValueError if ``temperature`` is not positive
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    preds = np.asarray(preds).astype('float64')
    # Zero probabilities legitimately become -inf logits; silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating so that a low temperature
    # cannot underflow every entry to zero and produce NaN probabilities.
    exp_preds = np.exp(logits - np.max(logits))
    probas = exp_preds / np.sum(exp_preds)
    # One draw is enough: the former multinomial-then-choice pair sampled a
    # one-hot vector and then picked its only non-zero entry.
    out = np.random.choice(len(characters), p=probas)
    return characters[out]


# Generate 30 characters from the same seed at several sampling temperatures.
for diversity in [0.2, 0.5, 1.0, 1.2]:
    print("...Diversity:", diversity)

    generated = ""
    inp=" accen"
    sentence = inp
    print('...Generating with seed: "' + sentence + '"')

    for i in range(30):
        # Feed only the most recent Tx characters. Indexing the whole, growing
        # sentence ran past the Tx time steps of x_pred once it exceeded Tx.
        window = sentence[-Tx:]
        # Right-align short windows, the same layout generate_output uses, so
        # the last time step always holds the latest character.
        offset = Tx - len(window)
        x_pred = np.zeros((1, Tx, len(characters)))
        for t, char in enumerate(window):
            x_pred[0, offset + t, char_indices[char]] = 1.0
        preds = model.predict(x_pred, verbose=0)[0]
        # sample() returns the character itself, not an index.
        next_char = sample(preds, diversity)
        sentence = sentence + next_char
        generated += next_char
    print(inp+generated)


#[vec_to_str(res) for res in ]

...Diversity: 0.2
...Generating with seed: " accen"
 accenetrinoeeaeaeeeael i
oeeoesa
ae
...Diversity: 0.5
...Generating with seed: " accen"
 accenneer
eeleear
t eetnaclln
lt
ii
...Diversity: 1.0
...Generating with seed: " accen"
 accenim
iritipswaonp
uu sbrpbrsreco
...Diversity: 1.2
...Generating with seed: " accen"
 accenun oort 
roua
cdsdimay
a
kinrn


In [219]:
def generate_output(model, text_start, length=30):
    """Print ``text_start`` followed by characters sampled from ``model``.

    model -- object whose ``predict(x, verbose=0)`` returns one probability
             row over ``characters`` for an input of shape (1, Tx, len(characters))
    text_start -- seed string; it is lower-cased for encoding and only its
                  last Tx characters are fed to the model
    length -- maximum number of characters to generate

    Generation stops early once a newline is sampled. The text (prefixed with
    a newline) is printed; nothing is returned.
    """
    generated = '\n' + text_start
    # Keep at most the last Tx characters; a longer seed used to index past
    # the Tx time steps of the input array.
    window = text_start.lower()[-Tx:]

    for _ in range(length):
        x_pred = np.zeros((1, Tx, len(characters)))

        # Right-align the window and leave the leading steps all-zero. An
        # explicit offset replaces the former '0' padding sentinel, which
        # collided with the real digit '0' in the vocabulary and dropped
        # genuine zeros from the encoding.
        offset = Tx - len(window)
        for t, char in enumerate(window):
            x_pred[0, offset + t, char_indices[char]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        # sample() returns the character itself, not an index.
        next_char = sample(preds, temperature=1.0)

        generated += next_char
        window = (window + next_char)[-Tx:]

        if next_char == '\n':
            break
    print(generated)
    
# Sample one continuation of the seed "accent" from the trained model.
generate_output(model, text_start="accent")


accenti

