## Data

In [None]:
import pandas as pd

In [None]:
# Load the raw user table and discard the 'n' column in a single expression.
df = pd.read_csv('users.csv').drop(columns='n')

In [None]:
# Convert a random sample of 10,000 rows from users.csv into text: one row per
# line, with no index and no header.
# to_string() pads cells with spaces for alignment, so every space is removed.
# NOTE(review): removing all spaces is only safe if usernames never contain
# spaces (the earlier note here cited backslashes, which this replace does not
# touch) -- confirm against the username rules.
df_string = (
    df.sample(n=10000)
    .to_string(index=False, header=False)
    .replace(' ', '')
)

In [None]:
# Persist the newline-separated sample so later cells can load it from disk.
with open("sample.txt", mode="w") as file:
    file.write(df_string)

In [None]:
# Read the saved sample back in as one string (open() defaults to read mode).
with open("sample.txt") as file:
    data = file.read()

## Model

In [None]:
from tensorflow.keras.layers import LSTM, Dense, Activation
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import RMSprop
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import random

In [None]:
# training
# Load the sampled usernames (one per line) as a single string.
with open("sample.txt", "r") as file:
    data = file.read()

# Vocabulary: every distinct character in the training text, in sorted order.
chars = sorted(set(data))

# The model below assumes a 65-symbol vocabulary: 52 letters (presumably 26
# lower-case + 26 upper-case), 10 digits, dash, underscore, and newline.
# Fail loudly if the sample does not contain exactly that many symbols.
if len(chars) != 65:
    raise Exception(
        "Expected 65 unique characters in sample.txt, found {}: {!r}".format(
            len(chars), "".join(chars)
        )
    )

# Lookup tables between characters and their integer class ids.
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

print('Number of Chars in Training Data: {}'.format(len(data)))
print('Number of Unique Chars: {}'.format(len(chars)))


Number of Chars in Training Data: 120246
Number of Unique Chars: 65


In [None]:
# Slide a fixed-width window over the text: each window is one input sequence
# and the character immediately after it is the prediction target.
data_length = 10
step = 3
window_starts = range(0, len(data) - data_length, step)
inp = [data[start:start + data_length] for start in window_starts]
out = [data[start + data_length] for start in window_starts]

# One-hot encode inputs and targets with the char_to_int mapping.
x = np.zeros((len(inp), data_length, len(chars)), dtype=bool)
y = np.zeros((len(out), len(chars)), dtype=bool)
time_steps = np.arange(data_length)
for i, sequence in enumerate(inp):
    # `i` stays bound after this loop, as with any plain for-loop; later cells
    # may read it, so the loop variable name is kept.
    x[i, time_steps, [char_to_int[char] for char in sequence]] = True
    y[i, char_to_int[out[i]]] = True

In [None]:
# Model Definition
# One LSTM layer reads a window of one-hot characters; a dense layer followed
# by a softmax turns its output into a distribution over the vocabulary.
hidden_units = 64
model = Sequential([
    LSTM(hidden_units, input_shape=(data_length, len(chars))),
    Dense(units=len(chars)),
    Activation('softmax'),
])
optimizer = RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [None]:
# Report the number of training sequences, then train for 8 epochs.
# The previous message was prefixed with `i`, a loop index left over from the
# encoding cell; it carried no meaning here and made this cell depend on
# hidden state, so it is no longer printed.
print("Number of Documents in Batch: {}".format(len(x)))
model.fit(x, y, epochs=8, verbose=1)

40078: Number of Documents in Batch: 40079
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f94a5b77c50>

In [None]:
# Gets a random seed
def get_seed(length=10):
    """Return a string of `length` characters drawn uniformly from the vocabulary.

    Every entry of `int_to_char` is a candidate, so the seed can contain any
    vocabulary symbol (including the newline separator).

    Args:
        length: Number of characters to draw. Defaults to 10, the value that
            was previously hard-coded; it should match the window length the
            model was trained on.

    Returns:
        The random seed string; empty when `length` is zero or negative.
    """
    last_index = len(chars) - 1
    return "".join(
        int_to_char[random.randint(0, last_index)] for _ in range(length)
    )

# One hot encodes a string
def string_to_vec(string):
    """Encode `string` as a boolean one-hot array of shape (1, len(string), len(chars)).

    Position t along the second axis has a single True entry, at the column
    given by `char_to_int` for the t-th character. A character missing from
    `char_to_int` raises KeyError.
    """
    encoded = np.zeros((1, len(string), len(chars)), dtype=bool)
    columns = np.fromiter(
        (char_to_int[symbol] for symbol in string), dtype=int, count=len(string)
    )
    encoded[0, np.arange(len(string)), columns] = True
    return encoded


In [None]:
# Sampling with temperature

def sample_temperature(preds, temp):
    """Sample one class index from `preds` after temperature rescaling.

    The probabilities are moved to log space, divided by `temp`, and
    re-normalised before a single multinomial draw. Temperatures below 1
    sharpen the distribution; temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of class probabilities.
        temp: Sampling temperature. `temp == 0` is treated as greedy decoding
            (argmax) instead of dividing by zero.

    Returns:
        The sampled class index (numpy integer).

    Raises:
        ValueError: If `preds` has no positive, finite entry to sample from.
    """
    preds = np.asarray(preds).astype('float64')
    if temp == 0:
        # Limit of the rescaled distribution as temp -> 0: all mass on the mode.
        return np.argmax(preds)

    # log(0) = -inf is fine here (it becomes probability 0); silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temp

    peak = np.max(logits)
    if not np.isfinite(peak):
        raise ValueError("preds must contain at least one positive, finite probability")

    # Subtracting the peak keeps exp() from underflowing every entry to zero
    # at low temperatures, which previously produced 0/0 = NaN probabilities.
    exp_preds = np.exp(logits - peak)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# generates a list of names from the model and an initial seed
def generate_with_temp(model, seed, num_results=10, temp=0.7):
    """Generate names character by character using temperature sampling.

    The model is fed a sliding window that starts as `seed`; after every
    sampled character the oldest character is dropped and the new one is
    appended. A sampled newline ends the current name.

    Note that the first name starts out as `seed` itself, so it is the seed
    followed by whatever the model generates up to the first newline.

    Args:
        model: Object whose `predict(vec, verbose=0)` returns a batch of
            next-character probability vectors.
        seed: Initial context string.
        num_results: Number of names to produce.
        temp: Temperature passed to `sample_temperature`.

    Returns:
        List of `num_results` generated names, without trailing newlines.
    """
    generated = []
    text = seed
    word = seed
    while num_results > 0:
        vec = string_to_vec(text)
        preds = model.predict(vec, verbose=0)[0]
        predicted = sample_temperature(preds, temp)
        character = int_to_char[predicted]
        # Slide the context window forward by one character.
        text = text[1:] + character
        word += character
        if character == '\n':
            num_results -= 1
            # Store the finished name without its newline terminator. The old
            # try / bare-except around this append could only hide real
            # errors, so it has been removed.
            generated.append(word[:-1])
            word = ""

    return generated

In [None]:
# Let's see some results: ten names sampled at temperature 0.7 from a random seed
result = generate_with_temp(model, get_seed(), num_results=10, temp=0.7)
print(result)

['80uPYw62X9', 'lelesnogoto', 'kearruses_', 'BenninWot', 'Misterroa', 'throwawayback', 'sazzeasin', 'hery12345', 'buynoting_atit', 'ihssted_ogh']


In [None]:
# top k sampling
def softmax(z):
    """Return the softmax of `z`, normalised along the first axis.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but stops large inputs from overflowing exp()
    and producing NaN.

    Args:
        z: Array-like of scores (1-D in this notebook).

    Returns:
        Array of the same shape whose entries along axis 0 sum to 1.
    """
    z = np.asarray(z)
    exp_z = np.exp(z - np.max(z, axis=0))
    return exp_z / np.sum(exp_z, axis=0)

def sample_top_k(preds, k):
    """Sample one class index from the `k` most probable entries of `preds`.

    `preds` is expected to hold probabilities (in this notebook it is the
    output of the model's softmax layer). The top-k probabilities are
    re-normalised so they sum to 1 and a single draw is made from them.
    Previously a second softmax was applied to these probabilities, which
    squeezed them into a near-uniform distribution over the top k.

    Args:
        preds: 1-D array-like of non-negative class probabilities.
        k: How many of the most probable classes to keep. Values larger than
            the number of classes are clamped; values below 1 are treated as 1.

    Returns:
        The sampled class index as a Python int.

    Raises:
        ValueError: If `preds` is empty.
    """
    # float64 throughout: numpy's multinomial rejects pvals whose sum drifts
    # above 1, which float32 rounding can cause.
    preds = np.asarray(preds, dtype='float64').ravel()
    if preds.size == 0:
        raise ValueError("preds must not be empty")
    k = max(1, min(int(k), preds.size))

    # Stable sort on the negated values: descending order, ties by lowest index.
    k_values = np.argsort(-preds, kind='stable')[:k]
    k_probs = preds[k_values]

    total = k_probs.sum()
    if np.isfinite(total) and total > 0:
        k_probs = k_probs / total
    else:
        # Degenerate input (e.g. all zeros): fall back to a uniform choice.
        k_probs = np.full(k, 1.0 / k)

    probas = np.random.multinomial(1, k_probs, 1)
    return int(k_values[np.argmax(probas)])

def generate_with_top_k(model, seed, num_results=10, k=10):
    """Generate names character by character using top-k sampling.

    The model is fed a sliding window that starts as `seed`; after every
    sampled character the oldest character is dropped and the new one is
    appended. A sampled newline ends the current name.

    Note that the first name starts out as `seed` itself, so it is the seed
    followed by whatever the model generates up to the first newline.

    Args:
        model: Object whose `predict(vec, verbose=0)` returns a batch of
            next-character probability vectors.
        seed: Initial context string.
        num_results: Number of names to produce.
        k: Number of most probable characters to sample from at each step.

    Returns:
        List of `num_results` generated names, without trailing newlines.
    """
    generated = []
    text = seed
    word = seed
    while num_results > 0:
        vec = string_to_vec(text)
        preds = model.predict(vec, verbose=0)[0]
        # Honour the caller's k; it used to be hard-coded to 10 here.
        predicted = sample_top_k(preds, k)
        character = int_to_char[predicted]
        # Slide the context window forward by one character.
        text = text[1:] + character
        word += character
        if character == '\n':
            num_results -= 1
            generated.append(word[:-1])
            word = ""
    return generated

In [None]:
# Ten names from a random seed, sampling from the 10 most likely characters.
result = generate_with_top_k(model, get_seed(), num_results=10, k=10)
print(result)

['rmodb9jZYG33300304', 'tgantupubatigaisstrly', 'saadtonnes1977779', 'Ththanndagumo', 'airincemst', 'Sooky', 'gllway28540o0803', 'ambmnd181077r4884', 'Scthcon', 'amialim']


In [None]:
# beam search

def generate_with_beam_search(model, seed, beam_size=10, max_length=30):
    """Generate the most probable names following `seed` using beam search.

    The starting context is `seed` with a newline appended, trimmed back to
    the seed's length, so the model is asked to begin a fresh name. At each
    step every live beam is extended by its `beam_size` most probable next
    characters, and the `beam_size` best candidates overall (by summed log
    probability) survive. A beam that emits a newline is finished.

    Args:
        model: Object whose `predict(batch, verbose=0)` returns one
            next-character probability vector per row of `batch`.
        seed: Context string; its length sets the window fed to the model.
        beam_size: Number of beams to keep, and the maximum number of names
            returned.
        max_length: Upper bound on generated characters per name. Beams still
            unfinished at this length are returned as they are.

    Returns:
        Up to `beam_size` names, most probable first. The list is also
        printed, as the earlier draft of this function did.

    Raises:
        ValueError: If `beam_size` is smaller than 1.
    """
    if beam_size < 1:
        raise ValueError("beam_size must be at least 1")

    def by_score(item):
        return item[0]

    # Keep the window the same length as the seed (at least one character).
    context = (seed + "\n")[-max(len(seed), 1):]
    live = [(0.0, context, "")]  # (log probability, model context, name so far)
    finished = []                # (log probability, completed name)

    for _ in range(max_length):
        if not live:
            break

        # One predict call for all live beams.
        batch = np.concatenate([string_to_vec(ctx) for _, ctx, _ in live], axis=0)
        all_preds = np.asarray(model.predict(batch, verbose=0), dtype="float64")

        candidates = []
        for (score, ctx, word), preds in zip(live, all_preds):
            # log(0) = -inf marks impossible characters; they are skipped below.
            with np.errstate(divide="ignore"):
                log_probs = np.log(preds)
            for index in np.argsort(-log_probs, kind="stable")[:beam_size]:
                if np.isfinite(log_probs[index]):
                    candidates.append(
                        (score + log_probs[index], ctx, word, int_to_char[int(index)])
                    )
        candidates.sort(key=by_score, reverse=True)

        live = []
        for score, ctx, word, character in candidates[:beam_size]:
            if character == "\n":
                finished.append((score, word))
            else:
                live.append((score, ctx[1:] + character, word + character))

        finished.sort(key=by_score, reverse=True)
        del finished[beam_size:]
        if len(finished) == beam_size:
            # Scores only decrease as a beam grows, so a live beam at or below
            # the worst kept result can never make it into the output.
            cutoff = finished[-1][0]
            live = [beam for beam in live if beam[0] > cutoff]

    # Beams cut off by max_length still compete on their current score.
    finished.extend((score, word) for score, _, word in live)
    finished.sort(key=by_score, reverse=True)
    beams = [word for _, word in finished[:beam_size]]

    print(beams)
    return beams

# Run beam search from a random seed with 10 beams.
generate_with_beam_search(model, get_seed(), beam_size=10)

['', '', '', '', '', '', '', '', '', '']
