In [2]:
import pandas as pd
import os
import sys
from datetime import date
from math import fsum, log
import operator
from glob import glob
from ipywidgets import FloatProgress
from IPython.display import display
import pickle
import numpy as np
import itertools

curr_year = date.today().year

# must end in '/'
corpus_loc = "../names/"
ngram_data_loc = "./ngram_data/"

# scaling factor for discounting old names; the higher, the more discounted
# range: 0 to infinity
ago_scaling= 0.25

# how many characters to condition on + 1
n = 5

In [109]:
def chop_row(row, n):
    buffered = ((n - 1) * '<') + row.text + '>'
    return [buffered[i:i+n] for i in range(len(buffered) - n + 1)]
    
def chop_df(df, n=3):
    return df.apply(lambda x: chop_row(x, n), axis=1)

def normalize(d):
    factor = 1.0 / fsum(d.values())
    return { k: v * factor for k, v in d.items() }

def sum_to_one(p):
    probs = np.array(p)
    probs /= probs.sum()
    probs
    

## Making the probability tables per ngram

In [117]:
counts = dict()
fcounts = dict()
mcounts = dict()
names = set()
fnames = set()
mnames = set()

prog = FloatProgress(min=0, max=len(glob(corpus_loc + "*.txt")))
display(prog)

for f in glob(corpus_loc + "*.txt"):
    ago = curr_year - int(f.replace(corpus_loc, "").replace("yob", "").replace(".txt", ""))
        
    df = pd.read_csv(f, names=['text','gender','freq'], sep=',', index_col=False)
        
    for (name, gender) in zip(df.text, df.gender):
        names.add(name)
        if gender == 'F':
            fnames.add(name)
        if gender == 'M':
            mnames.add(name)

    for (arr, freq, gender) in zip(chop_df(df, n), df.freq, df.gender):
        for ngram in arr:
            counts[ngram] = counts.get(ngram, 0) + (freq / np.power((ago + 1), ago_scaling))
            if gender == 'F':
                fcounts[ngram] = fcounts.get(ngram, 0) + (freq / np.power((ago + 1), ago_scaling))
            if gender == 'M':
                mcounts[ngram] = mcounts.get(ngram, 0) + (freq / np.power((ago + 1), ago_scaling))
    prog.value += 1

pickle.dump(counts, open(ngram_data_loc + 'counts' + str(n) + '.p', "wb"))
pickle.dump(fcounts, open(ngram_data_loc + 'fcounts' + str(n) + '.p', "wb"))
pickle.dump(mcounts, open(ngram_data_loc + 'mcounts' + str(n) + '.p', "wb"))

## Loading the probability tables once made

In [3]:
n = 4

counts = pickle.load(open(ngram_data_loc + 'counts' + str(n) + '.p', 'rb'))
fcounts = pickle.load(open(ngram_data_loc + 'fcounts' + str(n) + '.p', 'rb'))
mcounts = pickle.load(open(ngram_data_loc + 'mcounts' + str(n) + '.p', 'rb'))

FileNotFoundError: [Errno 2] No such file or directory: './ngram_data/mcounts4.p'

In [137]:
deep = dict()
fdeep = dict()
mdeep = dict()

for k in counts:
    cond = k[0:(n-1)]
    consequent = k[(n-1):]
    if not cond in deep:
        deep[cond] = dict()
    deep[cond][consequent] = counts[k]

for k in fcounts:
    cond = k[0:(n-1)]
    consequent = k[(n-1):]
    if not cond in fdeep:
        fdeep[cond] = dict()
    fdeep[cond][consequent] = fcounts[k]

for k in mcounts:
    cond = k[0:(n-1)]
    consequent = k[(n-1):]
    if not cond in mdeep:
        mdeep[cond] = dict()
    mdeep[cond][consequent] = mcounts[k]

In [138]:
# # add a small value for combinations not seen, rather than backing off

# alphabet = set(list(''.join(names).lower())).union(set('>'))

# for karr in itertools.permutations(alphabet, r = n - 1):
#     k = ''.join(karr)
#     if k not in deep:
#             deep[k] = dict()

# for k in deep:
#     if k != ((n - 1) * "<"):
#         for a in alphabet:
#             deep[k][a] = deep[k].get(a, 0) + 0.001

In [139]:
# now normalize

for k in deep:
    deep[k] = normalize(deep[k])

for k in fdeep:
    fdeep[k] = normalize(fdeep[k])

for k in mdeep:
    mdeep[k] = normalize(mdeep[k])

In [140]:
howmany = 25
maxlength = 50

def generate(howmany, gender, maxlength):
    newnames = list()
    if (gender == "F"):
        cond = fdeep
        extant = fnames
    elif (gender == "M"):
        cond = mdeep
        extant = mnames
    else:
        cond = deep
        extant = names

    for i in range(howmany):
        new = False
        sofar = "<" * (n - 1)
        while (not new):
            given = cond[sofar[(1-n):]]
            proposed = np.random.choice(list(given.keys()), 1, False, list(given.values()))[0]
            if proposed == ">" or len(sofar[(n-1):]) >= maxlength:
                if sofar[(n-1):] not in extant:
                    newnames.append(sofar[(n-1):])
                    new = True
                sofar = "<" * (n - 1)
            else:
                sofar = sofar + proposed
                new = False
    return(newnames)
    
newnames = generate(howmany, "F", maxlength)
print("{}".format(", ".join(newnames)))

newnames = generate(howmany, "M", maxlength)
print("{}".format(", ".join(newnames)))

newnames = generate(howmany, "", maxlength)
print("{}".format(", ".join(newnames)))


Annifer, Astrel, Kell, Phyline, Karet, Wand, Nicoletty, Gildrey, Rachellie, Kaylorah, Brancesca, Audreda, Carrieanna, Marlotte, Kyriquelina, Abigailey, Josita, Claudith, Fernee, Leth, Cecellen, Barbarah, Blonice, Bonnah, Stephian
Lore, Micharles, Salake, Richael, Dant, Dome, Christonio, Michard, Stevelt, Roques, Marker, Richell, Zacheal, Emmed, Yohampton, Westophen, Richael, Lawrent, Josepher, Clyden, Denn, Reynar, Tristopher, Larrentin, Jero
Jessicarley, Kimbert, Karence, Sebanie, Patri, Karette, Cheyenneth, Maristophen, Avonnielle, Felie, Antjuanie, Aliyanaya, Harmenee, Dennegan, Brank, Vaneth, Jeres, Nancine, Wylerica, Denix, Coope, Earles, Andredo, Benja, Andreano
