In [34]:
from collections import Counter, defaultdict

import numpy as np
import pandas as pd


In [4]:
df = pd.read_csv("data.csv")

In [350]:
df.head()

Unnamed: 0,year,name,gender,count,clean_name
0,1880,Mary,F,7065,^mary$
1,1880,Anna,F,2604,^anna$
2,1880,Emma,F,2003,^emma$
3,1880,Elizabeth,F,1939,^elizabeth$
4,1880,Minnie,F,1746,^minnie$


In [6]:
df.shape

(2085158, 4)

In [352]:
df["clean_name"] = '^' + df.name.str.lower() + '$'


In [355]:
df.head()

Unnamed: 0,year,name,gender,count,clean_name
0,1880,Mary,F,7065,^mary$
1,1880,Anna,F,2604,^anna$
2,1880,Emma,F,2003,^emma$
3,1880,Elizabeth,F,1939,^elizabeth$
4,1880,Minnie,F,1746,^minnie$


In [356]:
df.tail()

Unnamed: 0,year,name,gender,count,clean_name
2085153,2022,Zuberi,M,5,^zuberi$
2085154,2022,Zydn,M,5,^zydn$
2085155,2022,Zylon,M,5,^zylon$
2085156,2022,Zymeer,M,5,^zymeer$
2085157,2022,Zymeire,M,5,^zymeire$


In [212]:
small = df[(df.year > 2010) & (df.gender == "M")]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small["clean_name"] = '^' + small.name.str.lower() + '$'


In [213]:
small.shape

(169781, 5)

In [187]:
small.head()

Unnamed: 0,year,name,gender,count,clean_name
0,1880,Mary,F,7065,^mary$
1,1880,Anna,F,2604,^anna$
2,1880,Emma,F,2003,^emma$
3,1880,Elizabeth,F,1939,^elizabeth$
4,1880,Minnie,F,1746,^minnie$


In [25]:
def get_name_transition_probs(name):
    return Counter((name[i], name[i+1]) for i in range(len(name) - 1))

In [29]:
def get_n_grams(text, n):
    """
    Returns a list of n-grams from a text.
    """
    n_grams = []
    for i in range(len(text) - n + 1):
        n_grams.append(text[i:i + n])
    return n_grams

In [31]:
get_n_grams("^baba$", 2)

['^b', 'ba', 'ab', 'ba', 'a$']

In [335]:
class TransitionCount(defaultdict):
    def __init__(self, *args, **kwargs):
        super(TransitionCount, self).__init__(Counter, *args, **kwargs)
        # super(*args, **kwargs)

    def __add__(self, other):
        all_keys = self.keys() | other.keys()
        return TransitionCount({k: self[k] + other[k] for k in all_keys})
    
    def __mul__(self, scalar):
        new = TransitionCount()
        for k in self.keys():
            for k2 in self[k].keys():
                new[k][k2] = self[k][k2] * scalar
        return new

In [357]:
def get_transition_counts(text):
    out_dict = TransitionCount()
    for i in range(len(text) - 1):
        out_dict[text[i]][text[i + 1]] += 1
    return out_dict

def get_transition_counts2(text):
    out_dict = TransitionCount()
    for i in range(len(text) - 2):
        out_dict[text[i:i+2]][text[i + 2]] += 1
    return out_dict

def get_transition_countsN(text, n):
    out_dict = TransitionCount()
    for i in range(len(text) - n):
        out_dict[text[i:i+n]][text[i + n]] += 1
    return out_dict

In [82]:
get_transition_counts("^babc$")

TransitionCount(collections.Counter,
                {'^': Counter({'b': 1}),
                 'b': Counter({'a': 1, 'c': 1}),
                 'a': Counter({'b': 1}),
                 'c': Counter({'$': 1})})

In [84]:
get_transition_counts("mary") + get_transition_counts("mom")


TransitionCount(collections.Counter,
                {'m': Counter({'a': 1, 'o': 1}),
                 'a': Counter({'r': 1}),
                 'r': Counter({'y': 1}),
                 'o': Counter({'m': 1})})

In [85]:
get_transition_counts("mary") * 10

TransitionCount(collections.Counter,
                {'m': Counter({'a': 10}),
                 'a': Counter({'r': 10}),
                 'r': Counter({'y': 10})})

In [97]:
cnts = [get_transition_counts(c) * freq for c, freq in zip(small.clean_name, small["count"])]

In [98]:
cnts[1]

TransitionCount(collections.Counter,
                {'^': Counter({'a': 2604}),
                 'a': Counter({'n': 2604, '$': 2604}),
                 'n': Counter({'n': 2604, 'a': 2604})})

In [100]:
all_transition_counts = sum(cnts, start=TransitionCount())

In [101]:
all_transition_counts["c"]

Counter({'e': 2937082,
         'h': 2425822,
         'a': 1354325,
         'i': 972175,
         'l': 955181,
         'k': 584719,
         'o': 449505,
         'y': 381564,
         't': 111253,
         'q': 69085,
         'u': 65416,
         '$': 62094,
         'c': 53569,
         'r': 23134,
         'd': 357,
         'z': 159,
         'n': 59})

In [102]:
def normalize_transition_count_inplace(transition_count):
    """
    This function normalizes the transition count dictionary by dividing each
    count by the total count for the first word in the transition.
    """
    for letter, next_letters in transition_count.items():
        total_count = sum(next_letters.values())
        for l in next_letters:
            next_letters[l] /= total_count

In [296]:
def create_transition_probs(transition_count):
    """
    This function normalizes the transition count dictionary by dividing each
    count by the total count for the first word in the transition.
    """
    new_tc = TransitionCount()
    for ngram, next_letters in transition_count.items():
        total_count = sum(next_letters.values())
        for l in next_letters:
            new_tc[ngram][l] = transition_count[ngram][l] / total_count
    return new_tc

In [214]:
all_transition_counts2 = sum((get_transition_counts2(c) * freq 
                             for c, freq in zip(small.clean_name, small["count"]))
                            , start=TransitionCount())

In [215]:
all_transition_counts2["ch"]

Counter({'a': 465688,
         'r': 203262,
         'o': 76707,
         'e': 51003,
         'i': 43551,
         '$': 11427,
         'l': 7625,
         'u': 2401,
         'm': 761,
         'y': 434,
         'n': 201,
         't': 41,
         's': 5})

In [216]:
normalize_transition_count(all_transition_counts2)

In [217]:
generate_name(all_transition_counts2, 2)

ValueError: 'a' cannot be empty unless no samples are taken

In [198]:
import random

In [219]:
start_states = [k for k in all_transition_counts2.keys() if k[0] == '^']

In [263]:
n=2
# name = "^j"
name = random.choice(start_states)
while name[-1] != "$":
    next_char = np.random.choice(list(all_transition_counts2[name[-n:]].keys()),
                                    p=list(all_transition_counts2[name[-n:]].values()))
    name += next_char
# return name[1:-1]
print(name[1].upper() + name[2:-1])

Briffraylaco


In [359]:
df.shape

(2085158, 5)

In [404]:
df["count_obj"] = df["clean_name"].apply(get_transition_counts2)

In [405]:
df.head()

Unnamed: 0,year,name,gender,count,clean_name,count_obj
0,1880,Mary,F,7065,^mary$,"{'^m': {'a': 1}, 'ma': {'r': 1}, 'ar': {'y': 1..."
1,1880,Anna,F,2604,^anna$,"{'^a': {'n': 1}, 'an': {'n': 1}, 'nn': {'a': 1..."
2,1880,Emma,F,2003,^emma$,"{'^e': {'m': 1}, 'em': {'m': 1}, 'mm': {'a': 1..."
3,1880,Elizabeth,F,1939,^elizabeth$,"{'^e': {'l': 1}, 'el': {'i': 1}, 'li': {'z': 1..."
4,1880,Minnie,F,1746,^minnie$,"{'^m': {'i': 1}, 'mi': {'n': 1}, 'in': {'n': 1..."


In [406]:
df.tail()

Unnamed: 0,year,name,gender,count,clean_name,count_obj
2085153,2022,Zuberi,M,5,^zuberi$,"{'^z': {'u': 1}, 'zu': {'b': 1}, 'ub': {'e': 1..."
2085154,2022,Zydn,M,5,^zydn$,"{'^z': {'y': 1}, 'zy': {'d': 1}, 'yd': {'n': 1..."
2085155,2022,Zylon,M,5,^zylon$,"{'^z': {'y': 1}, 'zy': {'l': 1}, 'yl': {'o': 1..."
2085156,2022,Zymeer,M,5,^zymeer$,"{'^z': {'y': 1}, 'zy': {'m': 1}, 'ym': {'e': 1..."
2085157,2022,Zymeire,M,5,^zymeire$,"{'^z': {'y': 1}, 'zy': {'m': 1}, 'ym': {'e': 1..."


In [407]:
summarized = df.groupby(["gender", "year"]).count_obj.sum()

In [408]:
summarized.head()

gender  year
F       1880    {'ur': {'a': 2, 'i': 2, 'e': 2, 'o': 1, '$': 1...
        1881    {'ur': {'a': 3, 'i': 2, 'e': 2, 'o': 1, 't': 1...
        1882    {'nu': {'e': 1}, 'uc': {'y': 1, 'i': 7, 'r': 1...
        1883    {'nu': {'e': 1}, 'uc': {'y': 1, 'i': 6, 'r': 1...
        1884    {'nu': {'e': 1}, 'uc': {'y': 1, 'i': 6, 'r': 1...
Name: count_obj, dtype: object

In [409]:
summarized.tail()

gender  year
M       2018    {'kc': {'$': 1}, 'nu': {'e': 19, 's': 8, '$': ...
        2019    {'kc': {'$': 1, 'e': 1}, 'nu': {'e': 18, '$': ...
        2020    {'kc': {'$': 1}, 'nu': {'e': 18, '$': 4, 's': ...
        2021    {'kc': {'$': 1}, 'nu': {'e': 16, '$': 8, 's': ...
        2022    {'kc': {'$': 1}, 'nu': {'e': 16, '$': 8, 's': ...
Name: count_obj, dtype: object

In [410]:
summarized = summarized.reset_index()

In [418]:
summarized.head()

Unnamed: 0,gender,year,count_obj
0,F,1880,"{'ur': {'a': 2, 'i': 2, 'e': 2, 'o': 1, '$': 1..."
1,F,1881,"{'ur': {'a': 3, 'i': 2, 'e': 2, 'o': 1, 't': 1..."
2,F,1882,"{'nu': {'e': 1}, 'uc': {'y': 1, 'i': 7, 'r': 1..."
3,F,1883,"{'nu': {'e': 1}, 'uc': {'y': 1, 'i': 6, 'r': 1..."
4,F,1884,"{'nu': {'e': 1}, 'uc': {'y': 1, 'i': 6, 'r': 1..."


In [420]:
summarized_saved = summarized.copy()

AttributeError: 'DataFrame' object has no attribute 'deepcopy'

In [422]:
id(summarized_saved.iloc[0].count_obj)

29135662896

In [426]:
summarized_saved["count_obj"] = [dict(c) for c in summarized_saved["count_obj"]]

In [416]:
!ls -lh

total 100976
-rw-r--r--  1 chris  staff    36M Jul 27 14:33 data.csv
-rw-r--r--  1 chris  staff   1.4K Jul 27 14:34 download_data.py
-rw-r--r--  1 chris  staff     0B Jul 27 14:28 generate_names_ngram.py
-rw-r--r--  1 chris  staff   644K Jul 27 21:09 namegen_2grams.pkl
-rw-r--r--  1 chris  staff   5.4M Jul 28 13:14 namegen_3grams.pkl
-rw-r--r--  1 chris  staff    12K Jul 27 15:00 namegen_ngrams.ipynb
-rw-r--r--  1 chris  staff   3.7K Jul 27 16:08 namegen_ngrams.py
-rw-r--r--  1 chris  staff   7.1M Jul 27 14:33 names.zip


In [427]:
summarized_saved.to_pickle('namegen_3grams.pkl')

In [428]:
import pickle
with open('namegen_3grams.pkl', 'rb') as f:
    ngrams = pickle.load(f)

In [429]:
foo = ngrams.iloc[0].count_obj

In [430]:
foo

{'ur': Counter({'a': 2, 'i': 2, 'e': 2, 'o': 1, '$': 1, 's': 1}),
 'dy': Counter({'$': 2, 't': 2, 's': 1}),
 '^j': Counter({'e': 13, 'o': 13, 'u': 9, 'a': 6, 'i': 1}),
 'gw': Counter({'e': 1}),
 'nn': Counter({'i': 23, 'a': 20, 'e': 11, '$': 4, 'y': 2}),
 'nu': Counter({'e': 1}),
 'eo': Counter({'r': 7, 'n': 4, 'l': 3, '$': 3, 'd': 2, 't': 1}),
 'ph': Counter({'i': 7, 'e': 3, 'a': 3, 'o': 1, 'r': 1, '$': 1, 'y': 1}),
 'ro': Counter({'s': 12,
          'n': 6,
          'l': 4,
          'b': 3,
          't': 2,
          'x': 2,
          'w': 1,
          'r': 1,
          'e': 1}),
 'el': Counter({'l': 42,
          'i': 39,
          'e': 13,
          'm': 11,
          '$': 9,
          'a': 6,
          'v': 5,
          'd': 3,
          's': 2,
          'y': 2,
          'o': 2,
          'p': 2,
          'n': 1}),
 'uc': Counter({'i': 7, 'y': 1, 'r': 1}),
 'dd': Counter({'i': 5, 'a': 1}),
 'on': Counter({'a': 12,
          'i': 11,
          'n': 6,
          '$': 3,
      

In [340]:
def get_starting_probs(transition_counts):
    """
    Returns a dictionary of starting probabilities.
    """
    start_states = [k for k in transition_counts.keys() if k[0] == '^']
    start_probs = {}
    for state in start_states:
        start_probs[state] = sum(transition_counts[state].values())
    total = sum(start_probs.values())
    for state in start_probs.keys():
        start_probs[state] /= total
    return start_probs

def generate_name(transition_counts):
    trnstn_probs = create_transition_probs(transition_counts)
    n = len(list(trnstn_probs.keys())[0])

    start_probs = get_starting_probs(transition_counts)
    name = np.random.choice(list(start_probs.keys()), p=list(start_probs.values()))
    while name[-1] != "$":
        next_char = np.random.choice(list(trnstn_probs[name[-n:]].keys()),
                                        p=list(trnstn_probs[name[-n:]].values()))
        name += next_char
    # return name[1:-1]
    return(name[1].upper() + name[2:-1])

In [433]:

generate_name(foo)

Maude


In [310]:
foo["^j"]

Counter({'o': 14, 'e': 13, 'u': 12, 'a': 6, 'i': 2})

In [383]:
summarized.shape

(286, 3)

In [402]:
row = foo.iloc[random.randint(0, len(summarized))]
print(row)
generate_name(row.count_obj)

IndexError: single positional indexer is out-of-bounds

In [436]:
foo = summarized.assign(decade = (summarized.year // 10) * 10).groupby(["gender", "decade"]).count_obj.sum().reset_index()

In [437]:
foo.head()

Unnamed: 0,gender,decade,count_obj
0,F,1880,"{'nu': {'e': 12, 's': 3}, 'uc': {'y': 10, 'i':..."
1,F,1890,"{'nu': {'e': 18, 's': 10}, 'uc': {'y': 10, 'i'..."
2,F,1900,"{'nu': {'e': 19, 's': 10}, 'uc': {'y': 17, 'i'..."
3,F,1910,"{'nu': {'e': 31, 's': 12, 'n': 8}, 'uc': {'i':..."
4,F,1920,"{'nu': {'e': 46, 's': 14, 'k': 1, 'i': 4, 'n':..."


In [435]:
summarized.head()

Unnamed: 0,gender,year,count_obj
0,F,1880,"{'ur': {'a': 2, 'i': 2, 'e': 2, 'o': 1, '$': 1..."
1,F,1881,"{'ur': {'a': 3, 'i': 2, 'e': 2, 'o': 1, 't': 1..."
2,F,1882,"{'nu': {'e': 1}, 'uc': {'y': 1, 'i': 7, 'r': 1..."
3,F,1883,"{'nu': {'e': 1}, 'uc': {'y': 1, 'i': 6, 'r': 1..."
4,F,1884,"{'nu': {'e': 1}, 'uc': {'y': 1, 'i': 6, 'r': 1..."


In [438]:
foo.shape

(30, 3)

In [439]:
foo.sort_values(["decade", "gender"])

Unnamed: 0,gender,decade,count_obj
0,F,1880,"{'nu': {'e': 12, 's': 3}, 'uc': {'y': 10, 'i':..."
15,M,1880,"{'nu': {'e': 20, 't': 3, 's': 8}, 'av': {'i': ..."
1,F,1890,"{'nu': {'e': 18, 's': 10}, 'uc': {'y': 10, 'i'..."
16,M,1890,"{'nu': {'e': 21, 's': 10, 't': 4, 'g': 1, 'm':..."
2,F,1900,"{'nu': {'e': 19, 's': 10}, 'uc': {'y': 17, 'i'..."
17,M,1900,"{'nu': {'e': 23, 's': 11, 't': 1, 'n': 1, 'l':..."
3,F,1910,"{'nu': {'e': 31, 's': 12, 'n': 8}, 'uc': {'i':..."
18,M,1910,"{'nu': {'e': 38, 's': 52, 'a': 12, 'n': 11, 't..."
4,F,1920,"{'nu': {'e': 46, 's': 14, 'k': 1, 'i': 4, 'n':..."
19,M,1920,"{'nu': {'e': 48, 'n': 15, 's': 62, 'l': 12, 'a..."
