In [1]:
import csv
import re

# CSV inputs, read one by one via get_names(). The list order is the order in
# which names are accumulated into the module-level `last_names` / `first_names`.
FILES = [
    'data/Black-Female-Names.csv',
    'data/Black-Male-Names.csv',
    'data/White-Female-Names.csv',
    'data/White-Male-Names.csv',
    'data/Hispanic-Male-Names.csv',
    'data/Hispanic-Female-Names.csv',
    'data/Indian-Male-Names.csv',
    'data/Indian-Female-Names.csv',
]

# Tokens that must never be recorded as a name. Membership is tested with an
# exact, case-sensitive `in` check, so entries are listed exactly as they are
# expected to appear in the data.
STOP_WORDS = {
    # symbols and slash-abbreviations
    "@", "d/o", "r/o",
    # ordinary words
    "and", "any", "arizona", "both", "day", "for",
    "her", "his", "husband", "known", "land", "man",
    "name", "necessary", "not", "one", "part", "said",
    "same", "subject", "wife", "with", "witness",
}

# One or more letters a-z, matched case-insensitively. Per the `re` docs, a str
# pattern combining [a-z] with IGNORECASE also matches a handful of non-ASCII
# letters that case-fold into that range.
LETTERS_ONLY_REGEX = re.compile("([a-z]+)", flags=re.IGNORECASE)


def is_only_letters(text):
    """Check that the whole of ``text`` matches ``LETTERS_ONLY_REGEX``.

    Returns the ``re.Match`` object (truthy) when the entire string matches,
    and ``None`` otherwise -- including for the empty string.
    """
    whole_match = LETTERS_ONLY_REGEX.fullmatch(text)
    return whole_match


def is_valid_ascii(text):
    """Report whether ``text`` can be encoded as ASCII.

    Returns ``True`` when ``text.encode('ascii')`` succeeds and ``False`` when
    it raises ``UnicodeEncodeError``.
    """
    encodable = True
    try:
        text.encode('ascii')
    except UnicodeEncodeError:
        encodable = False
    return encodable

# Splits a raw name on dots, whitespace and "@".
# Written as a raw string: in a plain literal "\." and "\s" are invalid escape
# sequences, which Python warns about. The compiled pattern is unchanged.
NAME_SPLITTER_REGEX = re.compile(r"[\.\s@]")

def clean_parts(parts):
    """Return the tokens from ``parts`` that are acceptable as names.

    A token is kept only when it passes every check, applied in this order:
    it is not in ``STOP_WORDS``, it is ASCII-encodable, it is longer than two
    characters, and it consists only of letters. Input order is preserved and
    a new list is returned.
    """
    def _is_name_like(token):
        return (
            token not in STOP_WORDS
            and is_valid_ascii(token)
            and len(token) > 2
            and is_only_letters(token)
        )

    return [token for token in parts if _is_name_like(token)]

def get_names(filename, encoding="utf-8"):
    """Read one names CSV and return ``(last_names, first_names)``.

    The first row is treated as a header and skipped. Two row layouts are
    accepted:

    * 4 columns ``(last_name, first_name, gender, race)``: the last name is
      kept as-is unless it is a stop word; the first-name field is split on
      single spaces and filtered through ``clean_parts``.
    * 3 columns ``(name, gender, race)``: the name is split with
      ``NAME_SPLITTER_REGEX`` and filtered through ``clean_parts``. A single
      surviving token is recorded as both a first and a last name; otherwise
      the final token is the last name and the rest are first names.

    Blank rows are ignored.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding used to open the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale.

    Returns:
        A ``(last_names, first_names)`` tuple of lists; duplicates are kept.

    Raises:
        ValueError: If a non-blank row has neither 3 nor 4 columns.
    """
    last_names = []
    first_names = []
    # newline="" is what the csv module asks for so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(filename, "r", encoding=encoding, newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line.
                continue
            column_count = len(row)
            if column_count == 4:
                last_name, first_name, _gender, _racial = row
                if last_name not in STOP_WORDS:
                    last_names.append(last_name)
                first_names.extend(clean_parts(first_name.strip().split(" ")))
            elif column_count == 3:
                name, _gender, _racial = row
                parts = clean_parts(NAME_SPLITTER_REGEX.split(name))
                if len(parts) == 1:
                    # A lone token cannot be classified, so count it as both.
                    first_names.append(parts[0])
                    last_names.append(parts[0])
                elif parts:
                    first_names.extend(parts[:-1])
                    last_names.append(parts[-1])
            else:
                raise ValueError(
                    "{}: line {}: expected 3 or 4 columns, got {}".format(
                        filename, reader.line_num, column_count
                    )
                )
    return (last_names, first_names)

# Gather names from every input file, in FILES order (duplicates included).
last_names, first_names = [], []
for filename in FILES:
    lasts, firsts = get_names(filename)
    last_names.extend(lasts)
    first_names.extend(firsts)

# De-duplicate, then report how many distinct names of each kind were found.
last_set = set(last_names)
first_set = set(first_names)
print("last_names:", len(last_set))
print("first_names:", len(first_set))



last_names: 28577
first_names: 18076


In [2]:
# Write the distinct names, one per line.
# - sorted(): set iteration order for strings varies between interpreter runs,
#   so sorting keeps the output files stable and diff-able.
# - "w" rather than "w+": the files are only written, never read back here.
# - explicit UTF-8: last names taken from 4-column rows are not ASCII-filtered,
#   so the platform default encoding is not a safe choice.
with open("data/first_names.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(first_set)))
with open("data/last_names.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(last_set)))

In [3]:
# Scratch value: a non-ASCII (Devanagari) string for trying out encodings.
x = "सादीक"

In [4]:
# Round-trip through UTF-8: encoding then decoding gives back the same text.
x.encode("utf-8").decode("utf-8")

'सादीक'

In [5]:
# Displays the whole accumulated list (duplicates included, in FILES order).
# NOTE(review): this dumps every entry; a slice such as first_names[:20] would
# keep the saved notebook small.
first_names

['tashanika',
 'denetra',
 'tomesha',
 'trellany',
 'cynthia',
 'crystal',
 'beverly',
 'robin',
 'sophia',
 'marquita',
 'sylvia',
 'dorothy',
 'zakiyyah',
 'crystal',
 'allissia',
 'mahogany',
 'melonise',
 'monique',
 'sherline',
 'veronica',
 'beverly',
 'brenda',
 'delta',
 'genesha',
 'margaret',
 'stephanie',
 'pachadgin',
 'shaina',
 'latonia',
 'ashley',
 'brianna',
 'carlas',
 'donna',
 'ethel',
 'juanetta',
 'leslie',
 'paula',
 'radja',
 'tonya',
 'jessica',
 'monique',
 'udreka',
 'rosenia',
 'shirley',
 'shelda',
 'brianna',
 'jasmimn',
 'stephanie',
 'april',
 'cheryl',
 'tiffany',
 'lanadieal',
 'ladonne',
 'annette',
 'helen',
 'eniee',
 'eugenia',
 'linda',
 'sierra',
 'star',
 'bernadette',
 'jameelah',
 'krystle',
 'sharon',
 'derica',
 'annie',
 'billie',
 'kizzie',
 'lacresha',
 'meeka',
 'raquel',
 'shanearia',
 'lakisha',
 'tammy',
 'robin',
 'jamyira',
 'jeanette',
 'melissa',
 'sherry',
 'minouche',
 'jessica',
 'jessica',
 'tamisha',
 'courtney',
 'shakeria',

In [6]:
import sys

# Experiment: hash every three-letter lowercase string and collect the
# distinct hash values, counting how many strings were hashed.
letters = "abcdefghijklmnopqrstuvwxyz"
trigrams = [a + b + c for a in letters for b in letters for c in letters]
hashes = {hash(trigram) for trigram in trigrams}
count = len(trigrams)


In [7]:
# Number of three-letter strings that were hashed (26 ** 3).
count

17576

In [8]:
# Number of distinct hash values; equals `count` when no two strings collide.
len(hashes)

17576

In [9]:
# Size in bytes of the set object itself; sys.getsizeof does not include the
# memory of the int objects the set refers to.
sys.getsizeof(hashes)

524512

In [10]:
import re

In [11]:
# Sanity check: every first name passes through clean_parts, which drops
# tokens of two characters or fewer, so a single letter should never be present.
'k' in first_set

False