
# WHITELIST and RESERVED subdomains

### Whitelist:

1) most common first names, last names, and full names (of these, the MOST common first and last names will be designated "premium")

2) common (but not "reserved") words and bigrams

- top 100 single words and top 100 bigrams are reserved (200 total)
- the next top 400 single words and bigrams each are whitelisted (800 total)

### Reserved:

(based on NYT data)

- Top 100 single words

- Top 100 bigrams

- Others can be added/reclassified manually as needed


In [3]:
#BORING

import os
import glob
import re
import pandas as pd
import itertools

### Top first names

Source: https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv

Every name in the dataset is whitelisted. Names given to more than 0.1% of babies (split by sex) in 1980 are designated "premium"; all other names are "normie".

In [4]:
# Read the raw baby-name records.
# Source: https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv
df = pd.read_csv('csvs/raw/baby-names.csv')

# Whitelist: every distinct first name in the dataset, lower-cased.
whitelisted_names = [raw_name.lower() for raw_name in pd.unique(df['name'])]

# Premium tier: names given to more than 0.1% of babies of one sex in 1980.
# It's a rough cut-off, but a simple and reproducible one.
df_1980 = df.loc[df['year'] == 1980, ['name', 'percent', 'sex']]
boy_names = df_1980.loc[df_1980['sex'].eq('boy')]
girl_names = df_1980.loc[df_1980['sex'].eq('girl')]

premium_boy_names = boy_names.loc[boy_names['percent'].gt(.001), 'name'].tolist()
premium_girl_names = girl_names.loc[girl_names['percent'].gt(.001), 'name'].tolist()

# A name can be premium for both sexes; the set keeps it once.
premium_first_names = [
    raw_name.lower() for raw_name in set(premium_boy_names + premium_girl_names)
]

# Normie tier: whatever is whitelisted but not premium.
premium_lookup = set(premium_first_names)
normie_first_names = [
    candidate for candidate in whitelisted_names if candidate not in premium_lookup
]

# One frame per tier, then stack them and tag every row as a whitelisted first name.
df_premium = pd.DataFrame(premium_first_names, columns=['name']).assign(level='premium')
df_normie = pd.DataFrame(normie_first_names, columns=['name']).assign(level='normie')

first_names_df = pd.merge(df_premium, df_normie, how='outer').assign(
    category='whitelist',
    name_type='first',
)

### Top last names

Top 1000 surnames according to https://www.thoughtco.com/most-common-us-surnames-1422656


In [5]:
%%capture

#load the raw data
#https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv

df = pd.read_csv('csvs/raw/top_1000_surnames.csv')

#oops there are some janky rows in there
df = df[~df['count'].isna()]

#Premium: top 100 surnames
#Normie: all other top 1000
df['level'] = ''
df[0:100]['level'] = 'premium'
df[101:]['level'] = 'normie'
df['name'] = [x.lower() for x in df['name']]
df['category'] = 'whitelist'
df['name_type'] = 'last'

surnames_df = df[['name','category','level','name_type']]

#combine firstname and lastname dataframes
top_names_df = surnames_df.merge(first_names_df,how='outer')

# Top Full Names
- combine premium first names + premium last names and add those full names to the whitelist
- it's 299 premium first names * 100 premium surnames = 29900 rows, I think the database can handle it lol


In [7]:
# Keep only premium-tier rows, then split them into first names and surnames.
df_premium_names = top_names_df.loc[top_names_df['level'] == 'premium']
first_names = df_premium_names.loc[df_premium_names['name_type'] == 'first', 'name'].tolist()
last_names = df_premium_names.loc[df_premium_names['name_type'] == 'last', 'name'].tolist()

# Pair every premium first name with every premium surname, joined with no
# separator. product() varies the surname fastest, like a nested loop would.
full_names_list = [
    given + family for given, family in itertools.product(first_names, last_names)
]

# Full names are whitelisted at the normie tier.
full_names_df = pd.DataFrame(full_names_list, columns=['name']).assign(
    category='whitelist',
    level='normie',
    name_type='full',
)

# Fold the full names into the table that already holds first names and surnames.
top_names_df = pd.merge(top_names_df, full_names_df, how='outer')

# Top Words and Ngrams
- based on NYT data
- some of these will be set aside as "reserved" (TBD)

## Reserved subdomains
- top 100 words
- top 100 bigrams

## Whitelisted, available common words
- top 400 non-reserved single words and bigrams 
- (800 total; just the next 400 in the top 500 of each)

In [8]:
#get_top_words_and_bigrams.py

#source data: https://www.cs.utexas.edu/~gdurrett/courses/fa2020/nyt.txt

import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer

# Read the whole corpus, lower-cased. The context manager closes the file
# handle as soon as the text is in memory.
with open('csvs/raw/nyt.txt', 'r') as nyt_file:
    string = nyt_file.read().lower()

# Tokenize on runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(string)

# Frequency of each single token.
unifreq = nltk.FreqDist(tokens)

# Frequency of each adjacent token pair.
bigrams_list = list(nltk.bigrams(tokens))
bifreq = nltk.FreqDist(bigrams_list)

# Top 500 single words, excluding numbers and single-character tokens.
# The 1000 most frequent tokens are scanned so that up to 500 survive the
# filter; ranks are 1-based positions among the survivors.
top_words = unifreq.most_common(1000)
eligible_words = [
    word for word, _count in top_words
    if len(word) > 1 and not word.isnumeric()
]
unigram_list = eligible_words[:500]
unigram_ranks = list(range(1, len(unigram_list) + 1))

# Top 500 bigrams, each stored as its two tokens joined with no separator.
# NOTE(review): unlike the single words, bigrams are not filtered for numeric
# or one-character tokens — confirm that is intended.
top_bigrams = bifreq.most_common(500)
bigram_list = [first + second for (first, second), _count in top_bigrams]
bigram_ranks = list(range(1, len(bigram_list) + 1))

In [9]:
%%capture

top_words_df = pd.DataFrame(zip(unigram_list,unigram_ranks),columns=['name','rank'])
top_words_df['category']='sup'
top_words_df[0:99]['category']='reserved'
top_words_df[100:]['category']='whitelist'

top_bigrams_df = pd.DataFrame(zip(bigram_list,bigram_ranks),columns=['name','rank'])
top_bigrams_df['category']='sup'
top_bigrams_df[0:99]['category']='reserved'
top_bigrams_df[100:]['category']='whitelist'

#set up the final "top common words and bigrams" dataframe
combined_words_df = top_words_df.merge(top_bigrams_df,how='outer')
combined_words_df['level']='normie'
combined_words_df['name_type']='words'
top_words_df_final = combined_words_df[['name','category','level','name_type']]

In [11]:
# Final whitelist/reserved table: outer-merge the common words with the names
# (joined on the columns the two frames share), then keep only the three
# columns that get exported.
whitelist_and_reserved_df = pd.merge(
    combined_words_df,
    top_names_df,
    how='outer',
).loc[:, ['name', 'category', 'level']]

In [12]:
# Export the combined whitelist/reserved table as CSV, without the row index.
whitelist_and_reserved_df.to_csv(
    path_or_buf='csvs/whitelist_and_reserved.csv',
    index=False,
)

# Adding CSV to the database

In [33]:
# Build one multi-row INSERT statement covering the whole table.
# Each value is emitted as a single-quoted SQL string literal; embedded single
# quotes are doubled (the standard SQL escape) so a name containing an
# apostrophe cannot terminate the literal early and break the statement.
# NOTE(review): if the target database also treats backslash as an escape
# character, backslashes would need escaping too — confirm against the server.
# A parameterized query is preferable wherever a DB driver is available.
all_row_strings = []
rows = whitelist_and_reserved_df[['name', 'category', 'level']].itertuples(index=False, name=None)
for name, category, level in rows:
    quoted_values = ["'" + value.replace("'", "''") + "'" for value in (name, category, level)]
    all_row_strings.append("(" + ",".join(quoted_values) + ")")

full_insert_string = ','.join(all_row_strings)
sql = f'''INSERT INTO whitelist (name,category,level) VALUES {full_insert_string};
'''

print(sql)
# why yes I DID just copy-paste this into my terminal to run it locally.
# Pandas is scuffed in Cpanel (deeply mysterious)
# but this isn't huge data anyway, it's only like ~38k rows
# and I have other things I need to do