# Data Import & Cleaning

In [1]:
import re
import numpy as np
import nltk

# Load the raw club titles (UTF-16 text, one title per line).  Newline
# characters are stripped from both ends of every line and the title is
# lower-cased.
clubs = []
with open('/Users/ericknudson/Desktop/Fable club name generator/club titles.txt', encoding='utf-16') as titles_file:
    clubs = [raw_title.strip('\n').lower() for raw_title in titles_file]

In [60]:
# Save the cleaned titles, one per line, to club-generator/static/club_titles.txt.
with open('/Users/ericknudson/Desktop/Fable club name generator/club-generator/static/club_titles.txt', 'w', encoding='utf-16') as titles_out:
    titles_out.writelines(title + "\n" for title in clubs)

In [61]:
# Read the saved titles back in; every entry keeps its trailing newline.
z = []
with open('/Users/ericknudson/Desktop/Fable club name generator/club-generator/static/club_titles.txt', encoding='utf-16') as saved_titles:
    z.extend(saved_titles)

In [59]:
# Display the list of lines held in `z`.
z

['the book of secrets –test club\n',
 'hola\n',
 'cupcake book club\n',
 'regular folks reading\n',
 'sosp\n',
 'michelle\n',
 'nixie’s bookclub\n',
 'mount gilead library book club\n',
 'metamodern book club\n',
 'ladies reading books\n',
 'res colleges summer test\n',
 "reading n' color\n",
 'prose before hoes\n',
 'moms&books 👯\u200d♀️📚\n',
 'book\n',
 'telecommunications\n',
 'our rainbow shelf\n',
 'shonas\n',
 'commons reading summer club 11\n',
 'teresa\n',
 'a girl with glasses\n',
 '.\n',
 'test\n',
 'testing book club\n',
 'mark’s newest club\n',
 '🌴 the pool book club 🌴\n',
 'historical fiction club\n',
 'books!\n',
 'bodacious besties\n',
 'murakami fans\n',
 'test\n',
 'mo mo\n',
 'reading club\n',
 'hty\n',
 'work club\n',
 'booktok reads\n',
 'commons reading summer club 12\n',
 'commons reading summer club 16\n',
 'reading the lunar chronicles\n',
 'commons reading summer club 14\n',
 'commons reading summer club 17\n',
 'commons reading summer clu

# Generate n-gram probability dictionaries

In [2]:
import math
from collections import Counter
import nltk

START_SYMBOL = '*'
STOP_SYMBOL = 'STOP'


def calc_probabilities(training_corpus, tokenize=None):
    """Estimate unigram, bigram and trigram probabilities from club names.

    Every name is tokenized and terminated with STOP_SYMBOL.  For the bigram
    model the token list is prefixed with one START_SYMBOL, for the trigram
    model with two, so the first real token always has a full history.

    Unigram probabilities are relative frequencies over all tokens (STOP
    included).  Bigram and trigram probabilities are maximum-likelihood
    *conditional* probabilities: the n-gram count divided by the number of
    times its history (first token / first two tokens) was observed, so the
    entries sharing a history sum to 1.

    Args:
        training_corpus: iterable of club-name strings.
        tokenize: optional callable turning one name into a sequence of
            tokens.  Defaults to ``nltk.word_tokenize``.

    Returns:
        A tuple ``(unigram_probabilities, bigram_probabilities,
        trigram_probabilities)`` of plain dicts keyed by token, 2-tuple and
        3-tuple respectively, in first-seen order.
    """
    if tokenize is None:
        tokenize = nltk.word_tokenize

    unigram_counts = Counter()
    bigram_counts = Counter()
    trigram_counts = Counter()
    for club in training_corpus:
        tokens = list(tokenize(club))

        unigram_counts.update(tokens + [STOP_SYMBOL])

        bigram_tokens = [START_SYMBOL] + tokens + [STOP_SYMBOL]
        bigram_counts.update(zip(bigram_tokens, bigram_tokens[1:]))

        trigram_tokens = [START_SYMBOL] + bigram_tokens
        trigram_counts.update(
            zip(trigram_tokens, trigram_tokens[1:], trigram_tokens[2:])
        )

    n_unigrams = sum(unigram_counts.values())
    unigram_probabilities = {
        unigram: count / n_unigrams for unigram, count in unigram_counts.items()
    }

    # How often each history was seen.  For the start histories this equals
    # the number of names, so those entries keep their previous values.
    bigram_history_counts = Counter()
    for bigram, count in bigram_counts.items():
        bigram_history_counts[bigram[0]] += count
    bigram_probabilities = {
        bigram: count / bigram_history_counts[bigram[0]]
        for bigram, count in bigram_counts.items()
    }

    trigram_history_counts = Counter()
    for trigram, count in trigram_counts.items():
        trigram_history_counts[trigram[0:2]] += count
    trigram_probabilities = {
        trigram: count / trigram_history_counts[trigram[0:2]]
        for trigram, count in trigram_counts.items()
    }

    return unigram_probabilities, bigram_probabilities, trigram_probabilities

In [3]:
unigram_probabilities, bigram_probabilities, trigram_probabilities = calc_probabilities(clubs)

# N-Gram Name Generators

### unigram model

In [4]:
from numpy.random import choice

def gen_club(*args):
    """Print a new club name sampled from a unigram model.

    Tokens are drawn independently from the unigram distribution: the first
    draw is repeated until it is not 'STOP', then further tokens are drawn
    until 'STOP' comes up.  A name that already exists in the known clubs is
    reported with a "dupe:" line and a fresh name is generated.

    Args:
        *args: optionally ``(probabilities, known_clubs)``.
            ``probabilities`` maps token -> probability and must contain
            'STOP'; ``known_clubs`` is a container of existing names.
            Whichever is omitted falls back to the notebook-level
            ``unigram_probabilities`` / ``clubs``.

    Returns:
        None.  The generated name is printed.

    Raises:
        ValueError: if the model could never terminate ('STOP' missing or
            with zero probability) or could never start (no other token with
            positive probability).
    """
    probabilities = args[0] if len(args) > 0 else unigram_probabilities
    known_clubs = args[1] if len(args) > 1 else clubs

    # Without these guards the sampling loops below would spin forever.
    if probabilities.get('STOP', 0) <= 0:
        raise ValueError("unigram model needs a 'STOP' entry with positive probability")
    if not any(p > 0 for token, p in probabilities.items() if token != 'STOP'):
        raise ValueError("unigram model needs a non-'STOP' token with positive probability")

    # Build the sampling arrays once instead of on every draw.
    vocabulary = np.array(list(probabilities.keys()))
    weights = np.array(list(probabilities.values()))

    # Retry in a loop rather than recursing, and always with the same model.
    while True:
        first_token = choice(vocabulary, 1, p=weights)[0]
        while first_token == 'STOP':
            first_token = choice(vocabulary, 1, p=weights)[0]
        output = [first_token]

        next_token = choice(vocabulary, 1, p=weights)[0]
        while next_token != 'STOP':
            output.append(next_token)
            next_token = choice(vocabulary, 1, p=weights)[0]

        new_club = " ".join(output)
        if new_club in known_clubs or new_club == " ":
            print("dupe:", new_club)
            continue
        print(new_club)
        return None

In [37]:
gen_club(unigram_probabilities, clubs)

anotherteenwriter willi testing read the


### bigram model

In [8]:
def gen_bigram_club(*args):
    """Print a club name generated by walking a bigram model.

    Args:
        *args: ``(bigram_probabilities, clubs)``: a dict mapping
            ``(previous_token, token)`` to a probability, and a container of
            existing club names.

    Starting from "*", the next token is chosen among the bigrams whose first
    element is the latest token.  The very first token is picked uniformly;
    later tokens are picked with the stored probabilities renormalised over
    the candidates.  Generation ends when 'STOP' is drawn.  A name already
    present in the existing clubs is printed with a "[duplicate]" prefix and
    generation is attempted again; otherwise the name is printed.

    Returns:
        None.
    """
    model = args[0]
    known_names = args[1]

    sequence = ["*"]
    token = ""
    while token != 'STOP':
        history = sequence[-1]

        # Collect every continuation of the current history with its weight.
        followers = []
        weights = []
        for key, value in model.items():
            if key[0] == history:
                followers.append(key[1])
                weights.append(value)
        weights = np.divide(weights, np.sum(weights))

        if len(sequence) == 1:
            # First real token: uniform over everything that can start a name.
            token = choice(np.array(followers), 1)[0]
        else:
            token = choice(np.array(followers), 1, p=np.array(weights))[0]
        sequence.append(token)

    # Drop the leading "*" and the trailing 'STOP'.
    name = " ".join(sequence[1:-1])
    if name not in known_names:
        print(name)
    else:
        print("[duplicate]", name)
        gen_bigram_club(model, known_names)

In [48]:
gen_bigram_club(bigram_probabilities, clubs)

sarcasm squad


In [12]:
def gen_trigram_club(*args, max_attempts=1000):
    """Generate a club name by walking a trigram model.

    Args:
        *args: ``(trigram_probabilities, clubs)``: a dict keyed by
            ``(token_1, token_2, token_3)`` tuples, and a container of
            existing club names (strings) that must not be reproduced.
        max_attempts: how many names to generate before giving up when every
            candidate turns out to be an existing club.

    Starting from the history ("*", "*"), the next token is chosen uniformly
    among the trigrams that extend the last two tokens (the stored
    probabilities are not used as weights), until 'STOP' is drawn.  A name
    that already exists is reported with a "dupe:" line and a new one is
    generated.

    Returns:
        The generated name as a space-joined string.

    Raises:
        ValueError: if some history has no continuation in the model.
        RuntimeError: if no new name was found within ``max_attempts``.
    """
    trigram_probabilities = args[0]
    known_clubs = args[1]

    # Index continuations by their two-token history once, instead of
    # scanning the whole model for every generated token.
    continuations = {}
    for trigram in trigram_probabilities:
        continuations.setdefault(tuple(trigram[0:2]), []).append(trigram[2])

    for _ in range(max_attempts):
        output = ["*", "*"]
        while output[-1] != 'STOP':
            current_history = (output[-2], output[-1])
            candidates = continuations.get(current_history)
            if not candidates:
                raise ValueError(
                    "no trigram continues the history %r" % (current_history,)
                )
            # str() keeps plain Python strings in the history tuples.
            output.append(str(choice(np.array(candidates), 1)[0]))

        # Join before the duplicate check: existing clubs are strings, so a
        # token list would never compare equal to any of them.
        new_name = " ".join(output[2:-1])
        if new_name not in known_clubs:
            return new_name
        print("dupe:", new_name)

    raise RuntimeError(
        "no new club name found in %d attempts" % max_attempts
    )

In [18]:
name = gen_trigram_club(trigram_probabilities, clubs)
print(name)

booksluts


# Generate databases for web app

In [19]:
import csv

# Export the bigram model as CSV rows of: first token, second token, probability.
# The context manager closes the file, so every row is flushed to disk even
# though the notebook kernel keeps running.  newline="" is what the csv module
# expects for its output file, and UTF-8 lets non-ASCII tokens (e.g. emoji) be
# written regardless of the platform's default encoding.
with open("bigram_probabilities.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(
        [key[0], key[1], val] for (key, val) in bigram_probabilities.items()
    )