# Spamku 
## May 4, 2020
## Version 2.0.1

### Import Libraries and Setup Variables and Define Helper Functions

Import nltk libraries

Setup dictionaries for storing learning.  Define a couple helper functions.

In [1]:
# https://www.nltk.org/
import nltk;

# http://www.nltk.org/book/ch05.html
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')
from nltk.corpus import cmudict

# https://docs.python.org/3/library/os.html
import os

# https://docs.python.org/3/library/pickle.html
import pickle

# https://docs.python.org/3/library/collections.html
from collections import defaultdict


pos_dict=defaultdict(list)
# declare custom dictionary for determining part of speech
pronunciation_dict = cmudict.dict()

# https://docs.python.org/3/library/random.html
import random


import string

def clean_word(word):
    word= word.translate(str.maketrans('', '', string.punctuation))
    word=word.lower()
    return word

def get_syllable_count(word):
    # print(pronunciation_dict[word])
    if word == 'spamku':
        return 2
    else:
        syl=[ x for x in pronunciation_dict[word][0] if x[-1].isdigit()]
        return len(syl)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\Nate\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


### Read in Corpus of Sample Spam as a text file and also make a list of haiku, so each item in the list is a complete three line haiku

This file contains 8893 spam-ku


In [2]:
# the name of the corpus is text. It is just one long string, no newlines
filename="corpus/spamku.txt"
file = open(filename, mode = 'r')
text = ""
for line in file:
    text += line.strip()+' '
file.close()
c=0
l=[]
# haiku_list is the name of our list which holds the haiku in their original form
haiku_list=[]
file = open(filename, mode = 'r')
for line in file:
    c+=1
    l.append(line)
    if c==3:
        haiku_list.append(l)
        c=0
        l=[]
file.close()

##### The first 3 spamku:

In [3]:
haiku_list[:3]

[['See the pretty SPAM\n',
  'Flying through the bedroom door\n',
  'Heading towards your face.\n'],
 ['Ears, snouts and innards,\n',
  'A homogeneous mass--\n',
  'Pass another slice.\n'],
 ['Pink tender morsel,\n',
  'Glistening with salty gel.\n',
  'What the hell is it?\n']]

### Read in pickle files 

pos_patterns is a dictionary with parts of speech patterns as keys, and how often that pattern occured in the corpus as values.  Each patterns is from one line of a haiku
For example: 'JJ NN NN' : 140  
  
pos_dict is a dictionary with indivdual parts of speech as keys, and the value is a list of all words from our spamku corpus that match that part of speech  
  
These dictionaries take a while to generate on my i5-3320M computer, so I pickled them.  See the end of the notebook to see how I generated them (those cells will consist of commented out code)



In [4]:
def read_pickle(filename): 
    file = open(filename, 'rb')      
    data = pickle.load(file) 
    file.close()
    return data

pos_patterns = read_pickle("pos_patterns.p")
pos_dict = read_pickle("pos_dict.p")
pos_patterns5 = read_pickle("pos_patterns5.p")
pos_patterns7 = read_pickle("pos_patterns7.p")
pattern_syl5 = read_pickle("pattern_syl5.p")
pattern_syl7 = read_pickle("pattern_syl7.p")



##### pos_dict  -->  Key is a part of speech, value is a list of all words found having that part of speech.  Note duplicates are allowed.  This increases the probability of selecting common words when using random.choice to select a word

In [13]:
pos_dict['VB'][:10]

['see', 'give', 'be', 'do', 'keep', 'have', 'be', 'do', 'do', 'do']

In [None]:
##### pos_patterns

### Perform Markov Chain Analysis on Sample Poems

Determining the most likely word to follow each word in sample

In [5]:
def make_markov_dict(text, filename):
    
    #text is a long string, filename is a file to save rejects to
    f = open(filename, 'a')
    # Tokenize the text by word, though including punctuation
    words = text.split(' ')
    
    # Initialize a default dictionary to hold all of the words and next words
    markov_dict = defaultdict(list)
    
    # Create a zipped list of all of the word pairs and put them in word: list of next words format
    for current_word, next_word in zip(words[0:-1], words[1:]):
        try:
            if get_syllable_count(clean_word(current_word)):
                if get_syllable_count(clean_word(next_word)):
                    markov_dict[clean_word(current_word)].append(clean_word(next_word) )    
        except:
            #f.write(clean_word(current_word)+'\n')
            print(clean_word(current_word), file=f)
            pass
    # Convert the default dict back into a dictionary
    markov_dict = dict(markov_dict)
    f.close()
    return markov_dict

mc=make_markov_dict(text, "words_without_syllable_count.txt")


In [13]:
len(list(mc.keys()))

11389

### Define functions to count syllables for each word

Testing with 3 words

In [15]:
def get_syllable_count(word):
    # print(pronunciation_dict[word])
    if word == 'spamku':
        return 2
    else:
        syl=[ x for x in pronunciation_dict[word][0] if x[-1].isdigit()]
        return len(syl)

print(get_syllable_count('pregnant'))
print(get_syllable_count('sausage'))
print(get_syllable_count('ribs'))


2
2
1


### Define function to return part of speech for each word

Testing with 3 words

In [16]:
def get_pos_label(word):
    return pos_tag(word)[0][1]

print(get_pos_label(['capicola']))
print(get_pos_label(['pastrami']))
print(get_pos_label(['andouille']))


NN
NN
NN


#### Create functions to choose POS pattern and syllable structure

In [22]:
import random
       
def get_word(pos, count, prevword):
    for i in range(200):
        try:
            word = random.choice(mc[clean_word(prevword)]) if prevword else random.choice(pos_dict[pos])
            if get_syllable_count(clean_word(word)) == count:
                return clean_word(word)
        except:
            print('error in get word' , word, pos , count)
            return("ham")
    return('spamku')
#print(get_word('NN', 3, False))
#print(get_word('NN', 2, 'spam'))
def make_line(line_pattern, line_syl):
    l=""
    prev_word=False
    #print("make line: ", line_pattern, line_syl)
    #print("length of  line pattern: ", len(line_pattern.split()), "len of line syl: ", len(line_syl))
    for pos, count in zip(line_pattern.split(), line_syl):
        #print("I'm here:", pos, count)
        word = get_word(pos,count, prev_word)
        l += word + ' '
        prev_word = word
    return l
#print(make_line(firstlinepat, firstlinesyl))

def get_line_info():
    line_info =[]
    for i in range(3):
        if i != 1:
            tempPat=random.choice(pos_patterns5)
            line_info.append([tempPat, random.choice(pattern_syl5[tempPat])])
        else:
            tempPat=random.choice(pos_patterns7)
            line_info.append([tempPat, random.choice(pattern_syl7[tempPat])])
    return line_info



line_info = get_line_info()
for line in line_info:
    print(make_line(line[0], line[1]))

    


error in get word pepto NN 3
ham what makes 
the spam visit mom the spam 
error in get word porcine NN 2
ham maybe not 


In [26]:
if 'pepto' in pos_dict['NN']:
    print('uh-oh')
else:
    print("safe")

uh-oh


### Process each line of sample

Determine the arrangement of syllables

Determine the parts of speech of each word

### Pick a pattern for each line

Choose a syllable sequence for the 1st, 2nd and 3rd line

### Seed the poems 1st word of syllable length from pattern

### Use chain to pick next word with correct number of syllables

In [17]:
print(pos_dict['NN'][:12])

['spam', 'bedroom', 'door', 'face', 'mass', 'pass', 'slice', 'pink', 'tender', 'morsel', 'salty', 'gel']


### Functions used to create pickled dictionaries

In [None]:
# # Make pos patterns from haiku
# pos_patterns =  defaultdict(list)
# pos_patterns_markov_data = []
# haiku_pat = []
# for haiku in haiku_list:
#     for line in haiku:
#         l=""
#         line = line.strip()
#         for word in line.split():
#             word = clean_word(word)
#             try:
#                 l += pos_tag([word])[0][1]+" "
#             except:
#                 #I discovered that word is "" sometimes, which raises an error
#                 #print("error when building l", word)
#                 pass
#         haiku_pat.append(l)
#         if l in pos_patterns:
#             pos_patterns[l] += 1
#         else:
#             pos_patterns[l] = 1
#     pos_patterns_markov_data.append(l)

In [14]:
# Make pos pattern - syllable information dict
pos_patterns5 =  []
pos_patterns7 =  []
pattern_syl5 = defaultdict(list)
pattern_syl7 = defaultdict(list)
#pos_patterns_syl_markov_data_syl = []
for haiku in haiku_list:
    for line in haiku:
        
        l=""
        syl_pattern = []
        line = line.strip()
        for word in line.split():
            word = clean_word(word)
            try:
                l += pos_tag([word])[0][1]+" "
                syl_pattern.append(get_syllable_count(word))
            except:
                #I discovered that word is "" sometimes, which raises an error
                #print("error when building l", word)
                pass
        if sum(syl_pattern) == 5:
            pattern_syl5[l].append(syl_pattern)
            pos_patterns5.append(l)
        if sum(syl_pattern) == 7:
            pattern_syl7[l].append(syl_pattern)
            pos_patterns7.append(l)


In [80]:
import random
       
def get_word(pos, count, prevword):
    for i in range(200):
        try:
            word = random.choice(mc[clean_word(prevword)]) if prevword else random.choice(pos_dict[pos])
            if get_syllable_count(clean_word(word)) == count:
                return clean_word(word)
        except:
            print("mc error from preword: ", prevword)
            print('nate' , word, pos , count)
            return("ham")
    return('spamku')
#print(get_word('NN', 3, False))
#print(get_word('NN', 2, 'spam'))
def make_line(line_pattern, line_syl):
    l=""
    prev_word=False
    #print("make line: ", line_pattern, line_syl)
    #print("length of  line pattern: ", len(line_pattern.split()), "len of line syl: ", len(line_syl))
    for pos, count in zip(line_pattern.split(), line_syl):
        #print("I'm here:", pos, count)
        word = get_word(pos,count, prev_word)
        l += word + ' '
        prev_word = word
    return l
#print(make_line(firstlinepat, firstlinesyl))

def get_line_info():
    line_info =[]
    for i in range(3):
        if i != 1:
            tempPat=random.choice(pos_patterns5)
            line_info.append([tempPat, random.choice(pattern_syl5[tempPat])])
        else:
            tempPat=random.choice(pos_patterns7)
            line_info.append([tempPat, random.choice(pattern_syl7[tempPat])])
    return line_info

def generate_spamku(num_wanted):
    try:
        for i in range(num_wanted):
            line_info = get_line_info()
            for line in line_info:
                print(make_line(line[0], line[1]))
            print()
    except:
        print("\nError.  Restarting...\n")
        generate_spamku(num_wanted-i)
        pass
              

generate_spamku(5)   


spam i will he ate 
low on my friend i perform 
blew it taste not no 

aisle i force to your 
cans of hideous spamku 
three and fart in this 

burst your pork taste charles 
gotta have no food was no 
king of tang viscous 

spam dies on the vibe 
mc error from preword:  helpings

Error.  Restarting...

me i heed spamku 
what was wrong with the fridge one 
spamku i 

if spam te spamku whats 
freud spamku in the desert 
the toilet it is 



In [41]:
generate_spamku(30)

tricky pink greasy 
when pigs fly fish pig lagoons 
wont touch this spam a 

nine days i wonder 
well later postal i want 
sham has no fuzzy 

blue can as a man 
luscious spamku has spam and 
piggy cry greasy 

hockey spamku ballpoint 
spam has problems large snake was 
heart million swine were 

to knowledge spamku 
see next patient ms spam cans 
life turned to suffer 

for your spam maybe 
while eating spam crust first cold 
tin of spam them some 

particular i 
raw bimbo i was really 
hanging out excess 

here in the full of 
annoying spamku sure you 
thou hid her spam spam 

flies what the long too 
i ever understand you 
honey flowing to 

demented spam melt 
joke for reading i have you 
pink spamku 

hungry cry out for 
just trailer park trash saddam 
whole masters spamku 

love alone with food 
what day and south west north east 
a one more vile than 

to explain to do 
matter what is heavily 
just sitting at night 

youth charged chuck spamku 
i lather spamku is the 
lies palat

UnboundLocalError: local variable 'word' referenced before assignment

In [45]:
generate_spamku(30)

fingers dance and flushed 
of spam gods angel sing the 
fish bait the little 

eat one expected 
hanging spamku makes good 
battle then go to 

thirteen months mom the 
a whiff spamku all hang spam 
with spam is maps tell 

bad hair tin bong jim 
still looks like spam gay pink and 
my ear you squealing 

hah while i shot the 
vodka deaths pigs take it to 
beautiful weather 

pork bits of the milk 
says spam lump oozing pleasant 
you can fit for years 

wag the hormel spam 
i await you spam oh if 
square loaf twentieth 

conservation of 
you to fly in can give me 
dick where is truly 

and thong spam can of 
spam in a real bad but she 
reproduces spam 

public pinched a man 
feeling very good its not 
mountain the third world 

comfortable sleep 
crime was a can and spam is 
kisses are meat oh 

bile are you the can 
spam trails spamku of tasting 
you know their mentors 

taken spamku all 
of money consume only 
to glory i fry 

blew the pink goo wake 
santa left of any place 
to call u

UnboundLocalError: local variable 'word' referenced before assignment

In [56]:
mc['fading']

KeyError: 'fading'

In [57]:
get_syllable_count('fading')

2

In [75]:
generate_spamku(5)

it and be a just 
murder bought spamku sex pressed 
clinton repeat spam 

saves nine spam the road 
mc error from preword:  olde

Error.  Restarting...
as i must vote for 
pork cold running spamku 
it in space food real 

i thrust spam loaves and 
created for cherry tree 
spam is hard workers 

in spam when shadows 
and peas high upon my yeast 
altitude spam and 

spam meat heavy and 
sweet spam spam cans on top with 
spam cold enough spam 

anticipates the 
beantown e f loser must 
all boulder on my 



In [55]:
generate_spamku(30)

and pork in a time 
fried in the net greasy pink 
the virtue of spam 

my pillow marks the 
instead spamku 
spam man has no pane 

always lick spam suit 
loser mistaken spamku 
meaty and let the 

eat it at me spam 
sticky spamku star wars have 
table no more sense 

silent suspicions 
headlines state zoning spamku 
not a ship lands with 

until you getting 
what the room for meat the spam 
like to the farm while 

spam it for midnight 
one two in a lump of spam 
cough spamku reach for 

me with de feet am 
how do you for revenge spam 
oneness wet poop is 

repulsive is maps 
larry lowe spamku i can 
the john open the 

he blows close walden 
with pink soldiers in the moon 
bowel tornado 

feed from blue grass spam 
their really knows answer was 
enigmatic box 

it twitches slightly 
they live love and ends spamku 
step in a pink guy 

pink meat treat open 
ish spamku no more spam morning 
are just find new hit 

will spam spam its a 
dirty ape butler spamku 
lube for spam ask me 

me 

UnboundLocalError: local variable 'word' referenced before assignment

In [31]:
# Make pos_dict from spamku

import string
import sys
c=0
pos_dict=defaultdict(list)
for word in text.split(' '):
    try:
        c+=1
        word = clean_word(word) 
        if c % 5000 == 0:
            print(c, word)
        tok = pos_tag([word])
        # This line is here to throw an error if we don't have a syllable count for this word
        syl = get_syllable_count(word)
        pos_dict[tok[0][1]].append(tok[0][0])
    except:
        #print("SYL ERROR", tok, word)
        continue
print('***************  Done  ***************')

5000 ive
10000 to
15000 bill
20000 western
25000 below
30000 deadly
35000 ate
40000 godzilla
45000 was
50000 their
55000 the
60000 rises
65000 love
70000 were
75000 stock
80000 stronger
85000 pan
90000 a
95000 an
100000 i
105000 ammo
110000 blend
***************  Done  ***************


In [43]:
if 'porcine' in pos_dict['NN']:
    print('uh-oh')
else:
    print("safe")

safe


In [32]:
pickle.dump(pos_dict, open("pos_dict.p", "wb"))
pickle.dump(pos_patterns, open("pos_patterns.p", "wb"))
pickle.dump(pos_patterns5, open("pos_patterns5.p", "wb"))
pickle.dump(pos_patterns7, open("pos_patterns7.p", "wb"))
pickle.dump(pattern_syl5, open("pattern_syl5.p", "wb"))
pickle.dump(pattern_syl7, open("pattern_syl7.p", "wb"))


In [None]:
pos_dict.keys()

In [None]:
len(pos_dict['NN'])

In [None]:
NNfreq = {x:pos_dict['NN'].count(x) for x in pos_dict['NN']}

In [None]:
#NNfreqKeys = list(NNfreq.keys())
#NNfreqKeys[:20]
NNfreq['spam']

In [None]:
try:
    print butt
except:
    print(err)
    print("in except block")
    pass
print(burt)

In [None]:
>>> a = [1,1,1,1,2,2,2,2,3,3,4,5,5]
>>> d = {x:a.count(x) for x in a}
>>> d
{1: 4, 2: 4, 3: 2, 4: 1, 5: 2}
>>> a, b = d.keys(), d.values()
>>> a
[1, 2, 3, 4, 5]
>>> b
[4, 4, 2, 1, 2]

In [None]:
def clean_word(word):
    word= word.translate(str.maketrans('', '', string.punctuation))
    word=word.lower()
    return word

In [12]:
# f=open("rejected_words.py", 'r')
# rejects=[]
# for word in f:
#     word = clean_word(word)
#     rejects.append(word)
# rejects=set(rejects)
# f.close()
# f=open("clean_rejects.txt", 'a')
# for word in rejects:
#     f.write(word.strip()+':  ,\n')
# f.close()


In [None]:
# pattern_prob = []
# for k,v in pos_patterns.items():
#     for i in range(v):
#         try:
#             if v != "":
#                 pattern_prob.append(k)
#         except:
#             print("error: ", k, v)
# num_diff_patterns = len(pattern_prob)

# syllable_lengths = {
#     'VB': 1,
#     'DT': 1,
#     'RB': 2,
#     'NN': 5,
#     'VBG': 3,
#     'IN': 1,
#     'NNS': 4,
#     'PRP$': 1,
#     'CC': 1,
#     'JJ': 2,
#     'WP': 1,
#     'VBZ': 1,
#     'PRP': 1, 
#     'VBN': 2,
#     'JJS': 2,
#     'MD': 1, 
#     'TO': 1,
#     'VBD': 2,
#     'VBP': 2,
#     'WRB': 1,
#     'CD': 1, #cardinal digit, could be more
#     'RBR': 2,
#     'JJR': 2,
#     'WDT': 1,
#     'WP$': 1
# }


# # Maybe do a probability analysis on these?
# # CC coordinating conjunction
# # CD cardinal digit
# # DT determiner
# # EX existential there (like: “there is” … think of it like “there exists”)
# # FW foreign word
# # IN preposition/subordinating conjunction
# # JJ adjective ‘big’
# # JJR adjective, comparative ‘bigger’
# # JJS adjective, superlative ‘biggest’
# # LS list marker 1)
# # MD modal could, will
# # NN noun, singular ‘desk’
# # NNS noun plural ‘desks’
# # NNP proper noun, singular ‘Harrison’
# # NNPS proper noun, plural ‘Americans’
# # PDT predeterminer ‘all the kids’
# # POS possessive ending parent’s
# # PRP personal pronoun I, he, she
# # PRP$ possessive pronoun my, his, hers
# # RB adverb very, silently,
# # RBR adverb, comparative better
# # RBS adverb, superlative best
# # RP particle give up
# # TO, to go ‘to’ the store.
# # UH interjection, errrrrrrrm
# # VB verb, base form take
# # VBD verb, past tense took
# # VBG verb, gerund/present participle taking
# # VBN verb, past participle taken
# # VBP verb, sing. present, non-3d take
# # VBZ verb, 3rd person sing. present takes
# # WDT wh-determiner which
# # WP wh-pronoun who, what
# # WP$ possessive wh-pronoun whose
# # WRB wh-abverb where, when

# def choose_target_syllable_count(pattern, number_of_syllables_wanted):
#     syllable_counts = []
#     possible = False
#     pattern_len = len(pattern.split())
#     max_per_word = number_of_syllables_wanted - pattern_len
#     for i in range(len(pattern.split())):
#         while possible == False:
#             x = randint(1, syllable_lengths[pattern.split()[i]])
#             print(x)
#             if x <= max_per_word:
#                 possible = True
#         syllable_counts.append(x)
#         number_of_syllables_wanted -= x
#         pattern_len -= 1
#         max_per_word = number_of_syllables_wanted - pattern_len
#         print('***********************', syllable_counts)
#     return syllable_counts

# #pass in poem, pattern, 
                    
# def build_line(line, pattern, max_length):
#     if len(pattern) == 0:
#         return line
#     else:
#         pos = pattern[-1]
#     try:
#         new_max_length = max_length
#         #print(max_length)
#         word = random.choice(pos_dict[pos])
#         new_max_length = max_length - get_syllable_count(word)
#         if new_max_length >= 0:
#             line = word + ' ' + line
#         else:
#             new_max_length = max_length
#     except:
#         print("funny word: ", word)        
#     #if max_length >= 0:
#     pattern=pattern[:-1]
#     return build_line(line, pattern, new_max_length)

## word1.capitalize()        
        
# line1text=""
# word=""
# c=0
# for pos in line1:
#     word=""
#     possible = False
#     try:
#         while possible == False:
#             word = random.choice(pos_dict[pos])
#             if get_syllable_count(word) = l1[c]
#                 line1text += word " "
#                 c += 1
#                 possible = true
#         except:
#             print("Unexpected error when adding to dict:", sys.exc_info()[0])
#             print("pos:", pos, l1[c], word)
    