In [1]:
%load_ext autoreload
%autoreload 2
from src.data_processing import load_data
import itertools
import string

from src.ngrams import *
from src.string_similarity import levenshtein
import operator
from src.data_processing import print_progress
from nltk import word_tokenize, pos_tag
from src.data_processing import load_cmu
from listener.ipatoarpabet import translate
from string import punctuation

In [13]:
from listener.ipatoarpabet import translate as ph_translate
from src.ngrams import *
from src.pronunciations import get_closest_sounding_words

In [3]:
cmu = load_cmu()

In [10]:
with open("data/punderstanding.json") as f:
    puns = json.load(f)

In [24]:
def translate(context, pun_word, ngram_length=3):
    
    def phonetic_translation(pun_word):
        phonemes = []
        for word in pun_word.split():
            if word in cmu:
                pun_phone = cmu[word][0]
            else:
                pun_phone = ph_translate(word)[0].split()
            phonemes.extend(pun_phone)
        return phonemes
    
    def switch_score(distance, frequency_difference):
        if frequency_difference == 0:
            return 0
        return frequency_difference / ((distance + 1) ** 2)
    
    pun_phone = phonetic_translation(pun_word)
    
    def phonetic_distance(cand_phone, switch_phone):
        
        try:
            if switch_word in cmu:
                switch_phone = cmu[switch_word][0]
            else:
                switch_phone = ph_translate(switch_word)[0].split()
        except:
            pass
        return levenshtein(cand_phone, switch_phone)
    
    words = context.split(" ")
    window = len(pun_word.split())
    try:
        for i in range(len(context.split())):
            if all(context.split()[i+w] == pun_word.split()[w] for w in range(window)):
                pun_index = i
                break
#         pun_index = words.index(pun_word)
    except ValueError as e:
        return []
    
    
    original_frequency = ngram_frequency(words[pun_index-1:pun_index+(ngram_length-1)])
#     substituions = get_three_gram_wildcard(words[pun_index-1], words[pun_index+1])
    similar_words = get_closest_sounding_words(pun_word)
    grams = [[words[pun_index-1], sim_word, words[pun_index+window]] for sim_word in similar_words]
    print("Generating substitutions")
    print(grams)
    substituions = {' '.join(gram): ngram_frequency(gram) for gram in grams}
    ranked_candidates = []
    
    print("Generating scores")
    for i, (sub, freq) in zip(range(len(substituions)), substituions.items()):

        switch_word = sub.split()[1]
        ph_d = phonetic_distance(pun_phone, pun_word)
    
        if ph_d <= 2:
            freq_d = freq - original_frequency
            ranked_candidates.append((switch_word, switch_score(ph_d, freq_d)))

    return list(sorted(ranked_candidates, key=operator.itemgetter(1), reverse=True))

In [16]:
translate()

TypeError: translate() missing 2 required positional arguments: 'context' and 'pun_word'

In [25]:
results = []
for i, p in enumerate(puns):
    print("{} / {}".format(i, len(puns)))
    print(p)
    r = translate(p['pun'],p['pun_word'])
    if len(r) > 0:
        results.append(r[0][0])
    else:
        results.append('')

0 / 75
{'pun': 'my wedding was so beautiful the cake was in tiers .', 'pun_word': 'tiers', 'target_word': 'tears'}
T/IY/R/Z
Generating substitutions
[['in', 'shears', '.'], ['in', 'teas', '.'], ['in', 'teens', '.'], ['in', "t's", '.'], ['in', 'tears', '.'], ['in', 'teams', '.'], ['in', "lear's", '.'], ['in', 'tees', '.'], ['in', "beer's", '.'], ['in', 'teems', '.'], ['in', 'tease', '.'], ['in', 'jeers', '.'], ['in', "team's", '.'], ['in', 'tier', '.'], ['in', "weir's", '.'], ['in', 'tours', '.'], ['in', "tour's", '.'], ['in', 'seers', '.'], ['in', 'biers', '.'], ['in', 'hears', '.'], ['in', "shear's", '.']]
Generating scores
tears: 1, 103797, 25949.25
1 / 75
{'pun': 'bladder infection ? urine trouble .', 'pun_word': 'urine', 'target_word': "you're in"}
Y/ER/AH/N
Generating substitutions
[['?', 'yearn', 'trouble']]
Generating scores
2 / 75
{'pun': 'i tried to catch some fog but i mist .', 'pun_word': 'mist', 'target_word': 'missed'}
M/IH/S/T
Generating substitutions
[['i', 'misty', '.']

Generating substitutions
[['he', 'pasta', '.'], ['he', 'castaway', '.'], ['he', 'pastas', '.']]
Generating scores
15 / 75
{'pun': "did you hear about the kidnapping at school ? It's okay . He woke up .", 'pun_word': 'kidnapping', 'target_word': 'kid napping'}
K/IH/D/N/AE/P/IH/NG
Generating substitutions
[['the', 'kidnaping', 'at'], ['the', 'kidnaped', 'at'], ['the', 'kidnaps', 'at'], ['the', 'kidnap', 'at'], ['the', 'kidnapped', 'at'], ['the', 'kidnappers', 'at'], ['the', 'kidnapper', 'at'], ['the', 'kidnappings', 'at']]
Generating scores
16 / 75
{'pun': "instead of 'the John' I call my toilet 'the Jim' . That way it sounds better when I say I go to the Jim first thing every morning .", 'pun_word': 'Jim', 'target_word': 'gym'}
JH/IH/M
Generating substitutions
[['the', 'tim', 'first'], ['the', "jim's", 'first'], ['the', 'whim', 'first'], ['the', 'hymn', 'first'], ['the', 'jill', 'first'], ['the', 'rim', 'first'], ['the', 'jimmy', 'first'], ['the', 'gym', 'first'], ['the', 'vim', 'first'

Generating scores
24 / 75
{'pun': 'when the smog lifts in los angeles  u c l a .', 'pun_word': 'u c l a', 'target_word': 'you see L.A.'}
Y/UW/S/IY/EH/L/AH
Generating substitutions
[]
Generating scores
25 / 75
{'pun': 'a door is not a door when it is ajar .', 'pun_word': 'ajar', 'target_word': 'a jar'}
AH/JH/AA/R
Generating substitutions
[['is', 'afar', '.'], ['is', 'adar', '.'], ['is', 'jar', '.']]
Generating scores
26 / 75
{'pun': "a bicycle can't stand alone . it's two tired .", 'pun_word': 'two', 'target_word': 'too'}
T/UW
Generating substitutions
[["it's", 'lu', 'tired'], ["it's", 'lou', 'tired'], ["it's", 'pu', 'tired'], ["it's", 'you', 'tired'], ["it's", 'do', 'tired'], ["it's", 'coup', 'tired'], ["it's", 'ta', 'tired'], ["it's", 'lieu', 'tired'], ["it's", 'boo', 'tired'], ["it's", 'tomb', 'tired'], ["it's", 'toot', 'tired'], ["it's", "two's", 'tired'], ["it's", 'shoo', 'tired'], ["it's", 'tool', 'tired'], ["it's", 'dew', 'tired'], ["it's", 'whew', 'tired'], ["it's", 'too', 'tire

P/IY/L/IH/NG
Generating substitutions
[["wasn't", 'peking', 'well'], ["wasn't", 'poling', 'well'], ["wasn't", 'heeling', 'well'], ["wasn't", 'pauling', 'well'], ["wasn't", 'pilling', 'well'], ["wasn't", 'piecing', 'well'], ["wasn't", 'wheeling', 'well'], ["wasn't", 'pearling', 'well'], ["wasn't", 'polling', 'well'], ["wasn't", 'pulling', 'well'], ["wasn't", 'sealing', 'well'], ["wasn't", 'dealing', 'well'], ["wasn't", 'peeking', 'well'], ["wasn't", 'ceiling', 'well'], ["wasn't", 'piling', 'well'], ["wasn't", 'pooling', 'well'], ["wasn't", 'feeling', 'well'], ["wasn't", 'keeling', 'well'], ["wasn't", 'peaking', 'well'], ["wasn't", 'reeling', 'well'], ["wasn't", 'healing', 'well'], ["wasn't", 'kneeling', 'well'], ["wasn't", 'appealing', 'well'], ["wasn't", 'peeping', 'well'], ["wasn't", 'peering', 'well']]
Generating scores
40 / 75
{'pun': 'how does moses make tea ? hebrews it .', 'pun_word': 'hebrews', 'target_word': 'he brews'}
HH/IY/B/R/UW/Z
Generating substitutions
[['?', 'hebrew', '

Generating scores
55 / 75
{'pun': 'he who farts in church sits in own pew .', 'pun_word': 'pew', 'target_word': 'few'}
P/Y/UW
Generating substitutions
[['own', 'queue', '.'], ['own', 'peru', '.'], ['own', 'pugh', '.'], ['own', 'q', '.'], ['own', 'u', '.'], ['own', 'few', '.'], ['own', 'ewe', '.'], ['own', 'pu', '.'], ['own', 'hugh', '.'], ['own', 'cue', '.'], ['own', 'you', '.'], ['own', 'spew', '.'], ['own', 'pooh', '.'], ['own', 'hue', '.'], ['own', 'pews', '.'], ['own', 'hew', '.'], ['own', "pugh's", '.'], ['own', 'puke', '.'], ['own', 'mew', '.'], ['own', 'yew', '.'], ['own', 'view', '.']]
Generating scores
56 / 75
{'pun': "i'd make a veggie joke but no one would carrot all .", 'pun_word': 'carrot', 'target_word': 'care at'}
K/AE/R/AH/T
Generating substitutions
[['would', 'carat', 'all'], ['would', 'cassatt', 'all'], ['would', 'karat', 'all'], ['would', 'cabot', 'all'], ['would', 'carroll', 'all'], ['would', 'carrots', 'all'], ['would', 'carrel', 'all'], ['would', 'carol', 'all'], 

IndexError: list index out of range

In [32]:
acc = 0
acc2 = 0
tot = 0
for r, p in zip(results, puns):
    print(r, p['target_word'])
    acc += int(r == p['target_word'])
    if len(p['target_word'].split()) == 1:
        acc2 += int(r == p['target_word'])
        tot += 1

tears tears
yearn you're in
missed missed
time tie
pressed arrest
raising raising
appall them all
 prince
ordeal ordeal
 so it seems
week week
souls souls
sheets shit zoo
tyson bye son
pastas passed away
kidnappers kid napping
gym gym
profit profit
last least
for for
invert in verse
planned plan it
closed closed
pale pale
 you see L.A.
afar a jar
do too
waste waste
spreadsheet spread sheets
debt do it
sea sea
antares auntie freeze
intent innocent
standing out standing
 an other
titian fish and
property property
 Napolean Bonaparte
mistake mistake
pearling feeling
highbrows he brews
 her before
doesn't doesn't it
self self
otto ought to
merry daring
tire attire
maelstrom main stream
upon upon
mini me know
 transcendental meditation
tuner ten or
granted granted
naught not
ad ass
view few
cassatt care at
africans a frequent
gonna gonna


In [33]:
acc / len(puns), acc2 / tot

(0.25333333333333335, 0.6129032258064516)