In [1]:
from src.data_processing import load_data
import itertools
import string

from src.ngrams import *
from src.string_similarity import levenshtein
import operator
from src.data_processing import print_progress
from nltk import word_tokenize, pos_tag
from src.data_processing import load_cmu
from src.ipatoarpabet import translate
from string import punctuation
from src.pronunciations import phonetic_distance
import os
from pattern.en import lexeme
from src.pronunciations import get_closest_sounding_words as csw

In [2]:
from src.pun_algorithms import *

Loading Model, this could take a while...


In [3]:
with open("data/ngram_searchspace/ngram_totals2.json") as f:
    search_space = json.load(f)

In [4]:
task1, task2, task3, min_pairs, strings, pun_strings = load_data()

In [4]:
len(search_space)

1780

(66639115, 7263)

In [14]:
def classification_accuracy(run):
    tp, fp, tn, fn = 0, 0, 0, 0
    
    for i in range(len(task1)):
        gold_pun = task1[i]['pun']
        if run[i] and gold_pun:
            tp += 1
        if not run[i] and not gold_pun:
            tn += 1
        if run[i] and not gold_pun:
            fp += 1
        if not run[i] and gold_pun:
            fn += 1
    
    results = {}
    results['acc'] = (tp + tn) / len(task1)
    results['recall'] = tp / (tp + fn)
    results['prec'] = tp / (tp + fp)
    results['f1'] = ((2*(results['recall'] * results['prec']))
                      / (results['recall'] + results['prec']))
    
    return results

# No Trigram Baseline

In [15]:
def no_trigram_baseline(index):
    current_context = search_space[index]
    for original_trigram in current_context:
        if current_context[original_trigram]['original_frequency'] == 0:
            return True
    return False

In [16]:
results = [no_trigram_baseline(index) for index in range(len(search_space))]

In [17]:
classification_accuracy(results)

{'acc': 0.7825842696629214,
 'f1': 0.8457552809884415,
 'prec': 0.8570274636510501,
 'recall': 0.8347757671125098}

In [11]:
ngram_frequency('a staring contest'.split())

4058

# No Quadgram Baseline

In [10]:
all_quadgrams_in_task1 = set()
for context in task1:
    words = context['words']
    for i in range(len(words)-3):
        all_quadgrams_in_task1.add(' '.join(words[i:i+4]))

In [11]:
all_quadgrams_in_task1 = list(sorted(all_quadgrams_in_task1))

In [None]:
groupings = defaultdict(list)
for quadgram in all_quadgrams_in_task1:
    groupings[quadgram.split()[0][:3]].append(quadgram)

In [103]:
def quad_to_dict(text):
    ret = {}
    for line in text.split('\n'):
        lsplit = line.split()
        try:
            ret[' '.join(lsplit[:4])] = int(lsplit[4])
        except:
            pass
    return ret

In [112]:
quadgram_frequencies ={}
for i, beginning_letters in enumerate(groupings):
    try:
        ngram_output = subprocess.check_output(['zcat', get_gram_file(beginning_letters, 4)]).decode('latin-1')
    except:
        pass
    quad_dict = quad_to_dict(ngram_output)
    for subquads in groupings[beginning_letters]:
        try:
#             print(subquads)
            quadgram_frequencies[subquads] = quad_dict[subquads]
        except KeyError:
            pass
    print_progress(i, len(groupings))

 |████████████████████████████████████████████████████████████████████████████████████████████████████| 99.9% 

In [115]:
with open("data/quadgram_frequencies.json") as f:
    quadgram_frequencies = json.load(f)

In [13]:
def no_quadgram_baseline(index):
    context = task1[index]['words']
    for i in range(0, len(context)-3):
        if ' '.join(context[i:i+4]) not in quadgram_frequencies:
            return True
    return False

In [137]:
results = [no_quadgram_baseline(i) for i in range(len(task1))]

In [138]:
classification_accuracy(results)

{'acc': 0.7893258426966292,
 'f1': 0.8700173310225303,
 'prec': 0.7775712515489467,
 'recall': 0.987411487018096}

In [11]:
all_frequencies = defaultdict(int)
for context in search_space:
    for og, sub in context.items():
        all_frequencies[og] = sub['original_frequency']
        for ssub, f in sub['substitutions'].items():
            all_frequencies[ssub] = f

# All Trigrams, No Position

In [6]:
def score(original_frequency, new_frequency, original_word, new_word, position, ph_penalty=2):
    return ( (new_frequency - original_frequency)
           * ((phonetic_distance(original_word, new_word, translated=True)**ph_penalty) 
           * position)) # pos is normalised

In [7]:
def single_score(original_trigram, new_trigram, ph_penalty=2):
    original_freq = all_frequencies[original_trigram]
    new_freq = all_frequencies[new_trigram]
    og_word, new_word = original_trigram.split()[1], new_trigram.split()[1]
    return score(original_freq, new_freq, og_word, new_word, 1, ph_penalty)

In [8]:
def sort_answers(unsorted_dict):
    sd = {}
    for k, d in unsorted_dict.items():
        sd[k] = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return sd

In [9]:
accepted_pos = {'ADV', 'ADJ', 'VERB', 'NOUN'}
def rank_substitutions(index):
    
    full_path = "results/{}/{}".format(path, index)
    
#     if os.path.exists(full_path):
#         print(index)
#         with open(full_path) as f:
#             res = json.load(f)
#         return res
    
    space = search_space[index]
    context = task1[index]['words']
    
    # takes in list of subs, context is list of words
    res = defaultdict(dict)
    context_length = len(context)

    for trigram, candidate in space.items():

        # No Pos experiment, set to 1
        position = context.index(trigram.split()[1])
        end_position = context_length - position
        
        # take position and normalise it wrt length of context
        if use_position:
            normal_position = position / context_length
        else:
            normal_position = 1
        
        original_freq = candidate['original_frequency']     
        original_word = trigram.split()[1]
        
        if original_word in cmu:
            original_ph = cmu[original_word][0]
        else:
            # skip words not in new cmu
            continue
         
        if use_filter:
            phoneme_filter = set(csw(original_word))
            
        lexemes = lexeme(original_word)
        
        for sub, new_freq in candidate['substitutions'].items():
            
            new_word = sub.split()[1]
            
            
            if use_filter:
                if new_word not in phoneme_filter:
                    continue
            
            # ignore lexical derivatives
            if new_word in lexemes:
                continue
            
            new_context = [w for w in context]
            new_context[position-1:position+2] = sub.split()
            
            
            if new_word in cmu:
                new_ph = cmu[new_word][0]
            else:
                # skip words not in new cmu
                continue
            
            if any([w in string.punctuation for w in new_word]):
                continue
                
            tags = ([w[1] for w in 
                     pos_tag(new_context, tagset='universal')])
            
            if tags[position] not in accepted_pos:
                continue

            s = score(original_freq, 
                      new_freq, 
                      original_ph,
                      new_ph,
                      normal_position,
                      ph_penalty=penalty)
            
            res[trigram][sub] = s
            
            
        # might not need to write these to file, but write whole thing to file instead
    with open(full_path, 'w') as f:
        json.dump(sort_answers(res), f, indent=4)
    
    return sort_answers(res)        

In [21]:
import time

use_position = False
use_filter = False
penalty=8
path = "all_trigram_no_pos-ph-8"

before = time.time()
p = Pool(4)
ngram_search_space = p.map(rank_substitutions,   range(len(task1)))
length = time.time() - before

print("Total time taken in seconds: {}".format(length))

Total time taken in seconds: 7112.115962028503


Process ForkPoolWorker-16:
Process ForkPoolWorker-14:
Process ForkPoolWorker-15:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
Process ForkPoolWorker-13:
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 

In [11]:
import time
from multiprocessing import Pool

use_position = False
use_filter = True
penalty=8
path = "phonetic_filter_no_pos-ph-8"

before = time.time()
p = Pool(4)
ngram_search_space = p.map(rank_substitutions,   range(len(task1)))
length = time.time() - before

print("Total time taken in seconds: {}".format(length))

D/AO/T/ER
G/IY/K
AH/L/EH/JH/D
D/IH/S/T/R/OY/ER
HH/AE/F/W/EY
D/IH/Z/AY/N/D
P/EH/D/AH/L/D
M/AH/TH/AA/L/AH/JH/IY
SH/IH/P/B/IH/L/D/ER
AE/M
EH/S
HH/OW/M/OW/S/EH/K/SH/AH/W/AH/L
T/R/AY/D
K/UW/L/IY
IH/Z
G/EY/L/IY
N/UW/L/IY
L/EY/B/ER/ER
W/AY/Z
K/W/IH/R/IY/D
IH/K/S/T/R/IY/M/L/IY
OY/L
F/AY/N
B/OW/T/S
S/EY/L/ER/Z
D/EH/D
B/ER/D/Z
S/IH/K/AH/F/AE/N/T/IH/K
AO/F/AH/N
B/EY/B/IY/Z
M/EY/K
P/L/EH/ZH/ER
K/AA/M/P/L/AH/M/EH/N/T
L/AH/M/B/AA/R
S/K/W/IY/Z/IH/NG
P/R/AE/K/T/AH/S
K/AA/N/SH/AH/S
B/EY/B/IY
F/ER/S/T
B/AO/R/D
R/AH/K/R/UW/T
K/AH/M
W/IY/V
S/T/IH/F
W/IH/D/TH/S
D/AH/Z
K/IH/L/ER
AA/R
AO/R/AH/F/AH/S
L/UH/K
K/AA/R/P/AH/N/T/ER
HH/AY/T/S
P/AH/L/IY/S
DH/EH/N
L/IY/P
W/AY/L/D/IH/B/IY/S/T/S
W/IH/L
B/AA/D/IY
P/R/OW/G/R/AE/M
B/AE/D
V/AA/R/N/IH/SH
S/IH/R/IY/AH/L
G/AH/N/UW/Z
EH/D/AH/T/ER
S/EH/D/IH/M/EH/N/T/L
K/W/AY/T
S/AH/G/JH/EH/S/T/AH/D
W/EY/T
B/IY
B/AA/R/B/ER
S/T/AA/R/T/AH/D
W/UH/D
G/OW
HH/EH/R/IY/D
G/IH/V/Z
R/IH/D/AE/K/SH/AH/N
L/AE/N/D
IH/Z
F/ER/S/T
IH/Z
W/UH/D
AY/R/AH/S
HH/AE/NG
B/ER/D/Z
G/UH/D
D/AO/G
IH/M/P/R/UW/

M/Y/UW/Z/IH/K/AH/L
P/ER/F/EH/K/T
L/AO/N
Y/UW/ZH/AH/W/AH/L/IY
SH/AA/P
S/P/R/IH/NG/Z
K/AE/M/AH/L/Z
D/R/IY/M/Z
R/IH/L/IY
D/IH/F/EH/N/S/IH/V
AE/N/ER/K/AH/S/T
HH/AE/V
K/IH/L/N
HH/AE/D
S/W/IY/T
P/EY/P/ER/Z
P/AA/T/ER/IY
T/OW/D
AA/R
G/R/EY/D/AH/D
JH/AH/S/T
AA/R
K/AA/R
T/EH/N
T/AY/M
F/ER/M/L/IY
M/AH/D
S/IY/K/R/AH/T
M/EY/K
R/EH/D
B/IY
S/T/R/EY/N/JH
V/EY/N/Z
P/IY/R/IY/AA/D/IH/K/AH/L/IY
K/AE/M/ER/AH
B/AE/T
V/EH/R/IY
P/IH/K/CH/ER
B/EH/L/AH/K/OW/S
HH/AE/D
M/AE/G/AH/Z/IY/N/Z
K/AE/N/D/IY/D
K/AE/S/AH/L
W/ER
AE/S/K/T
F/EH/L
HH/AE/D
S/AY/AH/N/T/IH/S/T
T/AA/F/IY
G/AH/V/ER/N/ER
B/L/AH/D
HH/AE/V/IH/NG
V/EH/R/IY
K/W/AO/R/AH/L/S/AH/M
M/AE/N/SH/AH/N
L/AO/NG/Z
F/R/UW/G/AH/L
S/T/AA/R
EH/S
S/K/UW/L
D/EH/R/D
S/IH/R/IH/AH/S
EH/S
T/AY/M
K/W/AY/T
S/EH/D
N/OW/N
D/EY/Z
AY/D/IY/AH
HH/AH/Z/B/AH/N/D
M/EY/D
F/AA/R/S
K/AE/P/IH/T/AH/L
IH/S/T/EY/T
N/AA/T
IH/K/S/EH/S/IH/V
L/AA/R/JH
W/AO/T/ER
D/AH/B/AH/L
K/R/IH/T/AH/S/AY/Z/D
M/AE/N/SH/AH/N
EY/N
EH/N/AH/M/IY
F/R/IY/K/W/AH/N/T/L/IY
B/AE/D/L/IY
T/ER/M/AH/N/AH/L
F/R/EH/N/D
N/AA/T
D

D/AO/G
OW/M/AH/N
R/OW/L/Z
Y/UW/Z/IH/NG
V/Y/UW
B/ER/D
M/EH/N/IY
M/AW/TH
P/IY/P/AH/L
P/IH/JH/AH/N
G/AA/R/L/IH/K
L/IH/S/P
S/P/EH/L/Z
K/AE/N
IH/M/P/L/AH/M/AH/N/T
W/EY/T
G/IH/V/Z
S/EH/D
K/IH/CH/AH/N
AA/R
D/IH/F/EH/N/D/AH/D
P/EY/SH/AH/N/T/L/IY
Y/UW/S/AH/Z
JH/EH/N/ER/AH/S
S/AY/T/IH/NG
G/R/EY/T/ER
IH/Z
V/AO/L/T
AA/R/M/Z
TH/R/OW/IH/NG
AA/R
SH/R/EH/D
B/EH/R
JH/AH/S/T
AA/R
R/AE/SH
EH/S
D/OW/N/AH/T/S
F/AO/L
F/AW/L
B/UH/L/Y/AH/N
W/ER
P/EY
HH/EH/R
M/AO/R
M/EY/K
S/T/AY/L/IH/S/T
HH/IH/L/Z
K/AA/N/T/IH/N/AH/N/T/L/IY
JH/AH/S/T
SH/AA/K
L/IH/B/ER/T/IY/Z
SH/UH/D
S/EH/D
W/EH/N
Y/UW/AA/N
EH/K/S/EH/S
N/IY/D/Z
D/AY/D
K/ER/AH/N/S/IY
S/T/AA/P
IH/Z
S/T/IY/L
EH/S
M/EH/M/ER/IY/Z
D/IH/G/IH/NG
W/IH/N/Z
M/AH/CH
P/AA/R/T/AH/D
IH/N/S/T/R/AH/K/SH/AH/N/Z
IH/Z
F/AE/S/T/ER
F/AE/NG/Z
K/AA/M/P/L/AH/K/EY/T/AH/D
IH/G/N/ER/AH/N/T
M/AE/N
W/EH/L/TH
AE/P/AH/TH/EH/T/IH/K
T/EH/K/N/IH/K/AH/L
B/AE/T/AH/L
JH/AH/S/T
S/T/R/AO/NG/ER
T/AA/K
D/IH/S/AY/F/ER
S/AW/TH
IH/Z
K/AE/N
N/UW
M/AY/T
D/UW
AA/R/T/AH/K/AH/L
W/AA/Z
HH/EH/D/EY/K
B/EY/T/AH
S/T

AE/B/S/IH/N/TH
S/EY
D/AO/T/ER
P/EY/N
N/AY/S
EH/S
R/AA/B/ER
N/AA/T
W/EH/N
V/AY/S
W/AA/Z
SH/IY/P
G/ER/L
N/OW/Z
K/IH/L/N
B/IY
N/OW/W/EH/R
HH/ER/D
M/EY/K/IH/NG
HH/AE/NG/D
M/AO/R/N/IH/NG
K/AH/L/EH/K/SH/AH/N
L/AH/V/D
SH/R/IH/M/P
G/R/UW
JH/AH/S/T
D/EY/T
HH/AA/R/V/AH/S/T
S/ER/CH
T/AY/M
S/T/AE/M/P
SH/IY/P
P/ER/M/IH/T
P/AA/T/ER/IY
L/AO/S/T
T/R/AE/K/IH/NG
L/AE/S/T
AE/D/M/AY/R
G/R/EY/T
D/IH/S/AY/D/IH/D
IH/K/S/P/L/EY/N/D
K/R/EY/Z/AH/L/IY
N/IY/D
T/EH/N/D
G/AE/NG/S/T/ER/Z
HH/AE/D
B/R/EH/D
D/EY/L/IY
L/AY/S/AH/N/S
S/EH/D
M/AH/T
G/OW/Z
M/OW/T/ER/B/AY/K
T/EH/L/IH/K/AH/N/IY/S/IH/S
S/AW/N/D/Z
D/EY/Z
W/EY/T
F/L/AW/ER
S/T/R/EY/N/JH
R/EY/Z
W/IH/S/P/ER/D
W/EY/W/ER/D
HH/IY/R
N/EY/V
AH/P/AA/R/T
F/L/AA/K/S
M/EY/D
B/IH/N
G/AA/R/D/AH/N/IH/NG
P/AH/P
S/IH/N
M/IH/S/T
L/AO/S/T
V/IH/Z/AH/T/AH/D
G/R/OW/N
W/ER
EH/R/IY/AH
P/R/IH/N/S/EH/S
CH/EY/S/IH/NG
D/EY
AH/W/EY
R/UW/M/ER/Z
F/AA/G/IY
S/ER/JH/ER/IY
JH/AH/S/T
R/IY/UW/N/AY/T/IH/D
D/IH/L/EH/M/AH
D/AO/T/ER
S/EY/L
T/AY/M
S/EY
EH/S
P/EH/N/S/AH/L
ER/AY/Z/AH/Z
EH/S
IH/K/S/K/Y/UW/

T/ER/N/D
L/IH/L/IY/Z
K/AE/N/AA/T
HH/AE/V
B/R/EY/K/S
N/EY/V/AH/L
V/AY/AH/L/AH/N/T
SH/EH/L/F/IH/SH
W/AA/Z
B/EH/L/IY
F/AW/N/D
M/AO/R
IH/K/S/P/L/EY/N/D
S/OW
HH/OW/L/IY
S/EH/K/AH/N/D
B/AH/T/AH/N
ER/L/IY
CH/AE/L/IH/S
S/AH/K/S/EH/S/F/AH/L
AO/F/AH/S/ER
L/UW/Z
IH/N/D/IY/D
R/AE/SH/AH/N/AH/L/IY
S/P/R/IH/N/T/ER
N/EH/V/ER
S/EY/K/R/AH/D
S/EH/D
AO/L/W/EY/Z
JH/AH/S/T
L/IH/F/T/AH/D
IH/N/T/AH/JH/ER/Z
S/AY/K/OW/P/AE/TH
W/IH/L/IH/NG/L/IY
HH/AE/D
N/AA/T
EH/S
B/AA/T/AH/M
AE/Z
M/EY/K/S
R/AE/DH/ER
K/EH/R/AH/L/AA/N
N/AH/TH/IH/NG
N/AA/N/S/EH/N/T/S
G/OW/L/D
S/EH/N/T
IH/Z
P/UH/SH/T
AE/B/S/AH/L/UW/T
P/IY/S/AH/Z
P/ER/S/IH/N/IH/L
W/AA/Z
D/AA/L/ER
M/IH/N/T
S/OW/L
G/OW
IH/Z
M/EY/K/S
M/Y/UW/Z/IH/SH/AH/N/Z
N/EH/V/ER
S/EH/D
SH/UH/D
S/W/IH/M/IH/NG
L/UW/Z
T/EH/L/IH/NG/L/IY
P/R/AA/F/IH/T/S
M/IH/N/S/T/R/AH/L
JH/AH/S/T
S/W/IH/NG/Z
K/W/AO/R/T/ER/L/IY
IH/M/IY/D/IY/AH/T/L/IY
JH/IY/N/IY/AA/L/AH/JH/IY
K/AW
IH/K/S/P/EH/K/T
K/R/AE/M/P/S
EH/S
D/IH/S/EH/N/T
AE/NG/K/ER/Z
M/AH/S/AA/ZH
L/EH/T
M/EY
EH/JH/AH/K/EY/SH/AH/N
P/AA/S/CH/ER
IY/T/

L/AO/NG
D/AY
T/EH/S/T/AH/L/IY
V/ER/S
SH/AO/R/T
T/W/AY/S
EH/S
G/R/AE/M/ER
G/R/AE/N/AH/T
S/IY/N
S/IY/F/UW/D
L/ER/N/D
Y/UW/N/IY/K/L/IY
D/AO/T/ER
P/L/EY/T
R/OW/T
R/IH/L/IY
S/EH/D
K/L/AE/M/ER/D
N/AY/S
HH/AE/D
B/R/AA/N/Z
SH/OW/Z
P/IY/S
S/W/AA/L/OW/D
AO/L/W/EY/Z
JH/IY/AA/L/AH/JH/AH/S/T
W/AH/N
W/UH/L
K/Y/UH/R/IY/AH/S/L/IY
K/AA/L/IY/G
M/EH/D/L/IH/NG/L/IY
M/IH/T/AH/N/Z
D/IH/S/K/AH/V/ER/D
S/K/AH/L/P/CH/ER/D
AA/R
R/EY/D/IY/AH/M
SH/OW/N
G/R/EY/T/AH/S/T
M/IH/R/ER/Z
R/IH/M/AA/R/K/T
AE/S/K/T
S/T/AE/N/D
T/IY/CH/IH/NG
S/AH/M/EH/R/IH/L/IY
K/AO/L/D
S/ER/K/AH/S
T/AH/NG
K/AH/N/F/Y/UW/Z/D
AE/D/AH/D
R/AY
HH/AE/D
W/EH/R
N/AW
G/UH/D
F/EY/V/ER/IH/T
B/R/IH/NG
IH/K/S/P/EH/K/T/AH/D
JH/OW
S/AE/N/D/W/IH/CH
IH/L/EH/K/T/R/AA/N/Z
W/AA/T
M/AH/CH
S/M/OW/K/T
R/IY/S/AY/K/AH/L/D
T/ER/N
W/IY/L/Z
EH/S
EH/N/IY/TH/IH/NG
R/IH/T/AH/N
IY/K/W/L/Z
W/AA/Z
D/IH/D
B/EH/L/T
W/IH/NG/K
HH/AE/M/S/T/R/IH/NG
AO/L
W/UH/D
HH/AO/R/S
P/IH/G
AH/P/R/EH/N/T/AH/S
T/AY/M
AE/Z
R/AH/N/IH/NG
S/EY/V/Z
EH/S
D/IH/N/AY
K/W/IH/T
B/L/AY/N/D
T/EY/K
AH/L/UW/M/AH

G/EH/T
M/IY/T
JH/AH/S/T
B/AO/R/D
HH/AY/R/IH/NG
SH/AW/ER
AH/T/R/AE/K/T/AH/D
IH/Z
W/EY/T/S
M/AE/TH
G/AH/V/ER/N/ER/Z
AA/R
M/EH/N/IY
S/P/UW/N
S/T/R/AO/NG/L/IY
AE/S/K/T
M/AE/N
S/AH/P
M/AE/G/N/AH/T
W/AO/R/SH/IH/P
EH/S
L/AO/NG
L/IY/D/Z
W/AY/F
D/IH/N/ER
B/AE/D
S/IY
K/W/AY/AH/T
T/R/EH/N/CH/ER
HH/AE/V
EH/S
HH/AA/R/T
M/EY/K/S
K/AH/S/T/AH/M/Z
EH/S
F/UH/L/IY
W/IH/L
HH/AE/Z
P/ER/F/AO/R/M
W/ER
P/EH/N/IY/Z
EH/S
D/AE/N/S
S/EY/T/IH/D
AY
W/AA/Z
P/AW/N/D/Z
W/AA/Z
L/AY/AH/N
W/IH/N
W/ER/S/T
T/R/EH/ZH/ER/Z
L/EY/D/IY
M/EH/D/AH/L
G/R/EY/V/S/T/OW/N
HH/AE/N/D
OW/K/S
OW/L/IH/M/P/IH/K
M/AY/T/IY
S/ER/JH/AH/N
CH/IH/M/N/IY
G/OW/L/D
G/R/OW
IH/Z
IY/G/AH/L
D/IY
EY/K/AO/R/N/Z
AH/P/IH/N/Y/AH/N
HH/AA/R/T
S/P/AO/R/T
S/M/AO/L
K/R/UW
EH/S
W/ER/K/ER/Z
S/EH/L
K/EY/M
P/EY/D
IH/Z
S/T/OW/L/AH/N
G/R/EY/T
IH/Z
R/IH/P/L/AY/D
AH/F/EH/N/S/IH/V/L/IY
R/IH/M/EH/M/B/ER/D
N/AH/TH/IH/NG
HH/AA/B/IH/T
S/AO/S
G/UH/D/Z
W/EY/L
F/AH/JH
AE/S/K/T
OW/V/ER
W/EH/N/T
S/AH/N/D/EY/Z
M/AA/NG/G/R/L
R/EH/D/IH/NG
K/AE/R/IY
F/AO/R/S
HH/AA/T
M/AH/T/ER/D
P/EY/L


K/W/IH/K
AO/R
P/Y/UH/R/L/IY
K/AH/L/AE/P/S/T
R/IH/L/IY/V/D
K/AH/M/AA/D/AH/T/IY
S/ER/V/AY/V/ER
CH/IH/M/N/IY
AA/R
K/OW/S/T
S/IH/L/V/ER
F/L/UW
IH/N/S/AH/L/T
F/AW/N/D
G/OW/L/D
W/EH/R/IH/NG
G/AA/T
AA/K/S/AH/D/EH/N/T
IY/DH/ER
S/T/EY
K/OW/L/D
K/AA/L/AH/M/N/AH/S/T
L/EY/T
SH/IH/P/R/EH/K/T
W/ER
G/EH/T
AH/DH/ER/Z
N/EH/T/S
Y/AA/R/D
K/IH/L/T
M/EH/T/AH/L
K/AE/S/T
D/EH/D
T/R/AH/D/IH/SH/AH/N/AH/L
B/AE/R/IY/ER
F/IH/SH/AH/Z
K/R/OW
S/T/UH/D
D/IY
G/AA/R/B
K/AO/Z
AH/S/EH/M/B/AH/L/D
D/IH/Z/AY/N/ER
EH/S
B/AA/DH/ER/D
K/L/OW/DH/Z
W/AA/Z
L/OY/AH/L/IY
T/OW/K
HH/AE/D
AO/L/W/EY/Z
P/R/EY
S/AH/P/AO/R/T/ER/Z
JH/EH/N/ER/EY/SH/AH/N
K/AE/N
S/T/EY/SH/AH/N
AE/F/T/ER/N/UW/N
P/R/IH/N/S/AH/P/AH/L/Z
HH/AE/D
IH/N/S/P/EH/K/SH/AH/N
EH/S
F/UH/T/B/AO/L
T/EH/S/T
IH/R/AH/T/EY/T/AH/D
HH/AE/T
T/AH/SH/EY
AA/N/ER/AH/B/AH/L
W/AA/Z
SH/EH/F
P/L/EY/D
S/T/UW/D/AH/N/T/S
IY/V/AH/L
AW/T/S/AY/D
R/IY/P/AO/R/T/AH/D
P/UH/T
P/IH/P/L
W/EH/L
SH/IH/P/M/AH/N/T/S
W/EH/N
IH/L/UW/M/AH/N/EY/T/AH/D
HH/ER/P/IH/T/AA/L/AH/JH/IH/S/T
B/OW/AH
S/T/IY/L/IH/NG
R/AY/D


IH/K/S/P/IH/R/IY/AH/N/S/T
AE/K/T
Y/ER/EY/N/IY/AH/M
B/R/OW/K/AH/N
F/AE/M/AH/L/IY
K/R/AE/NG/K
D/AH/Z
F/IH/SH/P/AA/N/D
B/OY
F/AW/L/Z
K/OY/L/IY
G/R/AY/M
N/EH/S/T
S/EH/D
F/AW/N/D
IH/L
AE/L/F/AH/B/EH/T/IH/K/AH/L
SH/IY/P/IH/SH/L/IY
B/ER/D
G/OW/T
OW/N
S/EH/D
EY/T
P/OY/N/T
S/W/EH/T/ER
S/EH/T
W/UH/L/AH/N
M/EY/D
ER/JH/ER
B/IY
SH/UW/Z
K/EY/M
L/IY/F/IH/NG
OW/V/ER/K/UH/K
L/EH/T/ER/Z
R/AO/NG/F/AH/L/IY
B/IH/L/IY/V
W/EY/Z
P/R/AA/B/AH/B/L/IY
T/W/AY/S
AE/N/AH/G/R/AE/M
SH/IH/P/R/EH/K
R/EH/R/L/IY
W/AY/N/D
S/AH/F/ER
AO/R/D/ER
D/R/AH/M/ER/Z
S/T/EY/K/S
AO/L/W/EY/Z
T/EH/R/AH/B/AH/L
R/EH/S/T/ER/AA/N/T/S
JH/AH/S/T
S/IH/M/B/AH/L/Z
M/EY/K
K/AE/N/AA/T
S/EH/D
S/M/OW/K/IH/NG
V/EY/N/L/IY
AH/K/W/IH/T
B/AY/T
M/EH/N/IY
K/AA/F/ER/Z
JH/AH/JH/IH/Z
D/R/AH/G
B/AH/Z/AA/R
S/M/OW/K/IH/NG
IH/Z
IH/Z
F/EH/R
SH/R/IH/NG/K/IH/NG
IH/K/S/P/IH/R/IY/AH/N/S
S/IH/T/IY
S/IY/IH/NG
IH/Z
P/L/EY/G/OW/ER/Z
S/EH/S/EY/SH/AH/N
M/UH/R
M/EY/K
R/IY/V/Y/UW/Z
W/ER/K
D/IH/S/AY/D/IH/D
B/EY/K/ER/Z
HH/AE/V/IH/NG
R/OW/L
JH/AH/S/T
R/IH/V/ER/S/AH/L
F/AO/R/TH
T/

In [13]:
use_position = True
use_filter = True
penalty=8
path = "phonetic_filter_with_pos-ph-8"

before = time.time()
p = Pool(4)
ngram_search_space = p.map(rank_substitutions,   range(len(task1)))
length = time.time() - before

print("Total time taken in seconds: {}".format(length))

AH/L/EH/JH/D
G/IY/K
D/AO/T/ER
D/IH/S/T/R/OY/ER
HH/AE/F/W/EY
D/IH/Z/AY/N/D
M/AH/TH/AA/L/AH/JH/IY
P/EH/D/AH/L/D
AE/M
SH/IH/P/B/IH/L/D/ER
HH/OW/M/OW/S/EH/K/SH/AH/W/AH/L
EH/S
K/UW/L/IY
T/R/AY/D
IH/Z
G/EY/L/IY
L/EY/B/ER/ER
W/AY/Z
N/UW/L/IY
K/W/IH/R/IY/D
B/OW/T/S
IH/K/S/T/R/IY/M/L/IY
OY/L
F/AY/N
AO/F/AH/N
D/EH/D
S/EY/L/ER/Z
B/ER/D/Z
P/L/EH/ZH/ER
B/EY/B/IY/Z
S/IH/K/AH/F/AE/N/T/IH/K
M/EY/K
P/R/AE/K/T/AH/S
S/K/W/IY/Z/IH/NG
L/AH/M/B/AA/R
K/AA/M/P/L/AH/M/EH/N/T
F/ER/S/T
B/EY/B/IY
B/AO/R/D
W/IY/V
K/AH/M
K/AA/N/SH/AH/S
S/T/IH/F
K/IH/L/ER
D/AH/Z
R/AH/K/R/UW/T
AA/R
AO/R/AH/F/AH/S
L/UH/K
W/IH/D/TH/S
K/AA/R/P/AH/N/T/ER
P/AH/L/IY/S
DH/EH/N
W/AY/L/D/IH/B/IY/S/T/S
HH/AY/T/S
B/AA/D/IY
W/IH/L
B/AE/D
S/IH/R/IY/AH/L
V/AA/R/N/IH/SH
L/IY/P
G/AH/N/UW/Z
S/AH/G/JH/EH/S/T/AH/D
S/EH/D/IH/M/EH/N/T/L
P/R/OW/G/R/AE/M
K/W/AY/T
B/IY
W/UH/D
EH/D/AH/T/ER
B/AA/R/B/ER
G/OW
W/EY/T
G/IH/V/Z
HH/EH/R/IY/D
L/AE/N/D
S/T/AA/R/T/AH/D
F/ER/S/T
IH/Z
W/UH/D
HH/AE/NG
R/IH/D/AE/K/SH/AH/N
IH/Z
G/UH/D
B/ER/D/Z
D/AO/G
AY/R/AH/S
B/IY
R/EY/Z


W/ER/D
HH/AE/D
AE/N/ER/K/AH/S/T
K/R/AY
P/UH/R
T/OW/D
W/ER/L/D
P/EY/P/ER/Z
B/ER/OW/K
B/IY/IH/NG
G/R/EY/D/AH/D
K/AA/R
M/Y/UW/Z/IH/K/AH/L
R/AO/NG
M/AH/D
T/EH/N
SH/AA/P
F/AO/L/T/S
B/IY
S/IY/K/R/AH/T
R/IH/L/IY
K/AE/M/ER/AH
Y/UW/ZH/AH/W/AH/L/IY
S/T/R/EY/N/JH
K/IH/L/N
P/IH/K/CH/ER
D/R/IY/M/Z
P/AA/T/ER/IY
B/AE/T
K/AE/N/D/IY/D
JH/AH/S/T
HH/AE/V
HH/AE/D
F/EH/L
T/AY/M
S/W/IY/T
K/AE/S/AH/L
T/AA/F/IY
M/EY/K
AA/R
S/AY/AH/N/T/IH/S/T
V/EH/R/IY
V/EY/N/Z
AA/R
HH/AE/V/IH/NG
F/R/UW/G/AH/L
V/EH/R/IY
F/ER/M/L/IY
L/AO/NG/Z
B/EH/L/AH/K/OW/S
D/EH/R/D
R/EH/D
S/K/UW/L
K/W/AY/T
W/ER
P/IY/R/IY/AA/D/IH/K/AH/L/IY
T/AY/M
HH/AE/D
HH/AH/Z/B/AH/N/D
D/EY/Z
B/L/AH/D
IH/S/T/EY/T
M/AE/G/AH/Z/IY/N/Z
F/AA/R/S
K/W/AO/R/AH/L/S/AH/M
L/AA/R/JH
AE/S/K/T
IH/K/S/EH/S/IH/V
S/T/AA/R
M/AE/N/SH/AH/N
G/AH/V/ER/N/ER
S/IH/R/IH/AH/S
K/R/IH/T/AH/S/AY/Z/D
B/AE/D/L/IY
M/AE/N/SH/AH/N
N/OW/N
F/R/IY/K/W/AH/N/T/L/IY
N/AA/T
EH/S
W/ER
M/EY/D
IH/Z
EH/S
HH/Y/UW/M/AH/N
N/AA/T
S/T/ER/D
S/EH/D
K/AE/T
W/AO/T/ER
AY/D/IY/AH
B/AA/R/K/T
TH/IH/NG/K/S
EY/N
K/AE

JH/AH/S/T
V/Y/UW
K/AE/N
M/AW/TH
B/AH/S
L/IH/S/P
S/EH/D
P/IY/P/AH/L
SH/EH/R
P/EY/SH/AH/N/T/L/IY
S/P/EH/L/Z
W/EY/T
F/EH/R
G/IH/V/Z
IH/Z
AA/R
HH/OW/M
TH/R/OW/IH/NG
D/IH/F/EH/N/D/AH/D
JH/EH/N/ER/AH/S
B/EY/K/ER/IY
JH/AH/S/T
S/AY/T/IH/NG
V/AO/L/T
R/OW/L/Z
D/OW/N/AH/T/S
AA/R/M/Z
M/EH/N/IY
AA/R
W/ER
B/EH/R
AA/R
G/AA/R/L/IH/K
M/AO/R
EH/S
F/AO/L
IH/M/P/L/AH/M/AH/N/T
K/AA/N/T/IH/N/AH/N/T/L/IY
B/UH/L/Y/AH/N
P/EY
K/IH/CH/AH/N
S/EH/D
M/EY/K
Y/UW/S/AH/Z
HH/IH/L/Z
N/IY/D/Z
JH/AH/S/T
G/R/EY/T/ER
L/IH/B/ER/T/IY/Z
SH/UH/D
IH/Z
M/EH/M/ER/IY/Z
Y/UW/AA/N
EH/K/S/EH/S
SH/R/EH/D
R/AE/SH
K/ER/AH/N/S/IY
S/T/AA/P
P/AA/R/T/AH/D
F/AW/L
D/IH/G/IH/NG
EH/S
F/AE/NG/Z
HH/EH/R
W/IH/N/Z
IH/N/S/T/R/AH/K/SH/AH/N/Z
W/EH/L/TH
S/T/AY/L/IH/S/T
F/AE/S/T/ER
K/AA/M/P/L/AH/K/EY/T/AH/D
JH/AH/S/T
SH/AA/K
M/AE/N
S/AW/TH
T/EH/K/N/IH/K/AH/L
W/EH/N
B/AE/T/AH/L
M/AY/T
D/IH/S/AY/F/ER
D/AY/D
S/T/R/AO/NG/ER
HH/EH/D/EY/K
K/AE/N
S/T/IY/L
M/AY/G/R/EY/N
IH/Z
AA/R/T/AH/K/AH/L
M/AH/CH
IH/N/F/L/UW/EH/N/CH/AH/L
D/UW
S/T/EY/SH/AH/N/EH/R/IY
IH/Z
B/EY/

V/AY/S
S/EH/V/R/AH/L
K/AH/L/EH/K/SH/AH/N
P/OW/T/AH/N/T
N/OW/Z
P/EY/N
D/EY/T
S/ER/T/AH/N
HH/ER/D
S/T/AE/M/P
R/AA/B/ER
M/EY
M/AO/R/N/IH/NG
W/AA/Z
L/AO/S/T
P/OW/T/AH/B/AH/L
K/IH/L/N
G/R/UW
G/R/EY/T
S/ER/CH
AE/B/S/IH/N/TH
M/EY/K/IH/NG
N/IY/D
SH/IY/P
L/AH/V/D
B/R/EH/D
N/AA/T
JH/AH/S/T
T/R/AE/K/IH/NG
M/AH/T
SH/IY/P
T/AY/M
D/IH/S/AY/D/IH/D
S/AW/N/D/Z
B/IY
P/AA/T/ER/IY
T/EH/N/D
S/T/R/EY/N/JH
HH/AE/NG/D
AE/D/M/AY/R
HH/IY/R
SH/R/IH/M/P
D/EY/L/IY
K/R/EY/Z/AH/L/IY
M/EY/D
HH/AA/R/V/AH/S/T
G/OW/Z
G/AE/NG/S/T/ER/Z
P/AH/P
D/EY/Z
P/ER/M/IH/T
S/EH/D
F/L/AW/ER
G/R/OW/N
L/AE/S/T
T/EH/L/IH/K/AH/N/IY/S/IH/S
W/EY/W/ER/D
CH/EY/S/IH/NG
IH/K/S/P/L/EY/N/D
F/L/AA/K/S
R/UW/M/ER/Z
R/EY/Z
HH/AE/D
D/AO/T/ER
G/AA/R/D/AH/N/IH/NG
N/EY/V
L/AY/S/AH/N/S
EH/S
M/OW/T/ER/B/AY/K
L/AO/S/T
S/IH/N
EH/S
EH/R/IY/AH
W/EY/T
AO/L/W/EY/Z
V/IH/Z/AH/T/AH/D
AH/W/EY
W/IH/S/P/ER/D
W/AY/L/D
P/R/IH/N/S/EH/S
JH/AH/S/T
AH/P/AA/R/T
G/ER/L/F/R/EH/N/D
S/ER/JH/ER/IY
S/EY/L
B/IH/N
W/UH/D
D/IH/L/EH/M/AH
P/EH/N/S/AH/L
M/IH/S/T
S/AH/M/TH/IH/NG
S/EY
B/L

CH/AE/L/IH/S
N/EY/V/AH/L
D/AH/V/Z
K/AA/R/P/AH/T
IH/N/D/IY/D
B/EH/L/IY
D/IH/S/AY/D/IH/D
S/EY/K/R/AH/D
B/R/EY/K/S
S/OW
F/AO/R/M/Y/AH/L/AH
L/IH/F/T/AH/D
W/AA/Z
B/AH/T/AH/N
IH/K/S/P/L/EY/N/D
HH/AE/D
K/L/AE/S
AO/F/AH/S/ER
B/AA/T/AH/M
T/ER/N/D
R/AE/SH/AH/N/AH/L/IY
ER/L/IY
K/EH/R/AH/L/AA/N
K/AE/N/AA/T
S/EH/D
L/UW/Z
V/AY/AH/L/AH/N/T
IH/N/T/AH/JH/ER/Z
N/EH/V/ER
IH/Z
M/AO/R
JH/AH/S/T
N/AA/T
P/ER/S/IH/N/IH/L
S/EH/K/AH/N/D
W/IH/L/IH/NG/L/IY
M/EY/K/S
G/OW
S/AH/K/S/EH/S/F/AH/L
AE/Z
M/Y/UW/Z/IH/SH/AH/N/Z
G/OW/L/D
N/AH/TH/IH/NG
S/P/R/IH/N/T/ER
P/UH/SH/T
S/W/IH/M/IH/NG
S/EH/N/T
W/AA/Z
AO/L/W/EY/Z
M/IH/N/S/T/R/AH/L
P/IY/S/AH/Z
S/OW/L
S/AY/K/OW/P/AE/TH
IH/M/IY/D/IY/AH/T/L/IY
M/IH/N/T
N/EH/V/ER
EH/S
K/R/AE/M/P/S
M/EY/K/S
L/UW/Z
R/AE/DH/ER
L/EH/T
SH/UH/D
JH/AH/S/T
N/AA/N/S/EH/N/T/S
IY/T/IH/NG
P/R/AA/F/IH/T/S
JH/IY/N/IY/AA/L/AH/JH/IY
AE/B/S/AH/L/UW/T
N/EH/V/ER
K/W/AO/R/T/ER/L/IY
D/AA/L/ER
D/IH/S/EH/N/T
G/EH/T
IH/K/S/P/EH/K/T
IH/Z
IH/K/S/P/EH/R/AH/M/AH/N/T
EH/JH/AH/K/EY/SH/AH/N
AE/NG/K/ER/Z
S/EH/D
IH/Z
W/ER/

G/R/AE/N/AH/T
IH/Z
EH/S
S/IY/F/UW/D
Y/UW/N/IY/K/L/IY
V/ER/S
IH/Z
EH/S
P/L/EY/T
R/IH/L/IY
Y/AA/R/D
S/IY/N
K/L/AE/M/ER/D
N/AY/S
S/T/AO/R/IY
D/AO/T/ER
SH/OW/Z
P/IY/S
N/OW/Z
S/EH/D
JH/IY/AA/L/AH/JH/AH/S/T
AO/L/W/EY/Z
L/AO/NG
B/R/AA/N/Z
K/Y/UH/R/IY/AH/S/L/IY
SH/AO/R/T
K/AA/L/IY/G
G/R/AE/M/ER
D/IH/S/K/AH/V/ER/D
W/AH/N
S/K/AH/L/P/CH/ER/D
L/ER/N/D
M/EH/D/L/IH/NG/L/IY
SH/OW/N
R/EY/D/IY/AH/M
R/OW/T
R/IH/M/AA/R/K/T
AA/R
AE/S/K/T
HH/AE/D
M/IH/R/ER/Z
K/AO/L/D
S/AH/M/EH/R/IH/L/IY
S/W/AA/L/OW/D
T/IY/CH/IH/NG
K/AH/N/F/Y/UW/Z/D
AE/D/AH/D
W/UH/L
T/AH/NG
W/EH/R
N/AW
M/IH/T/AH/N/Z
R/AY
IH/K/S/P/EH/K/T/AH/D
B/R/IH/NG
G/R/EY/T/AH/S/T
IH/L/EH/K/T/R/AA/N/Z
F/EY/V/ER/IH/T
M/AH/CH
S/T/AE/N/D
S/AE/N/D/W/IH/CH
W/IY/L/Z
R/IY/S/AY/K/AH/L/D
S/ER/K/AH/S
S/M/OW/K/T
IY/K/W/L/Z
R/IH/T/AH/N
HH/AE/D
EH/S
B/EH/L/T
W/IH/NG/K
G/UH/D
W/AA/Z
W/UH/D
HH/AO/R/S
JH/OW
HH/AE/M/S/T/R/IH/NG
T/AY/M
AE/Z
W/AA/T
S/EY/V/Z
P/IH/G
EH/S
T/EY/K
T/ER/N
R/AH/N/IH/NG
B/L/AY/N/D
IH/Z
EH/N/IY/TH/IH/NG
K/W/IH/T
IH/Z
N/IY/D/AH/D
D/IH/D
M/EH/R/AH/TH

S/T/R/AO/NG/L/IY
L/AH/M/B/AA/R
HH/AY/R/IH/NG
P/AE/K/IH/NG
M/AE/G/N/AH/T
IH/Z
M/IY/T
D/AA/K/T/ER
W/AY/F
M/AE/TH
SH/AW/ER
W/EH/N/T
W/EY/T/S
S/P/UW/N
K/W/AY/AH/T
B/AO/R/D
M/EH/N/IY
S/AH/P
HH/AA/R/T
M/AE/N
L/AO/NG
F/UH/L/IY
G/AH/V/ER/N/ER/Z
EH/S
B/AE/D
W/ER
D/IH/N/ER
HH/AE/V
AE/S/K/T
S/EY/T/IH/D
T/R/EH/N/CH/ER
K/AH/S/T/AH/M/Z
W/AA/Z
W/AO/R/SH/IH/P
M/EY/K/S
W/ER/S/T
P/ER/F/AO/R/M
L/IY/D/Z
W/IH/L
G/R/EY/V/S/T/OW/N
D/AE/N/S
S/IY
P/EH/N/IY/Z
CH/IH/M/N/IY
EH/S
W/AA/Z
IH/Z
EH/S
P/AW/N/D/Z
W/IH/N
AH/P/IH/N/Y/AH/N
HH/AE/Z
T/R/EH/ZH/ER/Z
M/EH/D/AH/L
EH/S
EH/S
OW/L/IH/M/P/IH/K
OW/K/S
P/EY/D
AY
M/AY/T/IY
IH/Z
G/OW/L/D
L/AY/AH/N
G/R/OW
N/AH/TH/IH/NG
D/IY
L/EY/D/IY
EY/K/AO/R/N/Z
S/AO/S
S/P/AO/R/T
HH/AE/N/D
S/M/AO/L
F/AH/JH
K/R/UW
S/ER/JH/AH/N
S/AH/N/D/EY/Z
S/EH/L
K/EY/M
IY/G/AH/L
S/T/OW/L/AH/N
HH/AA/T
G/R/EY/T
HH/AA/R/T
AH/F/EH/N/S/IH/V/L/IY
W/IY/N/IH/Z
R/IH/M/EH/M/B/ER/D
W/ER/K/ER/Z
G/UH/D/Z
M/AE/G/N/AH/T
W/EY/L
IH/Z
AE/S/K/T
W/EH/N/T
R/AH/F/R/IH/JH/ER/EY/T/ER
R/IH/P/L/AY/D
M/AA/NG/G/R/L
K/AE/R/IY
W/A

D/R/AO/L
G/AA/T
HH/IH/T
V/IH/K/T/AH/M
K/OW/L/D
D/EY
K/W/IH/K
P/Y/UH/R/L/IY
W/ER
AO/R
R/IH/L/IY/V/D
S/ER/V/AY/V/ER
N/EH/T/S
K/AH/M/AA/D/AH/T/IY
K/OW/S/T
AA/R
K/AE/S/T
S/IH/L/V/ER
F/AW/N/D
IH/N/S/AH/L/T
F/IH/SH/AH/Z
G/OW/L/D
AA/K/S/AH/D/EH/N/T
W/EH/R/IH/NG
D/IY
IY/DH/ER
S/T/EY
SH/IH/P/R/EH/K/T
D/IH/Z/AY/N/ER
K/AA/L/AH/M/N/AH/S/T
L/EY/T
Y/AA/R/D
K/L/OW/DH/Z
AH/DH/ER/Z
G/EH/T
D/EH/D
HH/AE/D
K/IH/L/T
M/EH/T/AH/L
K/R/OW
JH/EH/N/ER/EY/SH/AH/N
T/R/AH/D/IH/SH/AH/N/AH/L
B/AE/R/IY/ER
K/AO/Z
AE/F/T/ER/N/UW/N
S/T/UH/D
G/AA/R/B
B/AA/DH/ER/D
F/UH/T/B/AO/L
AH/S/EH/M/B/AH/L/D
EH/S
AO/L/W/EY/Z
T/AH/SH/EY
W/AA/Z
L/OY/AH/L/IY
P/L/EY/D
S/T/EY/SH/AH/N
T/OW/K
P/R/EY
AW/T/S/AY/D
IH/N/S/P/EH/K/SH/AH/N
K/AE/N
S/AH/P/AO/R/T/ER/Z
W/EH/L
IH/R/AH/T/EY/T/AH/D
P/R/IH/N/S/AH/P/AH/L/Z
IH/L/UW/M/AH/N/EY/T/AH/D
HH/AE/D
W/AA/Z
T/EH/S/T
R/AY/D
EH/S
IY/V/AH/L
HH/AE/T
D/IH/S/T/R/IH/K/T
AA/N/ER/AH/B/AH/L
P/IH/P/L
SH/EH/F
S/T/UW/D/AH/N/T/S
W/EH/D/IH/NG
HH/ER/P/IH/T/AA/L/AH/JH/IH/S/T
R/IY/P/AO/R/T/AH/D
AH/W/EY/T
P/UH/T
T/OW/D/Z

Y/ER/EY/N/IY/AH/M
D/AY/AH/P/ER/Z
F/AE/M/AH/L/IY
K/R/AE/NG/K
R/AE/SH/L/IY
F/IH/SH/P/AA/N/D
AE/K/T
F/AW/L/Z
K/OY/L/IY
B/R/OW/K/AH/N
N/EH/S/T
S/EH/D
D/AH/Z
AE/L/F/AH/B/EH/T/IH/K/AH/L
IH/L
B/OY
B/ER/D
G/OW/T
G/R/AY/M
OW/N
EY/T
F/AW/N/D
P/OY/N/T
S/EH/T
SH/IY/P/IH/SH/L/IY
M/EY/D
ER/JH/ER
S/EH/D
SH/UW/Z
K/EY/M
S/W/EH/T/ER
OW/V/ER/K/UH/K
L/EH/T/ER/Z
W/UH/L/AH/N
B/IH/L/IY/V
W/EY/Z
B/IY
P/R/AA/B/AH/B/L/IY
L/IY/F/IH/NG
AE/N/AH/G/R/AE/M
R/EH/R/L/IY
W/AY/N/D
R/AO/NG/F/AH/L/IY
AO/R/D/ER
T/W/AY/S
S/T/EY/K/S
SH/IH/P/R/EH/K
AO/L/W/EY/Z
S/AH/F/ER
R/EH/S/T/ER/AA/N/T/S
D/R/AH/M/ER/Z
M/EY/K
T/EH/R/AH/B/AH/L
K/AE/N/AA/T
JH/AH/S/T
S/M/OW/K/IH/NG
S/IH/M/B/AH/L/Z
AH/K/W/IH/T
M/EH/N/IY
S/EH/D
JH/AH/JH/IH/Z
V/EY/N/L/IY
B/AH/Z/AA/R
B/AY/T
IH/Z
K/AA/F/ER/Z
F/EH/R
D/R/AH/G
IH/K/S/P/IH/R/IY/AH/N/S
S/M/OW/K/IH/NG
S/IY/IH/NG
IH/Z
P/L/EY/G/OW/ER/Z
SH/R/IH/NG/K/IH/NG
M/UH/R
S/IH/T/IY
R/IY/V/Y/UW/Z
IH/Z
D/IH/S/AY/D/IH/D
S/EH/S/EY/SH/AH/N
HH/AE/V/IH/NG
M/EY/K
W/ER/K
JH/AH/S/T
B/EY/K/ER/Z
F/AO/R/TH
R/OW/L
B/IY
D/EY
R/IH/V/

In [20]:
cd ..

/home/doogy/Projects/pun_detection


In [9]:
with open("results/phonetic_filter_with_pos-ph-8-no-contractions", 'w') as f:
    json.dump(ngram_search_space, f)

In [5]:
cd results/phonetic_filter_no_pos/

/home/doogy/Projects/pun_detection/results/phonetic_filter_no_pos


In [8]:
total = []
for i in range(1780):
    with open(str(i)) as f:
        total.append(json.load(f))
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [10]:
total[10]

{"' s between": [["' em between", -371.75]],
 'my heel ,': [['my hole ,', 1882.8],
  ['my meal ,', 1505.52],
  ['my wheel ,', -597.6],
  ['my real ,', -626.76],
  ['my zeal ,', -627.12],
  ['my deal ,', -636.12],
  ['my hall ,', -679.3199999999999],
  ['my whole ,', -685.4399999999999],
  ['my heat ,', -704.4444444444446],
  ['my reel ,', -837.0],
  ['my seal ,', -857.88],
  ['my hell ,', -859.3199999999999],
  ['my feel ,', -1058.76],
  ['my haul ,', -1084.32],
  ['my hull ,', -1089.36],
  ['my keel ,', -1121.04],
  ['my veal ,', -1139.3999999999999],
  ['my hill ,', -1190.2222222222224],
  ['my hear ,', -1201.777777777778],
  ['my here ,', -1345.3333333333335],
  ['my heap ,', -1366.2222222222224],
  ['my heath ,', -1386.666666666667],
  ['my hail ,', -1391.1111111111113],
  ['my heed ,', -1403.5555555555559],
  ['my healer ,', -1477.5510204081634],
  ['my heals ,', -1593.0],
  ['my heal ,', -3119.0]],
 'my sole and': [['my soul and', 58067.0],
  ['my goal and', 3237.7777777777783],


In [11]:
task1[10]

{'pun': True,
 'words': ['It',
  'is',
  'between',
  'my',
  'sole',
  'and',
  'my',
  'heel',
  ',',
  'said',
  'Tom',
  'archly',
  '.']}

## use_position = True
use_filter = True
path = "phonetic_filter_no_pos"
rank_substitutions(8)

## Phonetic Generation, Running only on puns, no Tom Swiftys

In [None]:
# Remove Tom's from Data set, based solely on the word 'Tom'
t1_no_toms, no_toms_search_space = [], []
for i, p in enumerate(task1):
    if 'Tom' not in p['words']:
        t1_no_toms.append(p)
        no_toms_search_space.append(search_space[i])

In [None]:
def switch_score(distance, frequency_difference, position):
    return frequency_difference / ((distance**2 + 1 + position))

In [None]:
no_toms_search_space

In [None]:
all_res = []
for i, results in enumerate(no_toms_search_space):
    if results == 'miss':
        all_res.append(('miss', 0))
        continue
    max_score = 0
    max_word = ''
    for original_word, replacements in results.items():
        pos = t1_no_toms[i]['words'].index(original_word)
        pos = len(t1_no_toms[i]['words']) - pos
        for subs in replacements:
            score = switch_score(subs[1], subs[2], pos)
            if score > max_score:
                max_score = score
                max_word = subs[0]
    all_res.append((max_word, max_score))

In [None]:
tp, fp, tn, fn = 0, 0, 0, 0

for i, result in enumerate(all_res):
    if result[1] > 10:
        if t1_no_toms[i]['pun']:
            tp += 1
        else:
            fp += 1
    else:
        if not t1_no_toms[i]['pun']:
            tn += 1
        else:
            fn += 1
            
acc = (tp + tn) / len(all_res)
prec = tp / (tp + fn)
recall = tp / (tp + fp)
f1 = (2*(recall*prec)) / (recall + prec)
#     accuracies.append(acc)
#     precisions.append(prec)
#     recalls.append(recall)
#     f1s.append(f1)
print(tp, tn, fp, fn)
print("Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(acc, prec, recall, f1))

## Using Phonetic Generation, Tom Detection + Language Model

In [None]:
m = models.KeyedVectors.load_word2vec_format('/home/doogy/Data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
stemmer.stem('babies')

In [None]:
all_res = []
for i, results in enumerate(search_space):
    print_progress(i+1, len(search_space))
    if is_Tom_Swifty(' '.join(task1[i]['words']), m):
        all_res.append(('tom', 1))
        continue
    if results == 'miss':
        all_res.append(('miss', 0))
        continue
    max_score = 0
    max_word = ''
    for original_word, replacements in results.items():
        og_stem = stemmer.stem(original_word)
        pos = task1[i]['words'].index(original_word)
        pos = len(task1[i]['words']) - pos
        for subs in replacements:
            if stemmer.stem(subs[0]) == og_stem:
                continue
            score = switch_score(subs[1], subs[2], 0)
            if score > max_score:
                max_score = score
                max_word = subs[0]
    all_res.append((max_word, max_score))
    

In [None]:
for i, res in enumerate(all_res):
    print(i, res, ' '.join(task1[i]['words']))

In [None]:
search_space[1762]

In [None]:
get_closest_sounding_words('ordure'), cmu['order']

In [None]:
for original_word, subs in baby_oil.items():
    max_word, max_score = '', 0
    print(original_word, subs)
    pos = len(task1[2]['words']) - task1[2]['words'].index(original_word)
    for sub in subs:
        print(switch_score(sub[1], sub[2], pos))

In [None]:
tp, fp, tn, fn = 0, 0, 0, 0

for i, result in enumerate(all_res):
    if result[1] > 0:
        if task1[i]['pun']:
            tp += 1
        else:
            fp += 1
    else:
        if not task1[i]['pun']:
            tn += 1
        else:
            fn += 1
            
acc = (tp + tn) / len(all_res)
prec = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = (2*(recall*prec)) / (recall + prec)
#     accuracies.append(acc)
#     precisions.append(prec)
#     recalls.append(recall)
#     f1s.append(f1)
print(tp, tn, fp, fn)
print("Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(acc, prec, recall, f1))

In [None]:
m.similarity('ledge', 'mountain')

In [None]:
for i in range(len(search_space)):
    