In [1]:
from src.data_processing import load_data
import itertools
import string

from src.ngrams import *
from src.string_similarity import levenshtein
import operator
from src.data_processing import print_progress
from nltk import word_tokenize, pos_tag
from src.data_processing import load_cmu
from src.ipatoarpabet import translate
from string import punctuation
from src.pronunciations import phonetic_distance
import os

In [2]:
from src.pun_algorithms import *

Loading Model, this could take a while...


In [3]:
# Load the precomputed n-gram search space from JSON. The cells below iterate it
# and index it by integer position, one entry per context.
# NOTE(review): `json` is never imported explicitly in this notebook - presumably
# it arrives through `from src.pun_algorithms import *`; confirm or import it directly.
with open("data/ngram_searchspace/ngram_totals.json") as f:
    search_space = json.load(f)

In [9]:
# Flatten the search space into one n-gram -> frequency lookup table.
# Every context maps an original n-gram to a record holding its own frequency
# ('original_frequency') and a dict of substitute n-grams with their frequencies
# ('substitutions'). Keys that were never stored read back as 0 (defaultdict).
all_frequencies = defaultdict(int)
for context_entries in search_space:
    for original_ngram, record in context_entries.items():
        all_frequencies[original_ngram] = record['original_frequency']
        all_frequencies.update(record['substitutions'])

In [6]:
# Load every data split in one call. Of these, only `task1` and `strings` are
# referenced by the cells visible in this part of the notebook.
task1, task2, task3, min_pairs, strings, pun_strings = load_data()

In [19]:
def score(original_frequency, new_frequency, original_word, new_word, position, ph_penalty=2):
    """Score a candidate substitution from its frequency gain and phonetic distance.

    The value returned is
    ``(new_frequency - original_frequency) * (distance ** ph_penalty * position)``
    where ``distance = phonetic_distance(original_word, new_word, translated=True)``.

    Args:
        original_frequency: frequency of the original n-gram.
        new_frequency: frequency of the n-gram after the substitution.
        original_word: the replaced word, forwarded to ``phonetic_distance``.
        new_word: the replacement word, forwarded to ``phonetic_distance``.
            NOTE(review): both are passed with ``translated=True``, so they are
            presumably expected to be phonetic transcriptions already - confirm
            against ``phonetic_distance``.
        position: multiplicative position weight (callers in this notebook pass
            either a length-normalised position or the constant 1).
        ph_penalty: exponent applied to the phonetic distance.

    Returns:
        The numeric score.
    """
    frequency_gain = new_frequency - original_frequency
    distance = phonetic_distance(original_word, new_word, translated=True)
    return frequency_gain * (distance ** ph_penalty * position)

In [20]:
def single_score(og_tri, new_tri, ph_penalty=2):
    """Score replacing ``og_tri`` with ``new_tri`` using the global frequency table.

    Both arguments are whitespace-separated n-gram strings; the second token of
    each is treated as the word being swapped. Frequencies come from
    ``all_frequencies`` and the position weight is fixed at 1.

    NOTE(review): the raw middle tokens are handed to ``score``, which calls
    ``phonetic_distance(..., translated=True)`` - confirm that untranslated
    words are acceptable there.

    Args:
        og_tri: the original n-gram string.
        new_tri: the substituted n-gram string.
        ph_penalty: exponent forwarded to ``score``.

    Returns:
        Whatever ``score`` returns for the pair.
    """
    # Frequency lookups come first, exactly as before, so the defaultdict is
    # touched in the same order even if a later split fails.
    og_freq, new_freq = all_frequencies[og_tri], all_frequencies[new_tri]
    og_middle, new_middle = og_tri.split()[1], new_tri.split()[1]
    return score(og_freq, new_freq, og_middle, new_middle, 1, ph_penalty)

In [6]:
def classification_accuracy(run, gold=None):
    """Compute accuracy, recall, precision and F1 for binary pun predictions.

    Args:
        run: indexable of truthy/falsy predictions, aligned by index with ``gold``.
        gold: optional sequence of examples, each with a truthy/falsy ``'pun'``
            entry. Defaults to the notebook-level ``task1``, which keeps the
            original one-argument call working unchanged.

    Returns:
        dict with the keys ``'acc'``, ``'recall'``, ``'prec'`` and ``'f1'``.
        Any ratio whose denominator is zero (no examples, no gold positives,
        no predicted positives) is reported as 0.0 instead of raising
        ZeroDivisionError.

    Raises:
        IndexError: if ``run`` is shorter than ``gold``.
    """
    if gold is None:
        gold = task1

    def ratio(numerator, denominator):
        # Degenerate confusion matrices yield 0.0 rather than crashing the cell.
        return numerator / denominator if denominator else 0.0

    tp = fp = tn = fn = 0
    for i, example in enumerate(gold):
        actual = bool(example['pun'])
        predicted = bool(run[i])
        if predicted and actual:
            tp += 1
        elif predicted:
            fp += 1
        elif actual:
            fn += 1
        else:
            tn += 1

    recall = ratio(tp, tp + fn)
    prec = ratio(tp, tp + fp)
    return {
        'acc': ratio(tp + tn, len(gold)),
        'recall': recall,
        'prec': prec,
        'f1': ratio(2 * (recall * prec), recall + prec),
    }

# No Trigram Baseline

In [7]:
def no_trigram_baseline(index):
    """Baseline detector: flag a context as soon as one of its trigrams is unseen.

    Args:
        index: position of the context in the notebook-level ``search_space``.

    Returns:
        True if at least one entry of the context has an
        ``'original_frequency'`` equal to 0, otherwise False.
    """
    trigram_entries = search_space[index]
    return any(
        trigram_entries[trigram]['original_frequency'] == 0
        for trigram in trigram_entries
    )

In [8]:
# Run the no-trigram baseline over every context; `results[i]` is the boolean
# prediction for `search_space[i]`.
results = [no_trigram_baseline(index) for index in range(len(search_space))]

In [9]:
# Evaluate the baseline predictions against the `pun` labels in `task1`.
classification_accuracy(results)

{'acc': 0.7792134831460674,
 'f1': 0.841596130592503,
 'prec': 0.8628099173553719,
 'recall': 0.8214004720692368}

# No Quadgram Baseline

In [10]:
# Collect every distinct run of four consecutive tokens (joined with single
# spaces) that occurs in any task1 context. Contexts shorter than four tokens
# contribute nothing.
all_quadgrams_in_task1 = {
    ' '.join(context['words'][start:start + 4])
    for context in task1
    for start in range(len(context['words']) - 3)
}

In [11]:
# Rebind the name from a set to an alphabetically sorted list so later loops
# walk the quadgrams in a deterministic order. `sorted` already returns a list.
all_quadgrams_in_task1 = sorted(all_quadgrams_in_task1)

In [110]:
# Inspect the quadgrams filed under the prefix 'var'.
# NOTE(review): `groupings` is only built by a cell that appears further down in
# this notebook, so this cell fails on a fresh kernel run top to bottom.
groupings['var']

['varnish without a trace']

In [103]:
def quad_to_dict(text):
    """Parse n-gram count text into a ``{quadgram: count}`` dictionary.

    Each line is split on whitespace; the first four tokens (joined with single
    spaces) form the key and the fifth token is parsed as the integer count.
    Lines with fewer than five tokens, or whose fifth token is not an integer,
    are skipped. When a key occurs more than once the last occurrence wins.

    Args:
        text: the file content as one string, lines separated by ``'\\n'``.

    Returns:
        dict mapping quadgram strings to integer counts.
    """
    ret = {}
    for line in text.split('\n'):
        tokens = line.split()
        try:
            count = int(tokens[4])
        except (IndexError, ValueError):
            # Only malformed lines are skipped; anything else (including
            # KeyboardInterrupt, which a bare `except:` used to swallow)
            # propagates to the caller.
            continue
        ret[' '.join(tokens[:4])] = count
    return ret

In [112]:
# Look up the corpus count of every task1 quadgram. Quadgrams are grouped by a
# prefix key in `groupings`; for each key the matching gzipped 4-gram file is
# decompressed once with `zcat` and parsed with `quad_to_dict`.
# NOTE(review): `subprocess`, `groupings` and `get_gram_file` are not defined
# above this cell in notebook order - confirm they are in scope on a fresh run.
quadgram_frequencies = {}
for i, beginning_letters in enumerate(groupings):
    try:
        ngram_output = subprocess.check_output(
            ['zcat', get_gram_file(beginning_letters, 4)]).decode('latin-1')
    except Exception:
        # Best effort: a missing or unreadable file means there are no counts
        # for this group. Skip it outright - previously the loop fell through
        # and reused the text of the previous group's file (or raised NameError
        # on the first iteration), recording counts from the wrong file.
        print_progress(i, len(groupings))
        continue
    quad_dict = quad_to_dict(ngram_output)
    for subquads in groupings[beginning_letters]:
        # Quadgrams absent from the corpus file are simply left out.
        if subquads in quad_dict:
            quadgram_frequencies[subquads] = quad_dict[subquads]
    print_progress(i, len(groupings))

 |████████████████████████████████████████████████████████████████████████████████████████████████████| 99.9% 

In [114]:
# FIXME(review): this cell was saved as an unfinished `with open("")` statement,
# which is a SyntaxError and stops Restart & Run All. The recorded output below is
# a dump of quadgram -> count pairs, so the cell presumably displayed (and was
# meant to persist) `quadgram_frequencies` - confirm, and add the file dump with
# a real path if that was the intent.
quadgram_frequencies

{'The last will be': 1026,
 'If they say you': 1071,
 'tornado destroyed their home': 45,
 'doubt , do nothing': 313,
 'days without prayer makes': 426,
 'that the mind is': 26985,
 'man , but I': 36696,
 'could make you turn': 86,
 'a great opera singer': 321,
 'My friend is very': 771,
 "thinks he ' s": 58,
 'in computer operating systems': 1431,
 'self - restraint ,': 16162,
 'I was surprised to': 241882,
 'the bright lights ,': 4855,
 'a dangerous thing .': 18153,
 "That ' s no": 116,
 'name is Gil .': 46,
 ', there is an': 1233292,
 'out from his mother': 300,
 "can ' t take": 387,
 'I know what I': 305110,
 'name is Manuel ,': 101,
 ', but I wasn': 1371,
 'receive , may the': 145,
 'the day you buy': 2430,
 'and steady wins the': 8854,
 'The sun is rising': 3494,
 'The best things come': 1420,
 'cat that swallowed a': 83,
 'He was the backup': 190,
 'finding shoes that fit': 586,
 'was in my cabin': 311,
 'went on to a': 32184,
 'read the next message': 653,
 'and others are the'

In [38]:
# Query the frequency of every task1 quadgram one at a time (slow: the recorded
# run was interrupted at 0.2%).
freqs = {}
for i, quad in enumerate(all_quadgrams_in_task1):
    try:
        # Store the result under the quadgram. The original assigned it back to
        # the loop variable `quad`, so `freqs` was never populated.
        freqs[quad] = ngram_frequency(quad.split())
    except ValueError:
        # Show which quadgram could not be looked up, then re-raise.
        print(quad)
        raise

    print_progress(i, len(all_quadgrams_in_task1))

 |----------------------------------------------------------------------------------------------------| 0.2% 

KeyboardInterrupt: 

In [27]:
# Bucket the quadgrams by the first three characters of their first token
# (shorter tokens use the whole token as the key).
groupings = defaultdict(list)
for quadgram in all_quadgrams_in_task1:
    first_token = quadgram.split()[0]
    groupings[first_token[:3]].append(quadgram)

In [53]:
# Spot-check which 4-gram file a punctuation prefix resolves to.
get_gram_file(';', 4)

'/home/doogy/Data/4grams/c1/symbols/semicolon.gz'

In [41]:
from src.ngrams import ngram_frequency, get_gram_file

In [26]:
# Spot-check a single frequency lookup; the argument is a list of tokens.
ngram_frequency('a flare for'.split())

7986

In [None]:
# Universal POS tags a substituted word must carry to be kept as a candidate.
accepted_pos = {'ADV', 'ADJ', 'VERB', 'NOUN'}
def rank_substitutions(index):
    """Score every admissible substitution for context ``index`` and cache the result.

    For each original trigram of the context, every candidate substitute trigram
    is tried: the middle word is swapped into a copy of the context, the copy is
    POS-tagged, and the candidate is kept only if the new middle word contains no
    punctuation and its tag is in ``accepted_pos``. Surviving candidates are
    scored with ``score`` using the phonetic translations of both middle words
    and the length-normalised position of the original word.

    Results are written as JSON to ``results/0/<index>``. If that file already
    exists it is loaded and returned instead of recomputing. The original
    returned ``None`` on a cache hit, which broke callers such as
    ``sort_answers(rank_substitutions(...))``.

    NOTE(review): the position is the FIRST occurrence of the middle word in the
    context (``list.index``), so a repeated word can be located at the wrong
    place, and a match at position 0 makes the ``position-1`` slice start at -1.
    Left unchanged here; confirm against how the search space was generated.

    Args:
        index: position of the context in ``search_space`` / ``task1``.

    Returns:
        Mapping ``{original_trigram: {substitute_trigram: score}}``: a
        ``defaultdict`` when freshly computed, a plain ``dict`` when loaded from
        the cache.
    """
    result_path = "results/0/{}".format(index)

    # Reuse the output of a previous run when it exists.
    if os.path.exists(result_path):
        print(index)
        with open(result_path) as f:
            return json.load(f)

    space = search_space[index]
    context = task1[index]['words']

    res = defaultdict(dict)
    context_length = len(context)

    for trigram, candidate in space.items():
        original_word = trigram.split()[1]
        position = context.index(original_word)

        # Normalise the position with respect to the length of the context.
        normal_position = position / context_length
        original_freq = candidate['original_frequency']
        original_ph = ph_translate(original_word)

        for sub, new_freq in candidate['substitutions'].items():
            sub_words = sub.split()
            new_word = sub_words[1]

            # Cheap filter first: skip substitutes containing punctuation
            # before paying for tagging or phonetic translation.
            if any(ch in string.punctuation for ch in new_word):
                continue

            new_context = list(context)
            new_context[position-1:position+2] = sub_words

            tags = [tagged[1] for tagged in pos_tag(new_context, tagset='universal')]
            if tags[position] not in accepted_pos:
                continue

            res[trigram][sub] = score(original_freq,
                                      new_freq,
                                      original_ph,
                                      ph_translate(new_word),
                                      normal_position)

    # Write to a temporary name and rename, so an interrupted run never leaves
    # a truncated file that a later run would mistake for a finished result.
    tmp_path = result_path + ".tmp"
    with open(tmp_path, 'w') as f:
        json.dump(res, f, indent=4)
    os.replace(tmp_path, result_path)

    print(index)
    return res
            
        

In [None]:
# Rank the substitutions for context 8, best score first.
# NOTE(review): `sort_answers` is defined in a cell further down, so this cell
# fails on a fresh top-to-bottom run.
ranked2 = sort_answers(rank_substitutions(8))

In [None]:
# Load the cached scores for context 4 and show them ranked. The file is opened
# in a `with` block so the handle is closed (the original left it open).
with open("results/0/4") as cached_file:
    cached_scores = json.load(cached_file)
sort_answers(cached_scores)

In [None]:
# Inspect entry 4 of the `strings` collection returned by load_data().
strings[4]

In [None]:
def sort_answers(unsorted_dict):
    """Rank the candidates of every entry from highest to lowest score.

    Args:
        unsorted_dict: mapping ``{key: {candidate: score}}``.

    Returns:
        New dict ``{key: [(candidate, score), ...]}`` with each list sorted by
        score in descending order; candidates with equal scores keep their
        original relative order.
    """
    by_score = operator.itemgetter(1)
    return {
        key: sorted(candidates.items(), key=by_score, reverse=True)
        for key, candidates in unsorted_dict.items()
    }

In [None]:
import time
before = time.time()
from multiprocessing import Pool
# Rank the substitutions of every context on 4 worker processes. `map` blocks
# until all results are in, and the context manager then tears the workers
# down; the original left the pool open for the lifetime of the kernel.
with Pool(4) as p:
    ngram_search_space = p.map(rank_substitutions, range(len(search_space)))
length = time.time() - before
print("Total time taken in seconds: {}".format(length))

In [None]:
# Inspect the search-space entry for context 6.
search_space[6]

## Phonetic Generation: running only on puns, excluding Tom Swifties

In [None]:
# Drop Tom Swifty candidates from the data set. The filter is purely lexical:
# any context whose token list contains the word 'Tom' is excluded, together
# with the search-space entry at the same index.
t1_no_toms, no_toms_search_space = [], []
for idx, pun_context in enumerate(task1):
    if 'Tom' in pun_context['words']:
        continue
    t1_no_toms.append(pun_context)
    no_toms_search_space.append(search_space[idx])

In [None]:
def switch_score(distance, frequency_difference, position):
    """Score a word switch: frequency gain damped by distance and position.

    Args:
        distance: distance between the original and the replacement word; it
            enters the denominator squared.
        frequency_difference: frequency gain of the replacement (numerator).
        position: position term added to the denominator.

    Returns:
        ``frequency_difference / (distance**2 + 1 + position)``.
    """
    damping = distance**2 + 1 + position
    return frequency_difference / damping

In [None]:
# Display the filtered search space.
# NOTE(review): this echoes the entire list; consider showing a slice instead.
no_toms_search_space

In [None]:
# For every Tom-free context pick the single best-scoring replacement word.
# Each entry is either the string 'miss' or a mapping
# {original_word: [replacement, ...]}, where a replacement is indexed as
# [0] = word, [1] = distance, [2] = frequency difference (the order
# `switch_score` takes them in).
# The loop locals are named `candidates` / `candidate_score` rather than
# `results` / `score`: as top-level names those used to overwrite the baseline
# `results` list and the `score()` function, breaking any later call to it.
all_res = []
for i, candidates in enumerate(no_toms_search_space):
    if candidates == 'miss':
        all_res.append(('miss', 0))
        continue
    context_words = t1_no_toms[i]['words']
    max_score = 0
    max_word = ''
    for original_word, replacements in candidates.items():
        # Distance of the word (first occurrence) from the end of the context.
        pos = len(context_words) - context_words.index(original_word)
        for subs in replacements:
            candidate_score = switch_score(subs[1], subs[2], pos)
            if candidate_score > max_score:
                max_score = candidate_score
                max_word = subs[0]
    all_res.append((max_word, max_score))

In [None]:
# Count the pasted index lines.
# NOTE(review): these look like the indices printed by `rank_substitutions`
# workers - presumably copied from the pool's output; confirm.
# The original called `.split()` on the result of `len(...)`, i.e. on an int,
# which raises AttributeError; the split has to happen before the count.
processed_indices = """0
1
112
2
113
224
3
225
4
226
114
5
6
115
7
227
116
8
117
118
228
9
229
10
11
119
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257""".split()
processed_count = len(processed_indices)
processed_count

In [None]:
# Evaluate the Tom-free run: a context is predicted to be a pun when its best
# switch score exceeds the threshold of 10.
tp, fp, tn, fn = 0, 0, 0, 0

for i, result in enumerate(all_res):
    if result[1] > 10:
        if t1_no_toms[i]['pun']:
            tp += 1
        else:
            fp += 1
    else:
        if not t1_no_toms[i]['pun']:
            tn += 1
        else:
            fn += 1

# Precision divides by the predicted positives (tp + fp) and recall by the
# gold positives (tp + fn); the original had the two formulas swapped. Ratios
# with an empty denominator are reported as 0.0 instead of raising.
acc = (tp + tn) / len(all_res) if all_res else 0.0
prec = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = (2*(recall*prec)) / (recall + prec) if (recall + prec) else 0.0
print(tp, tn, fp, fn)
print("Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(acc, prec, recall, f1))

## Using Phonetic Generation, Tom Detection + Language Model

In [None]:
# Load the pre-trained word2vec vectors (binary format) used by the cells below.
# NOTE(review): the path is an absolute, machine-specific location - consider
# moving it to a configurable data directory. `models` is not imported
# explicitly here; presumably it arrives via a star import - confirm.
m = models.KeyedVectors.load_word2vec_format('/home/doogy/Data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
# Stemmer used below to discard replacements that share a stem with the
# original word.
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
# Quick demo on a plural form; the stem is shown as the cell output.
stemmer.stem('babies')

In [None]:
# Full pipeline over every context: Tom Swifty detection first, then the best
# phonetic replacement. Entries of `all_res` are (word, score) pairs, with
# ('tom', 1) for detected Tom Swifties and ('miss', 0) for contexts that have
# no search space.
# The loop locals are named `candidates` / `candidate_score` rather than
# `results` / `score`: as top-level names those used to overwrite the baseline
# `results` list and the `score()` function, breaking any later call to it.
all_res = []
for i, candidates in enumerate(search_space):
    print_progress(i+1, len(search_space))
    if is_Tom_Swifty(' '.join(task1[i]['words']), m):
        all_res.append(('tom', 1))
        continue
    if candidates == 'miss':
        all_res.append(('miss', 0))
        continue
    max_score = 0
    max_word = ''
    for original_word, replacements in candidates.items():
        og_stem = stemmer.stem(original_word)
        for subs in replacements:
            # A replacement with the same stem is just an inflection of the
            # original word, not a different word.
            if stemmer.stem(subs[0]) == og_stem:
                continue
            # The position term is fixed at 0 in this variant; the word
            # position the original computed here was never used, so it has
            # been dropped.
            candidate_score = switch_score(subs[1], subs[2], 0)
            if candidate_score > max_score:
                max_score = candidate_score
                max_word = subs[0]
    all_res.append((max_word, max_score))
    

In [None]:
# Print every prediction next to the text of its context for manual review.
# NOTE(review): this prints one line per context; consider limiting the range.
for i, res in enumerate(all_res):
    print(i, res, ' '.join(task1[i]['words']))

In [None]:
# Inspect the search-space entry for context 1762.
search_space[1762]

In [None]:
# Compare the closest-sounding words for 'ordure' with the `cmu` entry for 'order'.
# NOTE(review): `cmu` is never assigned in the cells visible here (only
# `load_cmu` is imported) - confirm where it comes from.
get_closest_sounding_words('ordure'), cmu['order']

In [None]:
# Debug aid: print the switch score of every replacement for the words of
# context 2, using the distance of the word from the end of the context as the
# position term.
# NOTE(review): `baby_oil` is not defined in the cells visible here -
# presumably a search-space entry for context 2; confirm.
for original_word, subs in baby_oil.items():
    # These two are reset for every word but never read in this cell.
    max_word, max_score = '', 0
    print(original_word, subs)
    pos = len(task1[2]['words']) - task1[2]['words'].index(original_word)
    for sub in subs:
        print(switch_score(sub[1], sub[2], pos))

In [None]:
# Evaluate the full pipeline: a context is predicted to be a pun when its best
# score is positive (detected Tom Swifties carry a score of 1).
tp, fp, tn, fn = 0, 0, 0, 0

for i, result in enumerate(all_res):
    if result[1] > 0:
        if task1[i]['pun']:
            tp += 1
        else:
            fp += 1
    else:
        if not task1[i]['pun']:
            tn += 1
        else:
            fn += 1

# Ratios with an empty denominator (no data, no predicted positives, no gold
# positives) are reported as 0.0 instead of raising ZeroDivisionError.
acc = (tp + tn) / len(all_res) if all_res else 0.0
prec = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = (2*(recall*prec)) / (recall + prec) if (recall + prec) else 0.0
print(tp, tn, fp, fn)
print("Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(acc, prec, recall, f1))

In [None]:
# Spot-check the similarity the loaded word-vector model assigns to a word pair.
m.similarity('ledge', 'mountain')

In [None]:
for i in range(len(search_space)):
    