In [1]:
from src.data_processing import load_data
import itertools
import string

from src.ngrams import *
from src.string_similarity import levenshtein
import operator
from src.data_processing import print_progress
from nltk import word_tokenize, pos_tag
from src.data_processing import load_cmu
from src.ipatoarpabet import translate
from string import punctuation
from src.pronunciations import phonetic_distance
import os
from pattern.en import lexeme
from src.pronunciations import get_closest_sounding_words as csw

In [2]:
from src.pun_algorithms import *

Loading Model, this could take a while...


In [3]:
# Read the precomputed n-gram search space (one entry per context) from JSON.
with open("data/ngram_searchspace/ngram_totals2.json") as search_space_file:
    search_space = json.load(search_space_file)

In [4]:
len(search_space)

1780

(66639115, 7263)

In [10]:
task1, task2, task3, min_pairs, strings, pun_strings = load_data()

In [11]:
def classification_accuracy(run, gold=None):
    """Score binary pun predictions against gold labels.

    Parameters
    ----------
    run : sequence
        One prediction per context; each entry is interpreted by truthiness.
    gold : sequence of dict, optional
        Gold contexts, each providing a 'pun' entry (interpreted by
        truthiness). Defaults to the notebook-level ``task1``.

    Returns
    -------
    dict
        Keys 'acc', 'recall', 'prec' and 'f1'. Any metric whose denominator
        is zero (e.g. no predicted positives) is reported as 0.0 instead of
        raising ZeroDivisionError.
    """
    if gold is None:
        gold = task1

    def ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than crashing the cell.
        return numerator / denominator if denominator else 0.0

    tp = fp = tn = fn = 0
    for i, context in enumerate(gold):
        predicted = bool(run[i])
        actual = bool(context['pun'])
        if predicted and actual:
            tp += 1
        elif predicted:
            fp += 1
        elif actual:
            fn += 1
        else:
            tn += 1

    results = {}
    results['acc'] = ratio(tp + tn, len(gold))
    results['recall'] = ratio(tp, tp + fn)
    results['prec'] = ratio(tp, tp + fp)
    results['f1'] = ratio(2 * (results['recall'] * results['prec']),
                          results['recall'] + results['prec'])

    return results

# No Trigram Baseline

In [12]:
def no_trigram_baseline(index):
    """Predict 'pun' for context `index` if any of its trigrams has an
    'original_frequency' of 0 in the search space; otherwise predict False."""
    return any(
        entry['original_frequency'] == 0
        for entry in search_space[index].values()
    )

In [13]:
results = [no_trigram_baseline(index) for index in range(len(search_space))]

In [14]:
classification_accuracy(results)

{'acc': 0.7825842696629214,
 'f1': 0.8457552809884415,
 'prec': 0.8570274636510501,
 'recall': 0.8347757671125098}

In [11]:
ngram_frequency('a staring contest'.split())

4058

# No Quadgram Baseline

In [10]:
# Collect every distinct run of four consecutive words across the task 1 contexts.
all_quadgrams_in_task1 = set()
for context in task1:
    tokens = context['words']
    all_quadgrams_in_task1.update(
        ' '.join(tokens[start:start + 4])
        for start in range(len(tokens) - 3)
    )

In [11]:
all_quadgrams_in_task1 = list(sorted(all_quadgrams_in_task1))

In [None]:
# Bucket the quadgrams by the first three characters of their first word.
groupings = defaultdict(list)
for quadgram in all_quadgrams_in_task1:
    first_word = quadgram.split()[0]
    prefix = first_word[:3]
    groupings[prefix].append(quadgram)

In [103]:
def quad_to_dict(text):
    """Parse newline-separated '<w1> <w2> <w3> <w4> <count>' records.

    Returns a dict mapping the space-joined first four fields of each line
    to the integer value of its fifth field. Lines with fewer than five
    whitespace-separated fields, or whose fifth field is not an integer,
    are skipped.
    """
    ret = {}
    for line in text.split('\n'):
        fields = line.split()
        try:
            ret[' '.join(fields[:4])] = int(fields[4])
        except (IndexError, ValueError):
            # Malformed record (too short / non-numeric count): skip it, but
            # let anything else (e.g. KeyboardInterrupt) propagate.
            continue
    return ret

In [112]:
# Look up the frequency of every task 1 quadgram, one prefix group at a time.
quadgram_frequencies = {}
for i, beginning_letters in enumerate(groupings):
    try:
        ngram_output = subprocess.check_output(
            ['zcat', get_gram_file(beginning_letters, 4)]).decode('latin-1')
    except Exception:
        # Best effort: if this prefix cannot be read, skip it entirely.
        # Falling through would reuse the previous group's output (or hit a
        # NameError on the very first iteration).
        print_progress(i, len(groupings))
        continue
    quad_dict = quad_to_dict(ngram_output)
    for subquads in groupings[beginning_letters]:
        # Quadgrams absent from the parsed output are simply left out.
        if subquads in quad_dict:
            quadgram_frequencies[subquads] = quad_dict[subquads]
    print_progress(i, len(groupings))

 |████████████████████████████████████████████████████████████████████████████████████████████████████| 99.9% 

In [115]:
# Replace the in-memory quadgram frequencies with the copy stored as JSON.
with open("data/quadgram_frequencies.json") as frequencies_file:
    quadgram_frequencies = json.load(frequencies_file)

In [13]:
def no_quadgram_baseline(index):
    """Predict 'pun' for context `index` if any run of four consecutive
    words is missing from `quadgram_frequencies`; otherwise predict False."""
    words = task1[index]['words']
    return any(
        ' '.join(words[start:start + 4]) not in quadgram_frequencies
        for start in range(len(words) - 3)
    )

In [137]:
results = [no_quadgram_baseline(i) for i in range(len(task1))]

In [138]:
classification_accuracy(results)

{'acc': 0.7893258426966292,
 'f1': 0.8700173310225303,
 'prec': 0.7775712515489467,
 'recall': 0.987411487018096}

In [11]:
# Flatten the search space into a single trigram -> frequency lookup.
# Later entries overwrite earlier ones; trigrams never seen default to 0.
all_frequencies = defaultdict(int)
for context in search_space:
    for original_trigram, entry in context.items():
        all_frequencies[original_trigram] = entry['original_frequency']
        all_frequencies.update(entry['substitutions'])

# All Trigrams, No Position

In [5]:
def score(original_frequency, new_frequency, original_word, new_word, position, ph_penalty=2):
    """Score a substitution.

    The result is the frequency gain (new minus original) multiplied by the
    phonetic distance between the two words raised to `ph_penalty`, and by
    `position`. Callers in this notebook pass either a position normalised
    by context length or the constant 1.
    """
    frequency_gain = new_frequency - original_frequency
    sound_term = phonetic_distance(original_word, new_word, translated=True) ** ph_penalty
    return frequency_gain * (sound_term * position)

In [6]:
def single_score(original_trigram, new_trigram, ph_penalty=2):
    """Score one trigram substitution with the position weight fixed to 1.

    Frequencies come from `all_frequencies`; the words compared are the
    middle (second) tokens of the two trigrams.
    """
    freq_before = all_frequencies[original_trigram]
    freq_after = all_frequencies[new_trigram]
    middle_before = original_trigram.split()[1]
    middle_after = new_trigram.split()[1]
    return score(freq_before, freq_after, middle_before, middle_after, 1, ph_penalty)

In [7]:
def sort_answers(unsorted_dict):
    """Turn each inner {item: score} dict into a list of (item, score)
    pairs ordered by descending score, keyed as in the input."""
    by_score = operator.itemgetter(1)
    return {
        key: sorted(scores.items(), key=by_score, reverse=True)
        for key, scores in unsorted_dict.items()
    }

In [8]:
accepted_pos = {'ADV', 'ADJ', 'VERB', 'NOUN'}


def _locate_middle_word(context, trigram_words):
    """Return the index in `context` of the trigram's middle word.

    Prefers the place where the whole trigram occurs as a window, so that a
    middle word which also appears earlier in the context is not mistaken
    for the trigram occurrence. If the exact window is not found, falls back
    to the first occurrence of the middle word (raises ValueError if the
    word is absent).
    """
    width = len(trigram_words)
    for start in range(len(context) - width + 1):
        if list(context[start:start + width]) == trigram_words:
            return start + 1
    return context.index(trigram_words[1])


def rank_substitutions(index):
    """Score every candidate substitution for context `index`.

    Reads the notebook-level settings `use_position`, `use_filter` and
    `penalty`, plus `search_space`, `task1` and `cmu`.

    A substitution is skipped when its middle word:
      * is not among `csw(original_word)` (only when `use_filter` is set),
      * is one of `lexeme(original_word)`,
      * is missing from `cmu`, or contains punctuation,
      * is not tagged as one of `accepted_pos` in the substituted context.
    Trigrams whose original middle word is missing from `cmu` are skipped
    entirely.

    Returns a dict mapping each trigram to a list of (substitution, score)
    pairs sorted by descending score (see `sort_answers`).
    """
    space = search_space[index]
    context = task1[index]['words']

    res = defaultdict(dict)
    context_length = len(context)

    for trigram, candidate in space.items():
        trigram_words = trigram.split()
        original_word = trigram_words[1]

        # Index of the trigram's middle word within the context.
        position = _locate_middle_word(context, trigram_words)

        # Normalise position by context length, or neutralise it with 1.
        if use_position:
            normal_position = position / context_length
        else:
            normal_position = 1

        original_freq = candidate['original_frequency']

        # Skip original words that have no pronunciation entry.
        if original_word not in cmu:
            continue
        original_ph = cmu[original_word][0]

        if use_filter:
            phoneme_filter = set(csw(original_word))

        lexemes = lexeme(original_word)

        for sub, new_freq in candidate['substitutions'].items():
            sub_words = sub.split()
            new_word = sub_words[1]

            if use_filter and new_word not in phoneme_filter:
                continue

            # Ignore lexical derivatives of the original word.
            if new_word in lexemes:
                continue

            # Skip substitutes that have no pronunciation entry.
            if new_word not in cmu:
                continue
            new_ph = cmu[new_word][0]

            if any(ch in string.punctuation for ch in new_word):
                continue

            # Only now pay for the context copy and the POS tagger.
            # NOTE(review): if the fallback in _locate_middle_word yields
            # position 0, this slice starts at -1 as it did originally.
            new_context = list(context)
            new_context[position-1:position+2] = sub_words

            tags = [tag for _, tag in pos_tag(new_context, tagset='universal')]
            if tags[position] not in accepted_pos:
                continue

            res[trigram][sub] = score(original_freq,
                                      new_freq,
                                      original_ph,
                                      new_ph,
                                      normal_position,
                                      ph_penalty=penalty)

    return sort_answers(res)

In [None]:
import time
from multiprocessing import Pool

# Experiment settings read by rank_substitutions.
use_position = True
use_filter = False
penalty=8

before = time.time()
# The context manager shuts the worker processes down once map() has
# returned, instead of leaving the pool open for the rest of the session.
with Pool(4) as p:
    ngram_search_space = p.map(rank_substitutions, range(len(task1)))
length = time.time() - before

print("Total time taken in seconds: {}".format(length))

In [9]:
# Persist the ranked substitutions for every context as JSON.
with open("results/phonetic_filter_with_pos-ph-8-no-contractions", 'w') as results_file:
    json.dump(ngram_search_space, results_file)

## use_position = True
use_filter = True
path = "phonetic_filter_no_pos"
rank_substitutions(8)

## Phonetic Generation, Running only on puns, no Tom Swifties

In [None]:
# Drop Tom Swifties from the data set, identified solely by the token 'Tom',
# keeping the search space aligned with the remaining contexts.
t1_no_toms = []
no_toms_search_space = []
for i, p in enumerate(task1):
    if 'Tom' in p['words']:
        continue
    t1_no_toms.append(p)
    no_toms_search_space.append(search_space[i])

In [None]:
def switch_score(distance, frequency_difference, position):
    """Return `frequency_difference` divided by (distance squared + 1 + position)."""
    denominator = distance ** 2 + 1 + position
    return frequency_difference / denominator

In [None]:
no_toms_search_space

In [None]:
# For every context keep the single best-scoring substitution.
# Loop-local names deliberately avoid `score` and `results`: rebinding
# `score` would overwrite the score() function used by rank_substitutions,
# and `results` would overwrite the earlier list of predictions.
all_res = []
for i, candidates in enumerate(no_toms_search_space):
    if candidates == 'miss':
        all_res.append(('miss', 0))
        continue
    max_score = 0
    max_word = ''
    words = t1_no_toms[i]['words']
    for original_word, replacements in candidates.items():
        # Distance of the original word from the end of the context.
        pos = len(words) - words.index(original_word)
        for subs in replacements:
            candidate_score = switch_score(subs[1], subs[2], pos)
            if candidate_score > max_score:
                max_score = candidate_score
                max_word = subs[0]
    all_res.append((max_word, max_score))

In [None]:
# Confusion counts: a context is predicted 'pun' when its best score exceeds 10.
tp, fp, tn, fn = 0, 0, 0, 0

for i, result in enumerate(all_res):
    if result[1] > 10:
        if t1_no_toms[i]['pun']:
            tp += 1
        else:
            fp += 1
    else:
        if not t1_no_toms[i]['pun']:
            tn += 1
        else:
            fn += 1

# Precision divides by predicted positives (tp + fp), recall by gold
# positives (tp + fn); these two were previously swapped. Undefined ratios
# are reported as 0.0 rather than raising ZeroDivisionError.
acc = (tp + tn) / len(all_res) if all_res else 0.0
prec = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = (2*(recall*prec)) / (recall + prec) if (recall + prec) else 0.0
print(tp, tn, fp, fn)
print("Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(acc, prec, recall, f1))

## Using Phonetic Generation, Tom Detection + Language Model

In [None]:
# Path to the pretrained word2vec binary. Set the WORD2VEC_PATH environment
# variable to override the original hard-coded location.
word2vec_path = os.environ.get('WORD2VEC_PATH', '/home/doogy/Data/GoogleNews-vectors-negative300.bin.gz')
m = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
from nltk.stem import LancasterStemmer
# Stemmer used by the Tom-detection loop below to skip substitutions that
# share a stem with the original word.
stemmer = LancasterStemmer()
stemmer.stem('babies')  # quick look at the stemmer's output for one word

In [None]:
# Best substitution per context, with Tom Swifty detection first.
# Loop-local names deliberately avoid `score` and `results`: rebinding
# `score` would overwrite the score() function used by rank_substitutions,
# and `results` would overwrite the earlier list of predictions.
all_res = []
for i, candidates in enumerate(search_space):
    print_progress(i+1, len(search_space))
    if is_Tom_Swifty(' '.join(task1[i]['words']), m):
        all_res.append(('tom', 1))
        continue
    if candidates == 'miss':
        all_res.append(('miss', 0))
        continue
    max_score = 0
    max_word = ''
    for original_word, replacements in candidates.items():
        og_stem = stemmer.stem(original_word)
        for subs in replacements:
            # Skip substitutes that share a stem with the original word.
            if stemmer.stem(subs[0]) == og_stem:
                continue
            # Position is passed as 0 here, so it does not contribute to the
            # score; the previously computed (and unused) word position has
            # been dropped.
            candidate_score = switch_score(subs[1], subs[2], 0)
            if candidate_score > max_score:
                max_score = candidate_score
                max_word = subs[0]
    all_res.append((max_word, max_score))
    

In [None]:
for i, res in enumerate(all_res):
    print(i, res, ' '.join(task1[i]['words']))

In [None]:
search_space[1762]

In [None]:
# `csw` is the alias under which get_closest_sounding_words is imported above.
csw('ordure'), cmu['order']

In [None]:
# Print the switch_score of every candidate substitution for one example.
# NOTE(review): `baby_oil` is not defined in any visible cell; presumably it is
# the search-space entry for context 2, since positions come from task1[2] -- confirm.
for original_word, subs in baby_oil.items():
    max_word, max_score = '', 0  # assigned but never read in this cell
    print(original_word, subs)
    # Distance of the original word from the end of context 2.
    pos = len(task1[2]['words']) - task1[2]['words'].index(original_word)
    for sub in subs:
        print(switch_score(sub[1], sub[2], pos))

In [None]:
# Confusion counts: a context is predicted 'pun' when its best score is positive.
tp, fp, tn, fn = 0, 0, 0, 0

for i, result in enumerate(all_res):
    if result[1] > 0:
        if task1[i]['pun']:
            tp += 1
        else:
            fp += 1
    else:
        if not task1[i]['pun']:
            tn += 1
        else:
            fn += 1

# Undefined ratios (zero denominator) are reported as 0.0 rather than
# raising ZeroDivisionError.
acc = (tp + tn) / len(all_res) if all_res else 0.0
prec = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = (2*(recall*prec)) / (recall + prec) if (recall + prec) else 0.0
print(tp, tn, fp, fn)
print("Accuracy: {}\nPrecision: {}\nRecall: {}\nF1: {}".format(acc, prec, recall, f1))

In [None]:
m.similarity('ledge', 'mountain')

In [None]:
for i in range(len(search_space)):
    