In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules (e.g. the local `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import json
from src.data_processing import load_data
from src.pronunciations import get_closest_sounding_words
from nltk import word_tokenize, pos_tag


In [None]:
# Dataset splits and helper collections returned by the project loader.
task1, task2, task3, min_pairs, pun_strings, strings = load_data()

# Trigram frequency table; it is indexed in parallel with `task1` further down.
with open("data/ngram_frequencies.json") as ngram_file:
    ngram_frequencies = json.load(ngram_file)

In [None]:
# Keep only the frequency entries (and the matching `task1` items) in which
# no trigram contains 'Tom'.
no_toms = []
task1_no_toms = []
for position, frequencies in enumerate(ngram_frequencies):
    trigrams = [entry[0] for entry in frequencies]
    if any('Tom' in trigram for trigram in trigrams):
        continue
    no_toms.append(frequencies)
    task1_no_toms.append(task1[position])

In [None]:
# Peek at the first filtered frequency entry next to its task1 context.
no_toms[0], task1_no_toms[0]

In [None]:
def pos_of_trigram(context, trigram):
    """Return the universal POS tag of the middle word of ``trigram``.

    The words in ``context['words']`` are joined with spaces, re-tokenized
    with ``word_tokenize`` and tagged with ``pos_tag(..., tagset='universal')``.

    The middle word is located by first searching for the whole trigram as a
    contiguous run of tokens, so a word that occurs several times in the
    sentence is tagged at the position the trigram refers to. If the full
    trigram cannot be found among the tokens, the first occurrence of the
    middle word is used instead.

    Args:
        context: mapping with a ``'words'`` entry holding the sentence words.
        trigram: sequence of words; ``trigram[1]`` is the target word.

    Returns:
        The POS tag string of the target word.

    Raises:
        ValueError: if the middle word is not among the tokens at all.
    """
    middle = trigram[1]
    text = word_tokenize(' '.join(context['words']))
    pos_text = pos_tag(text, tagset='universal')

    # Prefer the occurrence of the middle word that sits inside the trigram.
    window = list(trigram)
    size = len(window)
    target_index = None
    for start in range(len(text) - size + 1):
        if text[start:start + size] == window:
            target_index = start + 1
            break

    # Fall back to the first occurrence of the middle word.
    if target_index is None:
        target_index = text.index(middle)

    return pos_text[target_index][1]

In [None]:
# Take a context, look at all trigrams with frequency at most `threshold`,
# and print similar-sounding words for their middle word.
normal_pos = {'ADJ', 'ADV', 'NOUN', 'VERB'}
def score(index, threshold=0):
    """Print pun candidates for the context at ``index``.

    Looks up ``ngram_frequencies[index]`` and ``task1[index]``, prints the
    frequency list, then for every ``(trigram, freq)`` pair with
    ``freq <= threshold`` whose middle word has a tag in ``normal_pos``
    prints the trigram, that tag, and the result of
    ``get_closest_sounding_words`` for the middle word.

    Args:
        index: position into ``ngram_frequencies`` / ``task1``.
        threshold: maximum trigram frequency to consider (inclusive).

    Returns:
        None; all results are printed.
    """
    frequencies, context = ngram_frequencies[index], task1[index]
    print(frequencies)
    for trigram, freq in frequencies:
        if freq <= threshold:
            # Tag once per trigram: pos_of_trigram re-tokenizes and re-tags
            # the whole sentence on every call.
            target_pos = pos_of_trigram(context, trigram)
            if target_pos in normal_pos:
                print(trigram)
                print(target_pos)
                print(get_closest_sounding_words(trigram[1]))
            

In [None]:
# Print candidates for context 6, considering trigrams with frequency <= 10000.
score(6, 10000)
# task2[:6]

In [None]:
# Similar-sounding candidates for 'harried' with share_first_letter=True
# (presumably limits results to words starting with the same letter --
# confirm in src.pronunciations).
get_closest_sounding_words('harried', share_first_letter=True)

In [None]:
import logging
# Emit INFO-level log records with timestamp, level and message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [None]:
import gensim

In [None]:
import os

# Location of the pretrained GoogleNews word2vec binary. Set the
# WORD2VEC_PATH environment variable to point at a local copy; the default
# is the path this notebook was originally run with.
WORD2VEC_PATH = os.environ.get(
    'WORD2VEC_PATH',
    '/home/doogy/Data/GoogleNews-vectors-negative300.bin.gz',
)
model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:
# Quick sanity check that the loaded vectors answer similarity queries.
model.similarity('hello', 'goodbye')

In [173]:
from nltk.stem import LancasterStemmer
from src.pronunciations import phonetic_translation
from src.data_processing import load_cmu
from collections import defaultdict
# Pronunciation dictionary from the project loader; used below as a mapping
# from a word to a list of pronunciations, each a list of phoneme strings.
cmu = load_cmu()
# Shared Lancaster stemmer instance used by prefixes() and the scratch cells.
stemmer = LancasterStemmer()

def prefixes(word, threshold=None):
    """Find dictionary words whose pronunciation shares a prefix with ``word``.

    For each prefix length ``i`` from 1 to ``len(phonetics) - 1``, every entry
    in ``cmu`` whose first pronunciation starts with the first ``i`` phonemes
    of ``word`` is collected when either

    * ``i >= threshold``, or
    * the entry's pronunciation is exactly ``i`` phonemes long, i.e. the whole
      entry is a prefix of ``word``.

    Each entry is recorded once, at the smallest ``i`` where it qualifies.

    Args:
        word: the word to find phonetic prefixes for.
        threshold: minimum shared prefix length. When ``None`` it is derived
            from the Lancaster stem of ``word``: the length of the stem's
            phonetic translation if the stem is in ``cmu``, otherwise that
            length minus one. An explicit ``0`` is honoured as given.

    Returns:
        ``(ret, seen)`` where ``ret`` is a ``defaultdict(list)`` mapping prefix
        length to the words recorded at that length, and ``seen`` is the set
        of all recorded words (``word`` itself excluded).
    """
    # `is None` rather than truthiness, so an explicit threshold of 0 is kept.
    if threshold is None:
        stem = stemmer.stem(word)
        translation = phonetic_translation(stem)
        if stem in cmu:
            threshold = len(translation)
        else:
            threshold = len(translation) - 1

    print(threshold)
    ret = defaultdict(list)
    phonetics = phonetic_translation(word)
    # Seed with the word itself so it is never reported as its own match.
    seen = {word}

    for i in range(1, len(phonetics)):
        # The prefix is the same for every dictionary entry; slice it once.
        prefix = phonetics[:i]
        for k, v in cmu.items():
            # Only the first listed pronunciation of each entry is compared.
            if k in seen or v[0][:i] != prefix:
                continue
            # Long enough shared prefix, or the entry is a perfect prefix.
            if i >= threshold or len(v[0]) == i:
                ret[i].append(k)
                seen.add(k)
    seen.remove(word)
    return ret, seen

In [174]:
# Example run; the leading number in the output is the threshold printed by prefixes().
prefixes('saucily')

4


(defaultdict(list, {2: ['saw'], 3: ['sauce', 'soss'], 4: ['sausages']}),
 {'sauce', 'sausages', 'saw', 'soss'})

In [168]:
# Check that the Lancaster stem of 'saucily' has an entry in the pronunciation dictionary.
cmu['saucy'], stemmer.stem('saucily')

([['S', 'AO', 'S', 'IY']], 'saucy')

In [None]:
# prefixes() returns (matches_by_prefix_length, all_matches); indexing the
# tuple itself with 3 raises IndexError, so unpack first and then look up
# the words recorded at prefix length 3.
whined_by_length, whined_all = prefixes('whined')
whined = whined_by_length[3]

In [None]:
from nltk.tokenize import word_tokenize

def word_sentence_similarity(word, sentence, tokenize=False):
    """Find the token in ``sentence`` most similar to ``word``.

    Args:
        word: the query word.
        sentence: an iterable of tokens, or a raw string when ``tokenize`` is
            true.
        tokenize: when true, ``sentence`` is first split with
            ``word_tokenize``.

    Returns:
        ``(pair, score)`` where ``pair`` is ``(word, best_token)`` and
        ``score`` is ``model.similarity(word, best_token)``. Returns
        ``(None, -1)`` when ``word`` is not in the embedding vocabulary or no
        token scores above -1.
    """
    # Membership on the KeyedVectors object works on gensim 3.x and 4.x;
    # the `.vocab` attribute was removed in gensim 4.
    if word not in model:
        return None, -1

    if tokenize:
        sentence = word_tokenize(sentence)
    max_score = -1
    max_pair = None
    for w in sentence:
        if w in model:
            # Named `similarity` so the notebook-level `score` function is
            # not shadowed inside this function.
            similarity = model.similarity(word, w)
            if similarity > max_score:
                max_score = similarity
                max_pair = (word, w)
    return max_pair, max_score
            

In [189]:
from nltk import pos_tag

def is_Tom_Swifty(sentence):
    """Heuristically test a sentence for the Tom Swifty pattern.

    A Tom Swifty is expected to end in ``[PROPER NOUN said ADVERB .]``. The
    sentence is tokenized and POS-tagged; past-tense verbs (``VBD``) and
    adverbs (``RB``) after the last proper noun (``NNP``) are the pun
    candidates. Each candidate is expanded with ``prefixes(candidate, 3)``
    and every resulting word is compared, via ``word_sentence_similarity``,
    against the remaining non-stopword tokens of the sentence.

    Args:
        sentence: the raw sentence string.

    Returns:
        ``False`` when no ``NNP`` tag occurs among the last five tokens, or
        when no ``VBD``/``RB`` token follows the last ``NNP``. Otherwise
        ``(best_pair, max_score)``: the highest-scoring pair reported by
        ``word_sentence_similarity`` and its score, or ``(None, -1)`` when
        nothing scored above -1.
    """
    # Imported here because the notebook only imports it in a later cell.
    from nltk.corpus import stopwords

    sentence = word_tokenize(sentence)
    pos = pos_tag(sentence)
    words = [p[0] for p in pos]
    tags = [p[1] for p in pos]

    if 'NNP' not in tags[-5:]:
        return False

    # Index of the last proper noun. Searching the reversed list also covers
    # index 0, which a countdown loop stopping at 1 would miss.
    noun_position = len(tags) - 1 - tags[::-1].index('NNP')

    candidates = [
        words[i]
        for i in range(noun_position + 1, len(tags))
        if tags[i] in {'VBD', 'RB'}
    ]

    # If no word after the proper noun is an adverb or past-tense verb, give up.
    if len(candidates) == 0:
        return False

    prefs = []
    for candidate in candidates:
        _, similar_words = prefixes(candidate, 3)
        prefs.extend(similar_words)

    # Remove stopwords and the candidate words from the search sentence.
    # The stopword list is built once instead of once per token.
    english_stopwords = set(stopwords.words('english'))
    search_sentence = [
        w for w in sentence
        if w.lower() not in english_stopwords and w not in candidates
    ]

    max_score = -1
    best_pair = None
    for word in prefs:
        pair, similarity = word_sentence_similarity(word, search_sentence)
        if similarity > max_score:
            max_score = similarity
            best_pair = pair
    return best_pair, max_score

In [188]:
# Example: compare 'sauce' against each token of a raw sentence (tokenize=True).
word_sentence_similarity('sauce', '"This pizza place is great!" Tom exclaimed saucily.', True)

(('sauce', 'pizza'), 0.36959888341468228)

In [195]:
# Example run; the numbers printed before the result come from the
# print(threshold) call inside prefixes(), once per candidate word.
is_Tom_Swifty("'How many lambs are on your farm?' Tom asked sheepishly.")

3
3


(('sheep', 'lambs'), 0.71513635423071409)

In [192]:
# Inspect how far the Lancaster stemmer cuts 'punctually' (recorded output: 'punct').
stemmer.stem('punctually')

'punct'

In [158]:
# NOTE(review): the recorded False comes from execution 158, which predates
# the prefixes() definition above (execution 173). The later recorded output
# of prefixes('saucily') does include 'sauce', so re-run this cell.
'sauce' in prefixes('saucily')[1]

False

In [121]:
# Compare the phoneme sequence of 'gratingly' with its Lancaster stem.
phonetic_translation('gratingly'), stemmer.stem('gratingly')

(['G', 'R', 'EY', 'T', 'IH', 'NG', 'L', 'IY'], 'grat')

In [None]:
# Look up the pronunciation entry for 'bluntly'.
cmu['bluntly']

In [176]:
# Lancaster stem of 'sheepishly' (recorded output: 'sheep').
stemmer.stem('sheepishly')

'sheep'

In [None]:
from nltk.stem import LancasterStemmer
# NOTE(review): second LancasterStemmer instance; an equivalent `stemmer`
# is already created in an earlier cell and could be reused here.
lstemmer = LancasterStemmer()
lstemmer.stem('bluntly')

In [None]:
from nltk.corpus import stopwords

In [None]:
# Display the English stopword list (requires the NLTK 'stopwords' corpus to be downloaded).
stopwords.words('english')

In [None]:
import nltk

In [None]:
# Fetch the NLTK stopwords corpus.
# NOTE(review): on a fresh environment this must run before any cell that
# calls stopwords.words(...), yet it sits after them in the notebook.
nltk.download('stopwords')

In [133]:
# Similar-sounding candidates for 'saucily', for comparison with prefixes('saucily').
get_closest_sounding_words('saucily')

{'civilly',
 'haughtily',
 'icily',
 'saucy',
 'sawmill',
 'sawmills',
 'sicily',
 'silly',
 'softly',
 'sorely'}