In [1]:
import json
from src.data_processing import print_progress, scores_as_list, load_data, load_cmu
from src.pronunciations import phonetic_distance
import numpy as np
from gensim import models
from src.pun_algorithms import is_Tom_Swifty


from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
task1, task2, task3, min_pairs, strings, pun_strings = load_data()

In [3]:
substitutions = scores_as_list('all_trigram_with_pos')

returning from json


In [4]:
stemmer = PorterStemmer()
lm = WordNetLemmatizer()

with open("results/tom_swifties.json") as f:
    tsa = json.load(f)
    
with open("/home/doogy/Data/semeval2017_task7/data/test/subtask2-heterographic-test.gold") as f:
    t2_indices = [int(l.split()[0].split('_')[1])-1 for l in f.readlines()]
    
with open("/home/doogy/Data/semeval2017_task7/data/test/subtask3-heterographic-test.gold") as f:
    t3_indices = [int(l.split()[0].split('_')[1])-1 for l in f.readlines()]
    
tom_swifty_annotations = []
with open("data/t1-t3-mappings.json") as f:
    mappings = [int(k) for k in json.load(f)]
    
for ind, ts in zip(t2_indices, tsa):
    if ind in mappings:
        tom_swifty_annotations.append(ts)

for t, t3_ind in zip(task3, t3_indices):
    stems = [stemmer.stem(w) for w in t[1] if stemmer.stem(w) not in t[1]]
    t[1].extend(stems)
    lemmas = [lm.lemmatize(w) for w in t[1] if lm.lemmatize(w) not in t[1]]
    t[1].extend(lemmas)
            
tom_rankings = []
for ts in tom_swifty_annotations:
    if ts:
        derivatives = []
        # GET DERIVATES FUNCTION WORKING BETTER 
        for w in [t[0][0] for t in ts[0][1]]:
            s = set([w])
            s.update([stemmer.stem(w), lm.lemmatize(w)])
            derivatives.append(s)
        tom_rankings.append(derivatives)
    else:
        tom_rankings.append(False)

In [41]:
for i in range(len(t3_subs)):
    for j in range(len(t3_subs[i])):
        
        t3_subs[i][j] = list(t3_subs[i][j])
        t3_subs[i][j][1] = t3_subs[i][j][1][:25]

In [43]:
def get_rankings(substitutions):

    t3_subs = []
    for i in range(len(task1)):
        t3_subs.append(list(sorted(substitutions[i].items(), key=lambda x: x[1][0][1], reverse=True)))
    
    for i in range(len(t3_subs)):
        for j in range(len(t3_subs[i])):
        
            t3_subs[i][j] = list(t3_subs[i][j])
            t3_subs[i][j][1] = t3_subs[i][j][1][:25]

    sub_rankings = []
    for i, subs in enumerate(t3_subs):
        print_progress(i, len(t3_subs))
        ranked_subs = {}
        for sub in subs:
            derivations = []
            for w in [s[0].split()[1] for s in sub[1]]:
                s = set([w])
                s.update([stemmer.stem(w), lm.lemmatize(w)])
                derivations.append(s)
            ranked_subs[sub[0].split()[1]] = derivations

        sub_rankings.append(ranked_subs)
    return sub_rankings

In [56]:
def measures(sub_rankings):

    guesses, tp = 0, 0
    mrrank = 0
    mrank = 0

    for i, ind in enumerate(mappings):

        target = task3[i][0]
        list_to_use = None
        translation = None

        if tom_swifty_annotations[i]:
            translation = tom_rankings[i][0]
            
            for r, ts in enumerate(tom_swifty_annotations[i]):
                if [w for w in task3[i][1] if w in ts]:
                    mrank += (r+1)
                    mrrank += 1/(r+1)

        else:
            for t in target[0]:
                if t in sub_rankings[ind]:
                    translation = sub_rankings[ind][t][0]
            if not translation:
                max_score, max_k = 0, 0
                for k in sub_rankings[ind]:
                    for t in target:
                        score = phonetic_distance(k, t)
                        if score > max_score:
                            max_score = score
                            max_k = k
            try:
                translation = sub_rankings[ind][max_k][0]
                for r, tr in enumerate(sub_rankings[ind][max_k]):
                    if [w for w in task3[i][1] if w in tr]:
                        mrank += (r+1)
                        mrrank += 1/(r+1)
            except:
                pass
            
        guesses += int(bool(translation))
        
        if translation:

            correct = [w for w in translation if w in task3[i][1]]
            tp += int(bool(correct))
    
    return tp/guesses, mrrank / len(task3), mrank / len(task3)

In [48]:
sub_rankings = get_rankings(substitutions)

 |████████████████████████████████████████████████████████████████████████████████████████████████████| 99.9% 

In [57]:
measures(sub_rankings)

(0.4966313763233879, 0.5315365449213408, 4.2996357012750455)

In [166]:
' '.join(task1[184]['words']), t3_subs[132]

('They say curiosity killed the cat , and they were not kitten .',
 [('not kitten .',
   [['not written .', 1874.3118548676932],
    ['not listen .', 1114.0892927999996],
    ['not given .', 1036.8747612553843],
    ['not return .', 893.5109515622345],
    ['not listed .', 478.59915865384613],
    ['not certain .', 449.04968661737627],
    ['not enough .', 400.64737910153866],
    ['not permitted .', 370.9502701093974],
    ['not common .', 289.5034353377438],
    ['not hidden .', 249.2097939692307],
    ['not returned .', 231.96514423076923],
    ['not happen .', 180.83930099927463],
    ['not counted .', 142.26292067307693],
    ['not mistaken .', 133.86418269230768],
    ['not consistent .', 123.82271634615384],
    ['not Christian .', 116.44202168937363],
    ['not eaten .', 112.04847537230766],
    ['not committed .', 107.5826163216815],
    ['not critical .', 103.14483173076923],
    ['not determined .', 99.16856971153847],
    ['not women .', 95.16445853538458],
    ['not begun 

In [146]:
from src.ngrams import ngram_frequency

ngram_frequency('to compliment her'.split())

5545

In [84]:
pprint(sub_rankings[4])

{'gnus': ['news',
          'ones',
          'guys',
          'guns',
          'puns',
          'manners',
          'attitude',
          'reviews',
          'attitudes',
          'pun',
          'bugs',
          'genes',
          'money',
          'month',
          'run',
          'cruise',
          'bunch',
          'shoes',
          'group',
          'son',
          'lines',
          'governance',
          'signs',
          'guy',
          'config',
          'tonight',
          'grades',
          'new',
          'bands',
          'knees',
          'luck',
          'sunburn',
          'girls',
          'country',
          'Japanese',
          'things',
          'mothers',
          'sign',
          'values',
          'experiences',
          'cantonese',
          'games',
          'companies',
          'conditions',
          'loans',
          'numbers',
          'solution',
          'pregnancy',
          'dancers',
          'clusters',
   

In [None]:
guesses, tp = 0, 0

for i, ind in enumerate(mappings):
    guesses += 1
    
    target = task3[i][0]
    list_to_use = None
    translation = None
    
    if tom_swifty_annotations[i]:
        translation = tom_swifty_annotations[i][0][1][0][0][0]
    else:
        for t in target:
            for og_trigram in t3_subs[ind]:
                if t == og_trigram[0].split()[1]:
                    translation = og_trigram[1][0][0].split()[1]
                    break
        if not translation:
            try:
                translation = t3_subs[ind][0][1][0][0].split()[1]
            except:
                guesses -= 1
        
    tp += int(translation in task3[i][1])
#     if translation not in task3[i][1]:
#         print(ind, task3[i], translation)
    
print(tp / guesses)

In [167]:
pprint(task1[48]), pprint(t3_subs[48])

{'pun': True,
 'words': ['When',
           'his',
           'clothes',
           'dryer',
           'broke',
           'he',
           'was',
           'lint',
           'another',
           'one',
           '.']}
[('was lint another',
  [['was sent another', 1.2876420454545454],
   ['was just another', 1.2796603116122158],
   ['was yet another', 0.6582502885298296],
   ['was built another', 0.19886363636363635],
   ['was planning another', 0.12802926383173308],
   ['was denied another', 0.0909163054545455],
   ['was finished another', 0.0833189256262874],
   ['was lost another', 0.06881280000000004],
   ['was clearly another', 0.0562552825927312],
   ['was living another', 0.05463319272727275],
   ['was published another', 0.03419787636363638],
   ['was not another', 0.028828392727272674],
   ['was dealt another', 0.025566794655539772],
   ['was learning another', 0.023771694545454556],
   ['was given another', 0.016463592727272696],
   ['was quite another', 0.01633000727272

(None, None)

In [None]:
for t in t3_subs[0]:
    print(t[0])

In [None]:
t3_subs[0][0][1][0][0]

In [None]:
task2[16]

In [None]:
min_pairs[16]

In [None]:
for i, c in enumerate(task2):
    print(' '.join(c['words']), task3[i])

In [None]:
mappings = []
for i, pair in enumerate(task3):
    
    original_word = task3[i][0][0]
    count = 0
    single = 0
    prompt = ""
    for j in range(len(task2)):
        if original_word in ''.join(task2[j]['words']).lower():
            prompt += str(j) + "\t" + ' '.join(task2[j]['words']) + "\n"
            count += 1
            single = j
    if count == 1:
        mappings.append((single, i))
    else:
        print("PAIR: ", task3[i])
        print(prompt)
        mappings.append((int(input()), i))
        print("\n\n\n\n")

In [None]:
for c in task2:
    if 'coolly' in c['words']:
        print(c)

In [None]:
for i, index in enumerate(mappings):
    print(min_pairs[i], ' '.join(task1[index]['words']))

In [None]:
t2_indices