In [1]:
import sys
import os
import re
import getopt

In [2]:
import evaluate
import wmmapping
import numpy
import pickle

 Using wmmapping_fixed.py


In [3]:
import learnconfig_fixed as learnconfig

In [4]:
import learn_fixed as learn

In [5]:
import sys

# Emulate the command line that main() parses; each row pairs a flag with
# the value passed for it.
sys.argv = (
    ['main.py']
    + ['-c', 'input_wn_fu_cs_scaled_categ.dev']
    + ['-l', 'all_catf_norm_prob_lexicon_cs.all']
    + ['-i', 'input_dir']
    + ['-o', 'output_dir']
    + ['-C', 'config.ini']
    + ['-s', 'stopwords.txt']
)

def _usage():
    """Print a one-line summary of the command-line options main() accepts."""
    print(
        "usage: main.py -c <corpus> -l <lexicon> -i <inputdir> -o <outputdir> "
        "-C <config> [-s <stopwords>] [-f <value>] [-h]"
    )


def main(argv=None):
    """
    Parse the command line, train a learner and write its result files.

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments *without* the program name. Defaults to
        ``sys.argv[1:]``, so calling ``main()`` with no argument behaves as a
        normal script entry point.

    Options
    -------
    -c/--corpus    path of a single corpus file to process
    -l/--lexicon   lexicon name handed to ``learn.Learner``
    -i/--inputdir  directory walked for ``*#<digits>.txt`` corpora when no
                   corpus is given
    -o/--output    directory that receives all output files
    -C/--config    configuration file handed to ``learnconfig.LearnerConfig``
    -s/--stop      stop-word file, one word per line
    -f/--f         accepted but ignored
    -h/--help      print usage and exit

    Exits with status 2 on an unparsable command line, and with status 0
    (after printing usage) when fewer than four options are given or help is
    requested.
    """
    if argv is None:
        argv = sys.argv[1:]

    long_options = ["help", "corpus=", "lexicon=", "inputdir=", "output=", "config=", "stop=", "f="]
    try:
        opts, _ = getopt.getopt(argv, "hc:l:i:o:C:s:f:", long_options)
    except getopt.GetoptError as msg:
        print(msg)
        _usage()
        sys.exit(2)

    if len(opts) < 4:
        _usage()
        sys.exit(0)

    corpus_path = ""
    stop = ""
    config_path = ""
    lexname = ""
    indir = ""
    outdir = ""

    for o, a in opts:
        if o in ("-h", "--help"):
            _usage()
            sys.exit(0)
        elif o in ("-c", "--corpus"):
            corpus_path = a
        elif o in ("-l", "--lexicon"):
            lexname = a
        elif o in ("-i", "--inputdir"):
            indir = a
        elif o in ("-o", "--output"):
            outdir = a
        elif o in ("-C", "--config"):
            config_path = a
        elif o in ("-s", "--stop"):
            stop = a
        elif o in ("-f", "--f"):
            # Recognised so that passing it is not an error; its value is unused.
            pass

    # The parsed values are used as given: they are no longer overwritten with
    # hard-coded paths, so the options actually take effect.
    learner_config = learnconfig.LearnerConfig(config_path)
    # Progress marker emitted once the configuration has been loaded.
    print("ww")

    stopwords = []
    if len(stop) > 2:
        with open(stop, 'r') as stopwords_file:
            # Each stop word is stored with a ":N" suffix
            # (presumably the noun tag used by the corpus -- TODO confirm).
            stopwords = [line.strip() + ":N" for line in stopwords_file]

    learner = learn.Learner(lexname, learner_config, stopwords)

    os.makedirs(outdir, exist_ok=True)

    if corpus_path == "":
        # No single corpus given: process every matching file under indir,
        # mirroring the input directory layout below outdir.
        for dirpath, _, filenames in os.walk(indir):
            for fname in filenames:
                if re.search(r"#\d+\.txt$", fname):
                    # Map the directory by its path relative to indir; a plain
                    # str.replace would also rewrite later occurrences of the
                    # indir text inside the path.
                    outdirpath = os.path.normpath(
                        os.path.join(outdir, os.path.relpath(dirpath, indir))
                    )
                    os.makedirs(outdirpath, exist_ok=True)
                    corpus = os.path.join(dirpath, fname)
                    print("Processing Corpus:", corpus)
                    learner.process_corpus(corpus, outdirpath)
                    print("Done")
                    learner.reset()
    else:
        learner.process_corpus(corpus_path, outdir)

    if learner._stats_flag:
        print("output files ..... ")
        write_acq_score_timestamp(learner, outdir)
        print("alignment", learner.alignment_method, corpus_path)

    write_learned_lex(learner, outdir)
    write_alignments(learner, outdir)


In [6]:
def write_alignments(learner, outdir):
    """
    Dump the recorded word-feature alignments of `learner` into `outdir`.

    The output file is called aligns_lm_{lambda}_a{alpha}_ep{epsilon}; the
    three values are read from learner._lambda, learner._alpha and
    learner._epsilon.

    One record, followed by a blank line, is emitted for every word-feature
    pair for which the learner returns a non-empty alignment mapping:
        {word}--{feature} [ ({time}, {alignment}), ({time}, {alignment}), ... ]
    Alignment values are printed with five decimal places.
    """
    target = "{}/aligns_lm_{}_a{}_ep{}".format(
        outdir, learner._lambda, learner._alpha, learner._epsilon
    )

    with open(target, 'w') as sink:
        for word in learner._wordsp.all_words(0):
            for feature in learner._features:
                history = learner._aligns.alignments(word, feature)
                if history:
                    # Every (time, value) entry keeps its trailing ", ", so the
                    # record ends in ",  ]" exactly as before.
                    entries = "".join(
                        f"({step}, {value:.5f}), " for step, value in history.items()
                    )
                    sink.write(f"{word}--{feature} [ {entries} ]\n\n")


In [7]:
def write_learned_lex(learner, outdir, min_prob=0.0001):
    """
    Write the learned lexicon of `learner` to a file in directory `outdir`.

    The file is named lex_lm_{lambda}_a{alpha}_ep{epsilon}, where lambda,
    alpha and epsilon are read from learner._lambda, learner._alpha and
    learner._epsilon.

    For each word two records are written, each followed by a blank line:
        {word}:{frequency} [{feature}:(true_prob, learned_prob), ...]
           << {acquisition_score} >>
    Probabilities are printed with five decimal places and the acquisition
    score with six.

    Parameters
    ----------
    learner : object
        Provides _wordsp, _gold_lexicon, _learned_lexicon and
        acquisition_score(word).
    outdir : str
        Existing directory that receives the file.
    min_prob : float, optional
        Gold-lexicon features whose true probability is not strictly greater
        than this threshold are omitted (default 0.0001).
    """
    lm = learner._lambda
    a = learner._alpha
    ep = learner._epsilon
    filename = os.path.join(outdir, f"lex_lm_{lm}_a{a}_ep{ep}")

    with open(filename, 'w') as output:
        for word in learner._wordsp.all_words(0):
            prob_feature_pairs = learner._gold_lexicon.meaning(word).sorted_features()
            learned_meaning = learner._learned_lexicon.meaning(word)
            freq = learner._wordsp.frequency(word)

            # Format everything with f-strings only: applying the % operator
            # to a string that already contains the feature name breaks as
            # soon as a feature contains a literal '%'.
            entries = "".join(
                f"{feature}:({true_prob:.5f}, {learned_meaning.prob(feature):.5f}), "
                for true_prob, feature in prob_feature_pairs
                if true_prob > min_prob
            )
            output.write(f"{word}:{freq} [{entries} ]\n\n")

            acq_score = learner.acquisition_score(word)
            output.write(f"   << {acq_score:.6f} >>\n\n")


In [8]:
def write_acq_score_timestamp(learner, outdir):
    '''
    added by Shanshan Huang
    Pickle the acquisition scores recorded at each time step
    (learner._acq_score_list) to {outdir}/acq_score_timestamp.pkl.
    Only that score record is written here, not the learner object itself.
    The pickle file is later used to plot the learning graphs used for
    analysis in the paper.
    '''
    target = f"{outdir}/acq_score_timestamp.pkl"
    with open(target, "wb") as sink:
        pickle.dump(learner._acq_score_list, sink)



In [9]:

# Run the whole pipeline when this cell (or the exported script) is executed
# directly; main() takes its options from sys.argv, which an earlier cell sets.
if __name__ == "__main__":
    main()

ww
number of Gold Features 8484
100
200
300
400
500
600
700
800
900
1000
output files ..... 
alignment 0 input_wn_fu_cs_scaled_categ.dev


In [10]:
# Print every line of the run's ".words1000" output file with the surrounding
# whitespace removed.
with open("output_dir/input_wn_fu_cs_scaled_categ.dev.words1000", "r") as words_file:
    for record in map(str.strip, words_file):
        print(record)

a:DET,125,9,0,439,39,10,992,
about:PREP,6,23,0,-1,-1,-1,707,
accident:N,1,842,42,-1,-1,-1,842,
again:ADV,3,491,13,-1,-1,-1,853,
all:DET,14,21,0,-1,-1,-1,852,
an:DET,6,356,9,-1,-1,-1,842,
and:CONJ,36,21,0,723,24,31,931,
angel:N,1,881,43,-1,-1,-1,881,
anna:N,2,891,46,-1,-1,-1,911,
another:DET,5,378,9,-1,-1,-1,777,
answer:V,1,164,1,-1,-1,-1,164,
ant:N,1,203,1,-1,-1,-1,203,
any:DET,3,440,11,-1,-1,-1,850,
apron:N,1,356,9,-1,-1,-1,356,
are:AUX,44,5,0,942,41,47,952,
argue:V,1,993,52,-1,-1,-1,993,
arm:N,2,558,19,-1,-1,-1,652,
as:PREP,2,260,1,-1,-1,-1,595,
at:PREP,7,40,0,-1,-1,-1,868,
awake:V,1,939,47,-1,-1,-1,939,
away:ADV,5,571,20,-1,-1,-1,775,
aye:OTH,1,774,38,-1,-1,-1,774,
baby:N,12,21,0,890,7,45,930,
back:ADV,5,228,1,-1,-1,-1,853,
ball:N,2,271,3,-1,-1,-1,272,
bang:N,2,287,5,-1,-1,-1,289,
be:V,291,1,0,417,119,9,1000,
bear:N,5,343,8,-1,-1,-1,438,
because:CONJ,3,20,0,-1,-1,-1,760,
bed:N,1,817,40,-1,-1,-1,817,
bee:N,3,167,1,-1,-1,-1,183,
beetle:N,1,837,42,-1,-1,-1,837,
big:ADJ,5,519,17,-1,-1,-

In [11]:
import pandas as pd

# Lift pandas' display limits so the frame below is rendered in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# The first CSV column is the row index that was saved with the frame; read
# naively it turns into a spurious "Unnamed: 0" data column, so restore it as
# the index instead.
time_props = pd.read_csv("output_dir/time_props_1000.csv", index_col=0)

display(time_props)




Unnamed: 0,time,heard,learned,nov_n_min1,nov_n_min2,all,lrn,n,v,oth
0,1,6,0,-1.0,-1.0,0.016788,0.0,0.017018,0.016127,0.01742
1,2,8,0,-1.0,-1.0,0.01888,0.0,0.017776,0.020623,0.017922
2,3,9,0,-1.0,-1.0,0.018395,0.0,0.017776,0.020623,0.016785
3,4,13,0,-1.0,-1.0,0.020063,0.0,0.019349,0.024364,0.017193
4,5,19,0,-1.0,-1.0,0.020391,0.0,0.019389,0.025475,0.017204
5,6,21,0,-1.0,-1.0,0.020545,0.0,0.020391,0.025475,0.017362
6,7,22,0,-1.0,-1.0,0.020773,0.0,0.020391,0.025475,0.01818
7,8,25,0,-1.0,-1.0,0.021523,0.0,0.02232,0.025633,0.0184
8,9,29,0,0.030849,0.030849,0.022366,0.0,0.024215,0.025308,0.019019
9,10,30,0,-1.0,-1.0,0.022232,0.0,0.024215,0.025308,0.018967


In [12]:
# Show the lexicon file written by write_learned_lex, one whitespace-stripped
# line per printed row.
with open("output_dir/lex_lm_-1.0_a20.0_ep0.01", "r") as lex_file:
    for raw_line in lex_file:
        stripped = raw_line.strip()
        print(stripped)


be:V:291 [stative#0:(0.55041, 0.26438), be#0:(0.44959, 0.19683),  ]

<< 0.903615 >>

not:NEG:65 [negative#0:(0.45721, 0.20433), conjunction#0:(0.42345, 0.18450), unary#0:(0.07956, 0.01219), not#0:(0.03978, 0.00913),  ]

<< 0.947039 >>

you:N:200 [definite#0:(0.48488, 0.23741), pronoun#0:(0.32326, 0.14475), 2nd_person#0:(0.15349, 0.02614), you#0:(0.03837, 0.00255),  ]

<< 0.900002 >>

that:N:141 [determiner#0:(0.43457, 0.22588), definite#0:(0.31605, 0.14970), singular#0:(0.17434, 0.06193), that#0:(0.07504, 0.01174),  ]

<< 0.869185 >>

watch:V:2 [perceive#0:(0.49776, 0.00104), in_reaction_to#0:(0.36173, 0.00057), search#0:(0.11261, 0.00010), watch:(0.02790, 0.00010),  ]

<< 0.111972 >>

joel:N:6 [person#0:(0.71709, 0.00954), joel#0:(0.28291, 0.00419),  ]

<< 0.665269 >>

the:DET:88 [determiner#0:(0.45746, 0.14643), definite#0:(0.33270, 0.08909), singular#0:(0.18352, 0.03209), the#0:(0.02633, 0.00264),  ]

<< 0.792282 >>

wed:V:1 [cause#0:(0.32493, 0.00012), Adv#0:(0.14781, 0.00012), pro

In [13]:
file_path = "output_dir/aligns_lm_-1.0_a20.0_ep0.01"

# Print only the "{word}--{feature} [...]" records; the blank separator lines
# contain no "--" and are therefore skipped.
with open(file_path, "r") as aligns_file:
    records = (raw.strip() for raw in aligns_file)
    for record in records:
        if "--" in record:
            print(record)



be:V--entity#1 [ (22, 0.05025), (75, 0.05185), (95, 0.05365), (100, 0.05619), (124, 0.05987), (130, 0.06237), (158, 0.06584), (185, 0.06788), (189, 0.07124), (190, 0.07363), (215, 0.07494), (259, 0.07947), (314, 0.07411), (329, 0.07316), (336, 0.07522), (360, 0.07958), (368, 0.07943), (369, 0.08120), (382, 0.08074), (395, 0.07945), (399, 0.08130), (470, 0.07750), (485, 0.06959), (517, 0.07501), (521, 0.07588), (522, 0.07598), (528, 0.08870), (531, 0.08711), (555, 0.08888), (557, 0.07744), (669, 0.06848), (672, 0.07757), (673, 0.07535), (683, 0.06549), (701, 0.07895), (705, 0.07324), (711, 0.07703), (776, 0.07373), (797, 0.06915), (811, 0.06952), (813, 0.07623), (854, 0.07188), (886, 0.06876), (891, 0.06490), (898, 0.06784), (930, 0.06749), (943, 0.06746), (947, 0.06531), (962, 0.06726), (987, 0.06839),  ]
be:V--slang#0 [ (528, 0.04745),  ]
be:V--2nd_person#0 [ (82, 0.04987), (152, 0.05146), (156, 0.05289), (158, 0.05536), (220, 0.05491), (319, 0.05395), (328, 0.05416), (407, 0.05407), 

In [14]:
import pickle

# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles produced by this notebook's own run.
with open("output_dir/acq_score_timestamp.pkl", "rb") as pkl_file:
    acq_scores = pickle.load(pkl_file)

# For every word, list its acquisition score at each recorded time step in
# ascending time order.
for word in acq_scores:
    print(f"\n Word: {word}")
    for t, score in sorted(acq_scores[word].items()):
        print(f"  Time {t}: {score:.4f}")




 Word: be:V
  Time 1: 0.0155
  Time 2: 0.0183
  Time 4: 0.0247
  Time 20: 0.0730
  Time 21: 0.0910
  Time 22: 0.1103
  Time 30: 0.1628
  Time 41: 0.2361
  Time 42: 0.2685
  Time 43: 0.3006
  Time 46: 0.3414
  Time 47: 0.3586
  Time 56: 0.4240
  Time 58: 0.4418
  Time 60: 0.4599
  Time 61: 0.4856
  Time 64: 0.5147
  Time 66: 0.5384
  Time 68: 0.5709
  Time 74: 0.5950
  Time 75: 0.6083
  Time 76: 0.6180
  Time 82: 0.6291
  Time 95: 0.6446
  Time 97: 0.6536
  Time 100: 0.6493
  Time 103: 0.6573
  Time 106: 0.6634
  Time 110: 0.6732
  Time 111: 0.6849
  Time 118: 0.7061
  Time 121: 0.7125
  Time 122: 0.7243
  Time 124: 0.7247
  Time 125: 0.7255
  Time 127: 0.7290
  Time 129: 0.7276
  Time 130: 0.7424
  Time 138: 0.7521
  Time 149: 0.7554
  Time 150: 0.7571
  Time 152: 0.7519
  Time 156: 0.7513
  Time 158: 0.7453
  Time 166: 0.7470
  Time 169: 0.7487
  Time 171: 0.7542
  Time 172: 0.7660
  Time 184: 0.7667
  Time 185: 0.7673
  Time 186: 0.7631
  Time 189: 0.7658
  Time 190: 0.7672
  Time 1