In [15]:
import re
import random

In [None]:
# Language model built from four CSV files (presumably one per n-gram order: uni/bi/tri/quad — TODO confirm).
# NOTE(review): LanguageModel is neither defined nor imported in the visible cells — confirm where it comes from.
lm = LanguageModel("/mnt/c/D/random/uni_en_rel.csv", "/mnt/c/D/random/bi_en_rel.csv", "/mnt/c/D/random/tri_en_rel.csv", "/mnt/c/D/random/quad_en_rel.csv")

In [12]:
class FrequencyWrap:
    """Look up a word's frequency in five word-frequency lists.

    Each list is loaded once into a ``{lemma: frequency}`` dict. The lists
    are, in feature order: CHILDES, SUBTLEX-US, EFLLex, BNC spoken and
    BNC written (named after the files they are read from).
    """

    def __init__(self, base_dir="/mnt/c/D/Documents/wordlist/en"):
        """Load all five frequency lists from ``base_dir``.

        Args:
            base_dir: Directory containing the five word-list files. Defaults
                to the location the notebook was originally run against.

        Raises:
            OSError: If any of the files cannot be opened.
            IndexError: If a non-blank row has fewer columns than required.
        """
        root = base_dir.rstrip("/") + "/"
        self.childes = self.__load__(root + "childes_en_lexfreq_pmw.csv", 0, 1, has_header=False, sep="\t")
        self.subtlex = self.__load__(root + "SUBTLexUS.txt", 0, 5, has_header=True, sep=";")
        self.efllex = self.__load__(root + "EFLLex_NLP4J", 0, 7, has_header=True, sep="\t")
        self.bnc_spoken = self.__load__(root + "2_2_spokenvwritten.txt", 1, 3, has_header=True, sep="\t")
        self.bnc_written = self.__load__(root + "all.num.o5", 1, 3, has_header=True, sep=" ")

    def __load__(self, fn, lemma_idx, freq_idx, has_header, sep):
        """Read one delimited word list into a ``{lemma: frequency}`` dict.

        Args:
            fn: Path of the UTF-8 encoded file to read.
            lemma_idx: Column index of the lemma.
            freq_idx: Column index of the frequency value.
            has_header: When true, the first line is skipped.
            sep: Column separator.

        Returns:
            Dict mapping each lemma (as spelled in the file) to its frequency
            as a float, or to ``0`` when the frequency field is not numeric.
            When a lemma occurs more than once, the last row wins.

        Raises:
            IndexError: If a non-blank row lacks ``lemma_idx`` or ``freq_idx``.
        """
        out = dict()
        with open(fn, "r", encoding="utf-8") as f:
            if has_header:
                f.readline()  # discard the header row
            for l in f:
                if not l.strip():
                    continue
                ps = l.rstrip().split(sep)
                lemma = ps[lemma_idx]
                try:
                    freq = float(ps[freq_idx])
                except ValueError:
                    # Non-numeric frequency field: treat the lemma as unseen.
                    freq = 0
                # Duplicate lemmas overwrite earlier rows.
                out[lemma] = freq
        return out

    def get_freqs(self, word):
        """Return the word's frequency in each of the five lists.

        The word is lower-cased before lookup; lemmas are stored exactly as
        they appear in the files, so entries with upper-case letters in the
        files are never matched.

        Args:
            word: The word to look up.

        Returns:
            A list of five values ordered childes, subtlex, efllex,
            bnc_spoken, bnc_written; ``0`` where the word is missing.
        """
        word = word.lower()
        sources = (self.childes, self.subtlex, self.efllex, self.bnc_spoken, self.bnc_written)
        return [source.get(word, 0) for source in sources]

In [8]:
class LanguageModelWrapper:
    """Adapter exposing a language model's n-gram probabilities via get_freqs()."""

    # Model methods queried by get_freqs, in output order.
    _SCORERS = (
        "get_unigram_prob",
        "get_bigram_prob",
        "get_trigram_prob",
        "get_quadgram_prob",
    )

    def __init__(self, lm):
        # The wrapped model must provide every method named in _SCORERS.
        self.lm = lm

    def get_freqs(self, word):
        """Return [unigram, bigram, trigram, quadgram] probabilities for ``word``.

        The word is wrapped in ``^`` and ``$`` before it is handed to the model.
        """
        padded = "".join(("^", word, "$"))
        return [getattr(self.lm, scorer)(padded) for scorer in self._SCORERS]

In [9]:
class ComboWrapper:
    """Concatenate the features of a frequency wrapper and a language-model wrapper."""

    def __init__(self, fw, lmw):
        """Store the two feature sources.

        Args:
            fw: Object whose ``get_freqs(word)`` returns a list of frequencies.
            lmw: Object whose ``get_freqs(word)`` returns a list of
                language-model probabilities.
        """
        self.fw = fw
        self.lmw = lmw

    def get_freqs(self, word):
        """Return the frequency features followed by the language-model features.

        The raw word is passed to both wrappers. LanguageModelWrapper already
        adds the ``^``/``$`` boundary markers itself, so padding here as well
        made the model score ``^^word$$``.
        NOTE(review): data generated before this fix used the double-padded
        form; regenerate the training file and refit the models.
        """
        fwprob = self.fw.get_freqs(word)
        lmprob = self.lmw.get_freqs(word)
        return fwprob + lmprob

In [None]:
# Input read by create_training_instances: tab-separated lines whose first field holds the <head>-marked word
# and whose remaining fields are the candidates.
semeval = "/mnt/c/D/random/semeval2012.csv"

In [None]:
# Loads all five frequency lists from disk (see FrequencyWrap.__init__).
fw = FrequencyWrap()

In [10]:
# Exposes the language model's uni- to quad-gram probabilities through get_freqs().
lmw = LanguageModelWrapper(lm)

In [51]:
# Combined feature source: frequency-list values followed by language-model probabilities.
cowo = ComboWrapper(fw, lmw)

In [18]:
# Create training instances
def create_training_instances(wrapper, name, seed=None):
    """Write pairwise training rows (head word vs. each candidate) to a TSV file.

    Reads the file named by the module-level ``semeval`` path. Each non-blank
    line is tab-separated: the first field contains the head word between
    ``<head>`` tags and the remaining fields are candidates. For every
    candidate that differs from the head word, one row is written to
    ``/mnt/c/D/random/semeval_training_pw_<name>.csv``:

    * ``<head features> <candidate features> 1``, or
    * ``<candidate features> <head features> 0``,

    with the order chosen at random. Pairs whose features all sum to zero on
    both sides are skipped, as are lines without a ``<head>`` marker.

    Args:
        wrapper: Object whose ``get_freqs(word)`` returns a list of numbers.
        name: Suffix used in the output file name.
        seed: Optional seed for the random ordering. ``None`` (the default)
            uses the shared ``random`` module state, as before.
    """
    rng = random.Random(seed) if seed is not None else random
    # Accept "<head>word</head>" as well as the "<head>word<head>" form that the
    # original pattern required; the closing-tag form could never match before.
    head_pattern = re.compile(r"<head>([^<]+?)</?head>")
    out_path = "/mnt/c/D/random/semeval_training_pw_{}.csv".format(name)
    # Both handles are context-managed so the output is closed on errors too.
    with open(semeval, "r", encoding="utf-8") as f, \
            open(out_path, "w", encoding="utf-8") as out:
        for l in f:
            if not l.strip():
                continue
            ps = l.rstrip().split("\t")
            m = head_pattern.search(ps[0])
            if m is None:
                continue
            head = m.group(1)
            head_freq = wrapper.get_freqs(head)
            # The head's serialised features do not change per candidate.
            hfs = "\t".join(str(x) for x in head_freq)
            for c in ps[1:]:
                if c == head:
                    continue
                c_freq = wrapper.get_freqs(c)
                if sum(c_freq) == sum(head_freq) == 0:
                    # Neither word is known to any feature source.
                    continue
                cfs = "\t".join(str(x) for x in c_freq)
                # Randomise which word comes first so both labels occur.
                if rng.random() > 0.5:
                    out.write("{}\t{}\t{}\n".format(cfs, hfs, 0))
                else:
                    out.write("{}\t{}\t{}\n".format(hfs, cfs, 1))

In [20]:
# Writes /mnt/c/D/random/semeval_training_pw_cowo.csv using the combined features.
create_training_instances(cowo, "cowo")

In [21]:
from sklearn.ensemble import ExtraTreesClassifier

In [60]:
from sklearn.neural_network import MLPClassifier

In [61]:
# MLP performed best on combined frequencies: accuracy@2 100%
# NOTE(review): no random_state is set here (unlike the ExtraTreesClassifier), so scores vary between runs.
mlp = MLPClassifier()

In [66]:
# Extra-trees baseline with 2500 trees; random_state is fixed for repeatable results.
et = ExtraTreesClassifier(n_estimators=2500, random_state=0)

In [23]:
def load_data(fn):
    """Read a tab-separated training file into feature rows and labels.

    Blank lines are ignored. On every other line the last field is parsed as
    the integer label and all preceding fields as float features.

    Returns:
        A ``(datas, labels)`` tuple of two parallel lists.
    """
    datas = []
    labels = []
    with open(fn) as handle:
        for raw in handle:
            if raw.strip() == "":
                continue
            *fields, target = raw.rstrip().split("\t")
            row = list(map(float, fields))
            label = int(target)
            datas.append(row)
            labels.append(label)
    return datas, labels

In [39]:
# NOTE(review): this reads ..._pw_cw.csv, while the visible create_training_instances(cowo, "cowo") call
# writes ..._pw_cowo.csv — confirm the intended file before re-running.
dat, lab = load_data("/mnt/c/D/random/semeval_training_pw_cw.csv")

In [30]:
from sklearn.preprocessing import StandardScaler

In [40]:
# Module-level scaler; piped_line reads this global to transform features at prediction time.
ss = StandardScaler()

In [43]:
# Fit the scaler and scale every row in one step.
# NOTE(review): the scaler is fitted on all rows, including dat_scal[6000:], which is later used for scoring —
# consider fitting on the training slice only.
dat_scal = ss.fit_transform(dat)

In [63]:
# Train on the first 6000 rows; the remaining rows are held out for scoring.
mlp.fit(dat_scal[:6000], lab[:6000])



In [64]:
# Mean accuracy on the held-out rows (everything after the first 6000).
mlp.score(dat_scal[6000:], lab[6000:])

0.621283255086072

In [44]:
# Same train split as the MLP: first 6000 rows.
et.fit(dat_scal[:6000], lab[:6000])

In [45]:
# Mean accuracy on the same held-out rows used for the MLP.
et.score(dat_scal[6000:], lab[6000:])

0.6828377673448096

In [37]:
from itertools import combinations

In [58]:
def piped_line(expressions, wrapper, predictor, scaler=None):
    """Rank expressions by the number of pairwise comparisons they win.

    Every unordered pair of distinct expressions is turned into one feature
    row (first item's features followed by the second's), scaled, and
    classified. Label ``0`` credits the first item of the pair; any other
    label credits the second. Expressions are returned from most to fewest
    wins; ties keep their input order.

    Previously only the first item of a pair could earn a point and
    expressions without points were dropped, so the last expression could
    never appear in the result.

    Args:
        expressions: Iterable of candidate strings; duplicates are collapsed.
        wrapper: Object whose ``get_freqs(expression)`` returns a list of numbers.
        predictor: Fitted classifier whose ``predict(rows)`` returns one label per row.
        scaler: Object whose ``transform(rows)`` scales feature rows. Defaults
            to the module-level ``ss`` scaler.

    Returns:
        A list containing every distinct expression, best-ranked first.
    """
    if scaler is None:
        scaler = ss  # module-level scaler fitted on the training data
    freqs = {ex: wrapper.get_freqs(ex) for ex in expressions}
    # Start everyone at zero so expressions without wins are still ranked.
    wins = {ex: 0 for ex in freqs}
    for (first, first_freqs), (second, second_freqs) in combinations(freqs.items(), 2):
        features = scaler.transform([first_freqs + second_freqs])
        label = predictor.predict(features)[0]
        winner = first if label == 0 else second
        wins[winner] += 1
    ranked = sorted(wins.items(), key=lambda item: item[1], reverse=True)
    return [ex for ex, _ in ranked]

In [65]:
# Rank the candidates of every TSAR trial line with the MLP and write
# "sentence <TAB> complex word <TAB> ranked candidates..." rows.
# Both files are context-managed (the output is closed even if ranking fails)
# and opened as UTF-8, matching the other file handling in this notebook.
with open("/mnt/c/D/random/tsar2022_en_trial_gold.tsv", encoding="utf-8") as f, \
        open("/mnt/c/D/random/tsar_ranking_mlp_en.csv", "w", encoding="utf-8") as out:
    for l in f:
        if not l.strip():
            continue
        # First two fields are the sentence and the complex word; the rest are candidates.
        sentence, complex_word, *candidates = l.rstrip().split("\t")
        sorted_candidates = piped_line(candidates, cowo, mlp)
        out.write("{}\t{}\t{}\n".format(sentence, complex_word, "\t".join(sorted_candidates)))

mandatory 0
mandatory 0
mandatory 0
mandatory 0
mandatory 1
mandatory 0
mandatory 1
mandatory 1
required 1
required 1
required 0
required 1
required 0
required 1
required 1
essential 0
essential 0
essential 1
essential 0
essential 1
essential 1
forced 0
forced 1
forced 0
forced 1
forced 1
important 1
important 0
important 1
important 1
manadatory 0
manadatory 1
manadatory 1
necessary 1
necessary 1
obligatory 1
infused 0
infused 0
infused 0
infused 1
infused 1
infused 1
infused 1
infused 0
infused 0
infused 1
infused 1
infused 1
infused 0
infused 0
infused 1
infused 1
introduced 0
introduced 1
introduced 1
introduced 1
introduced 1
introduced 1
introduced 1
introduced 0
introduced 1
introduced 1
introduced 1
introduced 1
introduced 0
introduced 1
introduced 1
filled 1
filled 1
filled 1
filled 1
filled 1
filled 1
filled 0
filled 1
filled 1
filled 1
filled 1
filled 0
filled 1
filled 1
impressed 1
impressed 1
impressed 1
impressed 1
impressed 0
impressed 0
impressed 1
impressed 1
impressed