In [3]:
import re
import random
from language_model import LanguageModel
# Build the project-local LanguageModel from four CSV tables. Judging by the file
# names these hold uni-, bi-, tri- and quad-gram statistics -- TODO confirm against
# the constructor in language_model.py.
lm = LanguageModel("/mnt/c/D/random/uni_es_rel.csv", "/mnt/c/D/random/bi_es_rel.csv", "/mnt/c/D/random/tri_es_rel.csv", "/mnt/c/D/random/quad_es_rel.csv")

In [4]:
class FrequencyWrap:
    """Look up a word in seven frequency lists loaded from disk.

    Every list is read once in ``__init__`` into a ``dict`` that maps the
    content of the list's lemma column to a float frequency.  ``get_freqs``
    returns one value per list, in a fixed order.
    """

    def __init__(self):
        """Load all seven lists from their hard-coded locations."""
        childes = "/mnt/c/D/Documents/wordlist/es/childes_es_lexfreq_pmw.csv"
        subtlex = "/mnt/c/D/Documents/wordlist/es/subtlex-esp"
        efllex = "/mnt/c/D/Documents/wordlist/es/ELELex_Freeling"
        # NOTE(review): the two "bnc_*" attributes are fed from corlec_spoken.csv
        # and chile.csv; the attribute names are kept because callers may use them.
        bnc_spoken = "/mnt/c/D/Documents/wordlist/es/corlec_spoken.csv"
        bnc_written = "/mnt/c/D/Documents/wordlist/es/chile.csv"
        crea = "/mnt/c/D/Documents/wordlist/es/crea.txt"
        lexin = "/mnt/c/D/Documents/wordlist/es/lexin.txt"
        self.childes = self.__load__(childes, 0, 1, has_header=False, sep="\t")
        self.subtlex = self.__load__(subtlex, 0, 2, has_header=True, sep="\t")
        self.efllex = self.__load__(efllex, 0, 7, has_header=True, sep="\t")
        self.bnc_spoken = self.__load__(bnc_spoken, 0, 1, has_header=False, sep="\t")
        self.bnc_written = self.__load__(bnc_written, 0, 1, has_header=False, sep="\t")
        self.crea = self.__load__(crea, 0, 1, has_header=True, sep="\t")
        self.lexin = self.__load__(lexin, 0, 1, has_header=True, sep="\t")

    def __load__(self, fn, lemma_idx, freq_idx, has_header, sep):
        """Read one delimited frequency list into a ``{lemma: frequency}`` dict.

        Args:
            fn: Path of the UTF-8 encoded list.
            lemma_idx: Column index of the lemma.
            freq_idx: Column index of the frequency.
            has_header: If true, the first line is discarded.
            sep: Column separator.

        Returns:
            dict mapping lemma to ``float`` frequency.  A frequency that cannot
            be parsed as a number is stored as ``0``.  When a lemma occurs more
            than once, the last occurrence wins.

        Blank lines and rows that are too short to contain both requested
        columns are skipped instead of aborting the whole load.
        """
        out = dict()
        with open(fn, "r", encoding="utf-8") as f:
            if has_header:
                f.readline()  # discard the header row
            for l in f:
                if not l.strip():
                    continue
                ps = l.rstrip().split(sep)
                try:
                    lemma, raw_freq = ps[lemma_idx], ps[freq_idx]
                except IndexError:
                    # Truncated / malformed row: ignore it rather than crash.
                    continue
                try:
                    freq = float(raw_freq)
                except ValueError:
                    # Non-numeric frequency cell: treat as "no frequency".
                    freq = 0
                # Duplicate lemmas: the later row overwrites the earlier one.
                out[lemma] = freq
        return out

    def get_freqs(self, word):
        """Return the frequencies of ``word`` in all seven lists.

        The word is lower-cased before lookup (the keys themselves are stored
        exactly as they appear in the files).  The result is a list of seven
        values in the order childes, subtlex, efllex, bnc_spoken, bnc_written,
        crea, lexin; a list that does not contain the word contributes ``0``.
        """
        word = word.lower()
        sources = (
            self.childes,
            self.subtlex,
            self.efllex,
            self.bnc_spoken,
            self.bnc_written,
            self.crea,
            self.lexin,
        )
        return [source.get(word, 0) for source in sources]

In [5]:
class LanguageModelWrapper:
    """Adapter giving a language model the same ``get_freqs`` interface as the
    frequency-list wrapper."""

    def __init__(self, lm):
        # The wrapped model must offer get_{uni,bi,tri,quad}gram_prob(str).
        self.lm = lm

    def get_freqs(self, word):
        """Return ``[unigram, bigram, trigram, quadgram]`` scores for ``word``.

        The word is surrounded by ``^`` and ``$`` before it is handed to the
        model; the four model methods are queried in that order.
        """
        padded = "".join(("^", word, "$"))
        method_names = (
            "get_unigram_prob",
            "get_bigram_prob",
            "get_trigram_prob",
            "get_quadgram_prob",
        )
        return [getattr(self.lm, method)(padded) for method in method_names]

In [6]:
class ComboWrapper:
    """Concatenate the features of a frequency wrapper and a language-model wrapper."""

    def __init__(self, fw, lmw):
        """Store the two feature sources.

        Args:
            fw: Object whose ``get_freqs(word)`` returns a list of features.
            lmw: Object whose ``get_freqs(word)`` returns a list of features.
        """
        self.fw = fw
        self.lmw = lmw

    def get_freqs(self, word):
        """Return ``fw`` features followed by ``lmw`` features for ``word``.

        The raw word is passed to both wrappers.  ``LanguageModelWrapper``
        adds the ``^``/``$`` boundary markers itself, so adding them here as
        well made the language model see ``^^word$$``.
        """
        fwprob = self.fw.get_freqs(word)
        lmprob = self.lmw.get_freqs(word)
        return fwprob + lmprob

In [7]:
# Wrap the loaded language model so that it exposes get_freqs(word).
lmw = LanguageModelWrapper(lm)

In [8]:
# Load the frequency lists (the constructor reads seven files from disk).
fw = FrequencyWrap()

In [9]:
# Combined feature source: frequency-list features followed by language-model features.
cw = ComboWrapper(fw, lmw)

In [10]:
# Tab-separated input read by create_training_instances: the first column is
# searched for a <head>...<head> marked word, the remaining columns are candidates.
semeval = "/mnt/c/D/random/2010_es.csv"

In [75]:
# Create training instances
def create_training_instances(wrapper, name):
    """Write pairwise training rows for the head word and each of its candidates.

    Reads the tab-separated file at the module-level path ``semeval``.  For
    every line whose first column contains a ``<head>...<head>`` marked word,
    each candidate (remaining columns) different from the head yields one row
    in ``/mnt/c/D/random/semeval_training_es_<name>.csv``:

    * ``head features, candidate features, 1`` or
    * ``candidate features, head features, 0``

    The orientation is chosen at random per pair (via the global ``random``
    state).  Pairs in which both feature vectors sum to zero are skipped.

    Args:
        wrapper: Object whose ``get_freqs(word)`` returns a list of numbers.
        name: Suffix used in the output file name.
    """
    out_path = "/mnt/c/D/random/semeval_training_es_{}.csv".format(name)
    # Both handles live in one `with` so the output is flushed and closed even
    # when a line fails to process.  The output is opened first, as before.
    with open(out_path, "w", encoding="utf-8") as out, \
            open(semeval, "r", encoding="utf-8") as f:
        for l in f:
            if not l.strip():
                continue
            ps = l.rstrip().split("\t")
            candidates = ps[1:]
            m = re.search(r"<head>([^<]+?)<head>", ps[0])
            if m is None:
                continue
            head = m.group(1)
            head_freq = wrapper.get_freqs(head)
            # Loop-invariant per line: computed once instead of per candidate.
            head_is_zero = sum(head_freq) == 0
            hfs = "\t".join([str(x) for x in head_freq])
            for c in candidates:
                if c == head:
                    continue
                c_freq = wrapper.get_freqs(c)
                if head_is_zero and sum(c_freq) == 0:
                    # Neither word is known to any feature source.
                    continue
                reverse = random.random() > 0.5
                cfs = "\t".join([str(x) for x in c_freq])
                if reverse:
                    out.write("{}\t{}\t{}\n".format(cfs, hfs, 0))
                else:
                    out.write("{}\t{}\t{}\n".format(hfs, cfs, 1))

In [76]:
# Generate the training file with the combined features
# (output name: semeval_training_es_cw.csv).
create_training_instances(cw, "cw")

In [111]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

In [112]:
from sklearn.neural_network import MLPClassifier

In [39]:
# Multi-layer perceptron candidate model; it is not fitted in any cell visible here.
mlp = MLPClassifier(max_iter=1000)

In [122]:
# Extra-trees ensemble with a fixed seed; this is the model that gets fitted and scored.
et = ExtraTreesClassifier(n_estimators=50, random_state=0)

In [37]:
# Random-forest candidate model with a fixed seed; it is not fitted in any cell visible here.
rf = RandomForestClassifier(n_estimators=2500, random_state=0)

In [12]:
def load_data(fn):
    """Parse a tab-separated training file into features and labels.

    Every non-blank line is split on tabs; the final field becomes an ``int``
    label and all preceding fields become a list of ``float`` features.

    Returns:
        ``(datas, labels)`` -- two parallel lists.
    """
    datas = []
    labels = []
    with open(fn) as handle:
        for raw in handle:
            if raw.strip() == "":
                continue
            *feature_fields, label_field = raw.rstrip().split("\t")
            features = list(map(float, feature_fields))
            label = int(label_field)
            datas.append(features)
            labels.append(label)
    return datas, labels

In [126]:
# Load features and labels for training.
# NOTE(review): this reads the "lmw" training file, whereas the generation cell
# visible in this notebook writes the "cw" file -- confirm the "lmw" file exists
# and is the intended input.
dat, lab = load_data("/mnt/c/D/random/semeval_training_es_lmw.csv")

In [78]:
from sklearn.preprocessing import StandardScaler

In [127]:
# Module-level scaler; piped_line also uses it to scale features at prediction time.
ss = StandardScaler()

In [128]:
# Fit the scaler on the training portion only (everything except the last 500
# rows, which are held out as the test set) so that test-set statistics do not
# leak into the features; then scale the complete data set with it.
ss.fit(dat[:-500])
dat_scal = ss.transform(dat)

In [129]:
# Hold out the last 500 scaled rows and their labels as the test set.
test_dat = dat_scal[-500:]
test_lab = lab[-500:]

In [130]:
# Everything before the last 500 rows is used for training.
train_dat = dat_scal[:-500]
train_lab = lab[:-500]

In [123]:
# Train the extra-trees model on the training split.
# NOTE(review): this cell's execution count (123) is lower than those of the
# data-loading/splitting cells above (126-130), so the recorded run may have
# used an earlier train_dat -- re-run the notebook top to bottom to confirm.
et.fit(train_dat, train_lab)

In [124]:
# Evaluate on the held-out rows (for scikit-learn classifiers, .score() is mean accuracy).
et.score(test_dat, test_lab)

0.742

In [142]:
def piped_line(expressions, wrapper, predictor, negative_sort=False, dmatrix_transform=False, verbose=True):
    """Rank ``expressions`` by pairwise predictions.

    Features of every unordered pair of distinct expressions are concatenated
    (first + second), scaled with the module-level ``ss`` and passed to
    ``predictor.predict``.  A prediction that rounds to ``0`` adds one point to
    the *first* expression of the pair; with ``negative_sort`` any other
    prediction subtracts one point from it.  Expressions are returned sorted by
    descending score; ties keep their input order.

    Every expression starts with a score of 0, so all of them appear in the
    result.  (Previously an expression was only returned once it had scored a
    point as first element, and with ``negative_sort`` its losses were only
    counted after its first win.)

    NOTE(review): the second expression of a pair is never credited or
    penalised; confirm this asymmetry is intended.

    Args:
        expressions: Iterable of candidate strings; duplicates are collapsed.
        wrapper: Object whose ``get_freqs(expr)`` returns a list of features.
        predictor: Object whose ``predict(rows)`` returns an indexable result
            whose first element is the prediction for the single row.
        negative_sort: Subtract a point from the first expression on a
            non-zero prediction.
        dmatrix_transform: Wrap the scaled row in ``xgb.DMatrix`` (requires the
            module-level names ``xgb`` and ``np``).
        verbose: Print ``first_expression prediction`` for every pair.

    Returns:
        list of expressions, best first.
    """
    # Local import: the notebook imports `combinations` in a later cell, so the
    # function must not rely on that cell having been executed.
    from itertools import combinations

    freqs = dict()
    for ex in expressions:
        freqs[ex] = wrapper.get_freqs(ex)

    # Start every candidate at zero so none is dropped from the ranking.
    c = {ex: 0 for ex in freqs}
    for (k1, feats1), (_k2, feats2) in combinations(freqs.items(), 2):
        arr_scal = ss.transform([feats1 + feats2])
        if dmatrix_transform:
            arr_scal = xgb.DMatrix(np.array(arr_scal))
        pred = predictor.predict(arr_scal)
        if verbose:
            print(k1, pred[0])
        if round(pred[0]) == 0:
            c[k1] += 1
        elif negative_sort:
            c[k1] -= 1

    # sorted() is stable, so equal scores keep the input order.
    sort = sorted(c.items(), key=lambda x: x[1], reverse=True)
    exps = [x[0] for x in sort]
    return exps

In [34]:
from itertools import combinations

In [156]:
class RandomPredictor:
    """Baseline predictor that ignores its input and flips a fair coin."""

    def predict(self, arr):
        """Return ``[1]`` when a fresh ``random.random()`` draw exceeds 0.5, else ``[0]``."""
        coin = random.random() > 0.5
        return [int(coin)]

In [157]:
# Random baseline that can be passed wherever a predictor with .predict() is expected.
rp = RandomPredictor()

In [38]:
def run(wrapper, predictor, n1, n2, shuffle=False, negative_sort=False, dmatrix_transform=False):
    """Rank the candidates of every line of the TSAR trial file and write the result.

    Each non-blank input line is split on tabs into sentence, complex word and
    candidates.  The candidates are ranked with ``piped_line`` and written as
    ``sentence<TAB>complex_word<TAB>ranked candidates...`` to
    ``/mnt/c/D/random/tsar_ranking_es_<n1>_<n2>_<shuffle>_<ns>.csv``.

    Args:
        wrapper: Feature source passed through to ``piped_line``.
        predictor: Predictor passed through to ``piped_line``.
        n1, n2: Labels used only in the output file name.
        shuffle: Shuffle the candidates (global ``random`` state) before ranking.
        negative_sort: Passed through to ``piped_line``.
        dmatrix_transform: Passed through to ``piped_line``.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated fields.
    """
    out_path = "/mnt/c/D/random/tsar_ranking_es_{}_{}_{}_{}.csv".format(
        n1, n2, ("shuffle" if shuffle else ""), ("ns" if negative_sort else ""))
    # Explicit UTF-8: the data contains accented Spanish words, and the result
    # must not depend on the platform's default encoding.  The `with` block
    # guarantees the output is closed even if ranking a line fails.
    with open(out_path, "w", encoding="utf-8") as out, \
            open("/mnt/c/D/random/tsar2022_es_trial_gold.tsv", encoding="utf-8") as f:
        for l in f:
            if not l.strip():
                continue
            sentence, complex_word, *candidates = l.rstrip().split("\t")
            if shuffle:
                random.shuffle(candidates)
            sorted_candidates = piped_line(candidates, wrapper, predictor, negative_sort, dmatrix_transform)
            out.write("{}\t{}\t{}\n".format(sentence, complex_word, "\t".join(sorted_candidates)))

In [139]:
# Rank the trial set with shuffling, negative sort and DMatrix conversion enabled.
# NOTE(review): `bst` (and the `xgb` / `np` names piped_line needs for the DMatrix
# path) are not defined in any cell visible here -- this relies on kernel state.
# NOTE(review): the output label is "cw" although the wrapper passed is `lmw`;
# confirm which feature set was intended.
run(lmw, bst, "cw", "bst-x", True, True, True)

pintoresco 0.0
pintoresco 1.0
pintoresco 1.0
pintoresco 1.0
pintoresco 1.0
pintoresco 1.0
pintoresco 1.0
pintoresco 1.0
local 1.0
local 1.0
local 1.0
local 1.0
local 1.0
local 1.0
local 1.0
costumbrista 1.0
costumbrista 1.0
costumbrista 1.0
costumbrista 1.0
costumbrista 1.0
costumbrista 1.0
de folclore 1.0
de folclore 1.0
de folclore 1.0
de folclore 1.0
de folclore 1.0
tradicional 1.0
tradicional 1.0
tradicional 1.0
tradicional 1.0
de folclor 1.0
de folclor 1.0
de folclor 1.0
típico 1.0
típico 1.0
popular 1.0
fallecido 1.0
fallecido 1.0
fallecido 1.0
fallecido 1.0
fallecido 1.0
fallecido 1.0
fallecido 1.0
acabado 1.0
acabado 1.0
acabado 1.0
acabado 1.0
acabado 1.0
acabado 1.0
extinto 1.0
extinto 1.0
extinto 1.0
extinto 1.0
extinto 1.0
desaparecido 1.0
desaparecido 1.0
desaparecido 1.0
desaparecido 1.0
muerto 0.0
muerto 0.0
muerto 0.0
finado 1.0
finado 1.0
inactivo 1.0
recurrir 1.0
recurrir 1.0
recurrir 1.0
recurrir 1.0
recurrir 1.0
recurrir 1.0
recurrir 1.0
recurrir 1.0
recurrir 1.0
re

lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
lograr 1.0
adueñarse 1.0
adueñarse 0.0
adueñarse 1.0
adueñarse 1.0
adueñarse 1.0
adueñarse 1.0
adueñarse 0.0
adueñarse 1.0
adueñarse 1.0
adueñarse 1.0
adueñarse 1.0
apoderarse 0.0
apoderarse 1.0
apoderarse 1.0
apoderarse 1.0
apoderarse 1.0
apoderarse 0.0
apoderarse 1.0
apoderarse 1.0
apoderarse 1.0
apoderarse 1.0
ganar 1.0
ganar 1.0
ganar 1.0
ganar 1.0
ganar 1.0
ganar 1.0
ganar 1.0
ganar 1.0
ganar 1.0
dominar 1.0
dominar 1.0
dominar 1.0
dominar 1.0
dominar 1.0
dominar 1.0
dominar 1.0
dominar 1.0
invadir 1.0
invadir 1.0
invadir 0.0
invadir 1.0
invadir 1.0
invadir 1.0
invadir 1.0
ocupar 1.0
ocupar 1.0
ocupar 1.0
ocupar 1.0
ocupar 1.0
ocupar 1.0
vencer 1.0
vencer 0.0
vencer 0.0
vencer 1.0
vencer 0.0
tomar 1.0
tomar 1.0
tomar 1.0
tomar 1.0
colonizar 1.0
colonizar 1.0
colonizar 1.0
conquistar 1.0
conquistar 1.0
obtener 1.0
