In [1]:
from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel
import spacy
import os
import random
import codecs
from time import time
from collections import defaultdict
import numpy as np

In [22]:
# Notebook-wide tally of word -> occurrence count; readWords() increments it.
# Missing words start at 0 because of defaultdict(int).
wordset = defaultdict(int)

In [40]:
def readWords(filename, counts=None):
    """Tally the words found in a tab-separated text file.

    Every line is split on tabs; for each field the text before the first
    "-" is taken as the word and its tally is incremented by one.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    counts : mapping, optional
        Mapping of word -> count that is updated in place.  When omitted the
        notebook-level ``wordset`` is used.

    Returns
    -------
    None
        Results are accumulated in ``counts``; calling the function twice on
        the same file therefore adds the tallies twice.
    """
    if counts is None:
        counts = wordset
    with open(filename, "r") as file:
        for line in file:
            # Strip the line terminator first: otherwise a blank line is
            # tallied as "\n" and a last field without "-" keeps its "\n".
            for field in line.rstrip("\r\n").split("\t"):
                word = field.split("-")[0]
                if not word:
                    # Blank line, doubled tab or a field starting with "-".
                    continue
                counts[word] = counts.get(word, 0) + 1

# Tally the words of both example files into the shared `wordset`.
# NOTE(review): `wordset` is created in an earlier cell, so re-running this
# cell on its own adds every count a second time.
readWords("eacl2012-data/positive-examples.txtinput")
readWords("eacl2012-data/negative-examples.txtinput")

In [59]:
from collections import Counter
# Copy the tallies into a Counter so that most_common() can be used on them.
c = Counter(wordset)

In [77]:
# Number of distinct words that were tallied.
len(c)

1478

In [193]:
# The 150 highest-count entries as (word, count) pairs, most frequent first.
# NOTE(review): 150 is a hard-coded vocabulary cut-off; consider a named constant.
sett = c.most_common(150)

In [194]:
# word -> count lookup for the retained vocabulary; `sett` already holds
# (word, count) pairs, so it converts directly.
mostfreq = dict(sett)

In [195]:
# Display the retained vocabulary (word -> count).
mostfreq

{'animal': 230,
 'vertebrate': 152,
 'vehicle': 140,
 'feeling': 122,
 'science': 94,
 'worker': 86,
 'mammal': 86,
 'performer': 78,
 'discipline': 76,
 'organization': 72,
 'building': 72,
 'bird': 68,
 'food': 66,
 'container': 62,
 'entertainer': 58,
 'disease': 58,
 'illness': 52,
 'beverage': 52,
 'instrument': 50,
 'relative': 50,
 'emotion': 46,
 'solid': 46,
 'sport': 46,
 'fluid': 44,
 'phenomenon': 44,
 'insect': 42,
 'invertebrate': 42,
 'liquid': 40,
 'commodity': 40,
 'leader': 40,
 'housing': 40,
 'adult': 38,
 'chemical': 36,
 'equipment': 36,
 'trait': 36,
 'integer': 36,
 'athlete': 34,
 'alcohol': 32,
 'tree': 32,
 'drug': 32,
 'professional': 32,
 'molecule': 32,
 'tumor': 32,
 'document': 30,
 'game': 30,
 'clothing': 30,
 'house': 30,
 'information': 30,
 'statement': 28,
 'organ': 28,
 'road': 28,
 'garment': 28,
 'symbol': 26,
 'reptile': 26,
 'machine': 26,
 'herb': 26,
 'dwelling': 26,
 'fruit': 26,
 'word': 24,
 'furniture': 24,
 'payment': 24,
 'music': 24,


In [196]:
# Accumulators for the rows kept from the positive / negative example files.
tdataset, fdataset = [], []
def readEntailment(filename, dataset, vocab=None):
    """Append the rows of a tab-separated file whose words are all in ``vocab``.

    Every line is split on tabs and each field is reduced to the text before
    its first "-".  A row is appended to ``dataset`` only when every one of
    its words is contained in ``vocab``.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    dataset : list
        List that receives the accepted rows (each row is a list of words);
        it is extended in place.
    vocab : container, optional
        Words that are allowed to appear in a row.  When omitted the
        notebook-level ``mostfreq`` is used.

    Returns
    -------
    None
    """
    if vocab is None:
        vocab = mostfreq
    with open(filename, "r") as file:
        for line in file:
            # Strip the line terminator first: otherwise a last field without
            # "-" keeps its "\n" and can never match the vocabulary.
            stripped = line.rstrip("\r\n")
            if not stripped:
                continue  # blank line, nothing to test
            row = [field.split("-")[0] for field in stripped.split("\t")]
            # all() stops at the first out-of-vocabulary word.
            if all(word in vocab for word in row):
                dataset.append(row)

# Collect the in-vocabulary rows of each file into its own list:
# positive examples -> tdataset, negative examples -> fdataset.
readEntailment("eacl2012-data/positive-examples.txtinput", tdataset)
readEntailment("eacl2012-data/negative-examples.txtinput", fdataset)

In [197]:
# Tag every row from the positive file with label 1.
positives = [[*pair, 1] for pair in tdataset]

In [198]:
# Tag every row from the negative file with label 0.
negatives = [[*pair, 0] for pair in fdataset]

In [199]:
# Inspect the labelled positive rows ([word, word, 1]).
positives

[['aircraft', 'vehicle', 1],
 ['airplane', 'vehicle', 1],
 ['alcohol', 'fluid', 1],
 ['algebra', 'science', 1],
 ['asp', 'reptile', 1],
 ['asp', 'snake', 1],
 ['asp', 'vertebrate', 1],
 ['bear', 'carnivore', 1],
 ['bear', 'mammal', 1],
 ['bear', 'vertebrate', 1],
 ['beer', 'beverage', 1],
 ['beer', 'liquid', 1],
 ['beverage', 'fluid', 1],
 ['beverage', 'liquid', 1],
 ['biochemistry', 'science', 1],
 ['biplane', 'aircraft', 1],
 ['biplane', 'airplane', 1],
 ['biplane', 'vehicle', 1],
 ['bird', 'animal', 1],
 ['castle', 'building', 1],
 ['castle', 'dwelling', 1],
 ['castle', 'house', 1],
 ['castle', 'housing', 1],
 ['chick', 'animal', 1],
 ['chick', 'bird', 1],
 ['chick', 'vertebrate', 1],
 ['collagen', 'molecule', 1],
 ['collagen', 'protein', 1],
 ['competitiveness', 'trait', 1],
 ['dog', 'carnivore', 1],
 ['dog', 'mammal', 1],
 ['dwelling', 'housing', 1],
 ['eagle', 'animal', 1],
 ['eagle', 'bird', 1],
 ['eagle', 'vertebrate', 1],
 ['employee', 'worker', 1],
 ['fish', 'animal', 1],
 ['

In [200]:
# Down-sample the negatives to the size of the positive class so that both
# classes are balanced.  A dedicated generator with a fixed seed makes the
# draw repeatable after a kernel restart and leaves the global `random`
# state untouched.  Raises ValueError if there are fewer negatives than
# positives.
negatives = random.Random(0).sample(negatives, len(positives))

In [201]:
# Sanity check: after down-sampling both classes should have the same size.
print(len(negatives), len(positives))

101 101


In [202]:
# One combined list of labelled rows (negatives first, then positives).
total = negatives + positives

In [203]:
# Mix the two classes in place.  A dedicated generator with a fixed seed
# keeps the order repeatable after a kernel restart and leaves the global
# `random` state untouched.
random.Random(0).shuffle(total)

In [204]:
# Root folder of the saved per-word files; the helpers in this notebook read
# <outdir><word>/<file> with np.load. The trailing "/" is required wherever
# paths are built by plain string concatenation.
outdir='wordvectors/gaussians/'

In [205]:
def KLDivergence(m1,m2,c1,c2):
    """Return 0.5 * (sum(log c2 - log c1) - d + sum(c1 / c2) + (m1-m2)·((m1-m2) / c2)).

    With ``c1`` and ``c2`` read as per-dimension variances this is the
    closed-form KL divergence KL(N(m1, diag c1) || N(m2, diag c2)).

    Parameters
    ----------
    m1, m2 : numpy arrays
        Mean vectors; ``d`` is ``m1.shape[0]``.
        (assumes 1-D arrays of equal length -- TODO confirm against the saved data)
    c1, c2 : numpy arrays
        Element-wise (co)variance terms matching ``m1`` / ``m2``.  They are
        assumed to be floating point: ``np.reciprocal`` is not meant for
        integer arrays.

    Returns
    -------
    numpy scalar
        The divergence; it is not symmetric in its two distributions.
    """
    inv_c2 = np.reciprocal(c2)
    diff = m1 - m2
    log_ratio = np.sum(np.log(c2) - np.log(c1))
    dim = m1.shape[0]
    trace_term = np.sum(inv_c2*c1)
    mahalanobis = np.dot(diff*inv_c2, diff)
    # Same left-to-right summation order as the one-line formula.
    return 0.5*(log_ratio - dim + trace_term + mahalanobis)

In [206]:
def findMaxKL(pt):
    """Load the ``0.npz`` file of both words in ``pt``.

    NOTE(review): this is an unfinished stub -- the two archives are loaded
    but nothing is computed from them and the function returns None.

    Parameters
    ----------
    pt : sequence
        ``pt[0]`` and ``pt[1]`` are the names of two word folders below
        ``outdir``.

    Raises
    ------
    FileNotFoundError
        If either ``<outdir><word>/0.npz`` does not exist.
    """
    data1 = np.load(outdir+pt[0]+'/'+'0.npz')
    # Second word: this used to read pt[0] again (copy-paste slip).
    data2 = np.load(outdir+pt[1]+'/'+'0.npz')

In [207]:
def _load_components(word):
    """Return a ``(weight, means, cov)`` tuple for every file in ``outdir/word``."""
    folder = os.path.join(outdir, word)
    components = []
    for fname in os.listdir(folder):
        # np.load keeps an .npz archive open until it is closed, so copy out
        # the arrays that are needed and release the handle straight away.
        with np.load(os.path.join(folder, fname)) as data:
            components.append((float(data['weights']), data['means'], data['cov']))
    return components


def findBestKL(pt):
    """Print the KL divergence between the heaviest components of two words.

    For each of ``pt[0]`` and ``pt[1]`` every file in the word's folder below
    ``outdir`` is read (keys ``weights``, ``means`` and ``cov``) and the
    component with the largest ``weights`` value is selected.  The value of
    ``KLDivergence(means1, means2, cov1, cov2)`` for that pair is printed.

    Parameters
    ----------
    pt : sequence
        ``pt[0]`` and ``pt[1]`` are the names of two word folders below
        ``outdir``.

    Returns
    -------
    None
        The divergence is printed, not returned.

    Raises
    ------
    KeyError
        If either word folder contains no files.
    FileNotFoundError
        If either word folder does not exist.
    """
    comps1 = _load_components(pt[0])
    comps2 = _load_components(pt[1])
    if not comps1 or not comps2:
        raise KeyError("no saved components for %r below %r" % (list(pt), outdir))
    # max() keeps the first maximum it meets; iterating in reverse makes the
    # component listed last win when several share the largest weight.
    _, means1, cov1 = max(reversed(comps1), key=lambda comp: comp[0])
    _, means2, cov2 = max(reversed(comps2), key=lambda comp: comp[0])
    print(KLDivergence(means1, means2, cov1, cov2))
# findBestKL(['animal','beverage'])
# Run both directions: the divergence is not symmetric, so swapping the two
# words gives a different value.
findBestKL(['computer','animal'])
findBestKL(['animal','computer'])

720.5837474959214
915.4892913281565


In [208]:
def compareBroadness(pt):
    """Print ``sum(log cov2 - log cov1)`` for every pair of saved components.

    Each file in the folder of ``pt[0]`` is paired with each file in the
    folder of ``pt[1]`` (both below ``outdir``); for every pair the summed
    log-ratio of their ``cov`` arrays is printed.  A positive value means the
    second component has the larger product of ``cov`` entries.

    Parameters
    ----------
    pt : sequence
        ``pt[0]`` and ``pt[1]`` are the names of two word folders below
        ``outdir``.

    Returns
    -------
    None
        One value is printed per pair.
    """
    maxwt = 0  # NOTE(review): never read in this function so far
    dir1 = os.path.join(outdir, pt[0])
    dir2 = os.path.join(outdir, pt[1])
    for w in os.listdir(dir1):
        # Close each archive as soon as its array has been read.
        with np.load(os.path.join(dir1, w)) as data1:
            cov1 = data1['cov']
        for x in os.listdir(dir2):
            with np.load(os.path.join(dir2, x)) as data2:
                cov2 = data2['cov']
            # Uses cov1/cov2; the previous c1/c2 were undefined names.
            val = np.sum(np.log(cov2) - np.log(cov1))
            print(val)
    