In [1]:
from tswift import Artist, Song, TswiftError
import random

import numpy as np
import torch
from sklearn.mixture import GaussianMixture

import codecs
import pickle
import logging
import nltk
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stopwords = nltk.corpus.stopwords
remove_these = {*stopwords.words('english')}

def load_external_embeddings(params, emb_path):
    """
    Reload pretrained embeddings from a text file.

    Every data line holds a word followed by its space-separated vector
    components. Lines made of exactly two whitespace-separated tokens are
    treated as a header and skipped, and blank lines are ignored.

    Args:
        params: object exposing ``emb_dim`` (the expected vector length) and
            ``cuda`` (move the result to the GPU when one is available).
        emb_path: path of the embedding text file, decoded as UTF-8.

    Returns:
        A ``(dico, embeddings)`` pair: ``dico`` maps each word to its row
        index and ``embeddings`` is a float tensor of shape
        ``(len(dico), params.emb_dim)``.

    Raises:
        AssertionError: on a duplicated word or a vector whose length differs
            from ``params.emb_dim``.
        ValueError: if a data line has no vector, or if the file contains no
            embedding at all.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    _emb_dim_file = params.emb_dim
    # Decode explicitly as UTF-8 so the vocabulary does not depend on the
    # platform's default locale encoding.
    with open(emb_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            tokens = line.split()
            # Two tokens: header line. No token: blank line (e.g. at EOF).
            if len(tokens) == 2 or not tokens:
                continue
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            if np.linalg.norm(vect) == 0:  # avoid to have null embeddings
                vect[0] = 0.01
            assert word not in word2id
            assert vect.shape == (_emb_dim_file,), i
            word2id[word] = len(word2id)
            vectors.append(vect[None])

    logging.info("Loaded %i pre-trained word embeddings", len(vectors))

    if not vectors:
        # np.concatenate would fail with an opaque message on an empty list.
        raise ValueError("no embeddings found in %s" % emb_path)

    dico = word2id

    embeddings = np.concatenate(vectors, 0)
    embeddings = torch.from_numpy(embeddings).float()
    if params.cuda and torch.cuda.is_available():
        embeddings = embeddings.cuda()
    assert embeddings.size() == (len(word2id), params.emb_dim), ((len(word2id), params.emb_dim, embeddings.size()))

    return dico, embeddings

# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# parameter files that come from a trusted source.
# The context manager closes the file handle as soon as the parameters are read.
with open("model/ft_params.pkl", 'rb') as pkl_file:
    params = pickle.load(pkl_file)

# Word -> row index dictionary and the matching embedding matrix.
out_dico, out_emb = load_external_embeddings(params, "data/ft_postspec.txt")

In [2]:
import nltk

# English and Spanish stop words are excluded from the vocabulary built below.
stopwords = nltk.corpus.stopwords
remove_these = set(stopwords.words('english') + stopwords.words('spanish'))

# 0-based index of the line of mxm_dataset_train.txt that holds the
# comma-separated word list (presumably the '%'-prefixed vocabulary header of
# the dataset -- confirm against the file if the dataset version changes).
MXM_VOCAB_LINE = 17

# Load mapping from contracted word to full word string in the mxm dataset.
# Both files are decoded explicitly as UTF-8 so the result does not depend on
# the platform's default encoding.
with open('mxm_reverse_mapping.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    _map = {}
    for l in lines:
        if not l.strip():
            continue  # tolerate blank lines, e.g. a trailing one at end of file
        _input, output = l.split("<SEP>")
        _map[_input] = output  # keeps its trailing newline; stripped on lookup below

with open('mxm_dataset_train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Map every vocabulary entry back to its full form, then keep only the words
# that are not stop words and that have an "en_"-prefixed embedding.
words = lines[MXM_VOCAB_LINE].replace('%', '').split(',')
words = [_map[word.replace('\n', '')].replace('\n', '') for word in words]
words = [word for word in words if word not in remove_these and "en_" + word in out_dico]

print(len(words))
print(words)

4102


In [18]:
import random

# Alternative similarity measure kept available for experimentation; the
# statistics below use the Euclidean distance.
cos = torch.nn.CosineSimilarity(dim=0)

SAMPLE_SIZE = 2500   # number of comparison words drawn at random for each word
SAMPLE_SEED = 0      # fixed seed so the sampled statistics are reproducible

embeddings = [out_emb[out_dico["en_" + word]] for word in words]
if len(embeddings) < 2:
    raise ValueError("need at least two embeddings to compare distances")

# Stack once, outside the loop, so the sampled rows can be gathered by index
# directly on the embeddings' device.
emb_matrix = torch.stack(embeddings)
sample_size = min(SAMPLE_SIZE, len(embeddings))
rng = random.Random(SAMPLE_SEED)  # local generator: leaves the global RNG state untouched

averages = []   # per word: mean distance to its sampled comparison words
minimums = []   # per word: smallest distance to its sampled comparison words
for i, emb in enumerate(embeddings):
    indices = rng.sample(range(len(embeddings)), sample_size)

    # Never compare a word with itself. This is the only exclusion needed:
    # positions inside the sample are unrelated to word indices.
    if i in indices:
        indices.remove(i)

    index_tensor = torch.tensor(indices, device=emb_matrix.device)
    dists = torch.linalg.norm(emb_matrix[index_tensor] - emb, dim=1)

    averages.append(dists.mean())
    minimums.append(dists.min())
    if i % 50 == 0:
        print(i)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100


In [19]:
def _mean(values):
    """Return the arithmetic mean of a non-empty sequence."""
    return sum(values) / len(values)


# Element-wise ratio of each entry of `minimums` to the matching entry of `averages`.
ratios = [minimums[idx] / average for idx, average in enumerate(averages)]

print(_mean(minimums))
# print(sum(maximums) / len(maximums))
print(_mean(averages))
print(_mean(ratios))

tensor(0.8614, device='cuda:0')
tensor(1.3745, device='cuda:0')
tensor(0.6267, device='cuda:0')


In [5]:
# Sanity check: Euclidean norm of the first word embedding.
first_embedding_norm = torch.linalg.norm(embeddings[0])
print(first_embedding_norm)

tensor(1., device='cuda:0')


In [11]:
import math

# Distance between two opposite vectors, each scaled by 1 / sqrt(3).
unit_vec = torch.tensor([1., 1., 1.]) / math.sqrt(3)
opposite_vec = torch.tensor([-1., -1., -1.]) / math.sqrt(3)
print(torch.linalg.norm(unit_vec - opposite_vec))

tensor(2.0000)
