In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn.functional as F
from torch.distributions import constraints

from collections import Counter

from sklearn.mixture import GaussianMixture,BayesianGaussianMixture

In [2]:
# Input root: each entry listed under this path is treated as a word, and
# read_vectors() loads that word's saved tensors from `directory + word`.
directory='wordvectors/bert_entailment/'
# Output root: the final cell creates `outdir + word + '/'` and writes one
# .npz file per retained mixture component there.
outdir='wordvectors/gaussians/'

In [3]:
def read_vectors(word, base_dir=None):
    """Load every saved tensor for ``word`` and stack them into one tensor.

    Parameters
    ----------
    word : str
        Name of the sub-directory (under ``base_dir``) holding the tensors.
    base_dir : str, optional
        Root directory to look in. Defaults to the notebook-level
        ``directory`` setting when omitted.

    Returns
    -------
    torch.Tensor or list
        ``torch.stack`` of all loaded tensors (one per file, stacked along a
        new leading dimension), or an empty list ``[]`` when the word
        directory is missing, is not a directory, or holds no loadable files.

    Notes
    -----
    Files whose name ends in ``"txt"`` are skipped; every other entry is
    passed to ``torch.load``. File names are processed in sorted order so the
    row order of the result is reproducible across runs and file systems.
    """
    if base_dir is None:
        base_dir = directory
    word_dir = os.path.join(base_dir, word)

    # Stray files (or a vanished directory) under the root should be treated
    # as "no vectors for this word" rather than aborting the whole run.
    if not os.path.isdir(word_dir):
        return []

    tensors = []
    for fname in sorted(os.listdir(word_dir)):
        if fname.endswith("txt"):
            continue
        # NOTE: torch.load unpickles the file contents; only point this at
        # tensor files you trust.
        tensors.append(
            torch.load(os.path.join(word_dir, fname),
                       map_location=torch.device('cpu'))
        )

    if not tensors:
        return []
    return torch.stack(tensors)

In [9]:
def findGaussians(word, n_components=10, weight_concentration_prior=0.1,
                  max_iter=200, random_state=None):
    """Fit a diagonal-covariance Bayesian Gaussian mixture to a word's vectors.

    Parameters
    ----------
    word : str
        Word whose vectors are loaded via ``read_vectors(word)``.
    n_components : int, default 10
        Upper bound on the number of mixture components. It is capped at the
        number of available vectors, because scikit-learn refuses to fit a
        mixture with more components than samples.
    weight_concentration_prior : float, default 0.1
        Passed through to ``BayesianGaussianMixture``.
    max_iter : int, default 200
        Passed through to ``BayesianGaussianMixture``.
    random_state : int or None, default None
        Passed through to ``BayesianGaussianMixture``; set it for
        reproducible fits.

    Returns
    -------
    list or tuple
        ``[]`` when there are no vectors (or too few to fit a mixture);
        otherwise ``(weights, means, diag_covariances)``, three parallel lists
        with one entry per component that at least one vector was assigned
        to by ``predict``. Components are listed in order of first appearance
        in the predicted labels. The weights are taken as-is from the fitted
        model and are not renormalised over the retained components.
    """
    tensors = read_vectors(word)
    # len() works both for the empty-list sentinel and for a stacked tensor,
    # and avoids comparing a tensor against a list.
    n_samples = len(tensors)
    if n_samples < 2:
        # Nothing loaded, or a single vector: scikit-learn's mixture
        # estimators require at least two samples, so there is nothing to fit.
        return []

    dpgmm = BayesianGaussianMixture(
        n_components=min(n_components, n_samples),
        covariance_type="diag",
        weight_concentration_prior=weight_concentration_prior,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(tensors)

    labels = dpgmm.predict(tensors)
    # Keep only components that actually received a vector, preserving the
    # order in which they first appear.
    used = list(dict.fromkeys(labels))
    weights = [dpgmm.weights_[k] for k in used]
    means = [dpgmm.means_[k] for k in used]
    diag_covariances = [dpgmm.covariances_[k] for k in used]
    return weights, means, diag_covariances
    

In [None]:
# Fit a mixture for every word under `directory` and write one .npz file per
# retained component to `outdir/<word>/<index>.npz`.
# Iterating in sorted order keeps the processing order reproducible.
for word in sorted(os.listdir(directory)):
    gaussians = findGaussians(word)
    if len(gaussians) == 0:
        # No usable vectors for this word.
        continue

    # makedirs(exist_ok=True) creates `outdir` itself on a first run and lets
    # the cell be re-executed without failing on directories that already exist.
    # NOTE: a re-run that yields fewer components leaves higher-numbered .npz
    # files from the previous run in place.
    word_dir = os.path.join(outdir, word)
    os.makedirs(word_dir, exist_ok=True)

    weights, means, covariances = gaussians
    for i, (weight, mean, cov) in enumerate(zip(weights, means, covariances)):
        np.savez(os.path.join(word_dir, str(i) + '.npz'),
                 weights=weight, means=mean, cov=cov)