In [1]:
# make necessary imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils.data
from torch.optim import *
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torchvision.utils import save_image

In [None]:
class VAE(nn.Module):
    """
    parametrizable VAE module
    """
    def __init__(self, dim_input, dim_output):
        """
        flexible constructor for Neural network: allow for user-defined input 
        and latent space dimensions
        :param dim_input: required, dimensionality of input data
        .param dim_output: required, dimensionality of latent space
        """
        super(VAE, self).__init__()
        
        self.fc1  = nn.Linear(dim_input, 400)
        self.fc21 = nn.Linear(400, dim_output)
        self.fc22 = nn.Linear(400, dim_output)
        self.fc3  = nn.Linear(dim_output, 400)
        self.fc4  = nn.Linear(400, dim_input)
    
    
    def encode(self, X):
        """
        encodes the input image into two vectors: mean and variance
        :param X: input data in torch Tensor format
        :returns: mu and var
        """
        hidden1 = F.relu(self.fc1(X))
        return self.fc21(hidden1), self.fc22(hidden1)
    
    
    def reparameterize(self, mu, logvar):
        """
        implementation of the reparameterization trick, allowing for training with random sampling
        :param mu: mean values tensor
        :param var: logvariance tensor
        :returns: random tensor from the Gaussian distribution
        """
        # get variance
        std = torch.exp(0.5*logvar)
        
        # get random tensor from normal distribution of mean 0 and var 1 of size like std
        eps = torch.randn_like(std)
        return mu+eps*std
    
    
    def decode(self, z):
        """
        project a tensor from the latent space back into original coordinates
        :param z: tensor in the latent space to be decoded
        """
        hidden3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(hidden3))
    
    
    def forward(self, x):
        """
        forward function of VAE NN
        :param x: input image in torch Tensor format
        returns: x decoded from latent space along with mean and logvar tensors
        """
        mu, logvar = self.encode(x.view(-1, 784))
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar 

In [7]:
path = "../../bigassdata/GoogleNews-vectors-negative300.bin"
dt = np.dtype([(str(i), "f4") for i in range(1,301)])

In [8]:
import numpy as np
with open(path) as file:
    data = np.fromfile(file, dtype=dt)

In [9]:
import pandas as pd
df = pd.DataFrame(data)

In [10]:
df.shape

(3036882, 300)

In [11]:
df.head(2)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,6.409692e-10,1.492372e-19,8.483157000000001e-33,0.2374849,-6.462373e-27,1.547436e+26,-1.776396e-15,-524291.5625,-1.009749e-28,-3.1e-05,...,4.440923e-16,-2.842234e-14,-2.351005e-38,2.524411e-29,-8388794.0,-1.192119e-07,1.701424e+38,562962300000000.0,-2.910448e-11,-1.654373e-24
1,2.584957e-26,-134218700.0,2.351041e-38,-4.768477e-07,2.0194359999999999e-19,0.0703125,0.08691406,0.087891,0.0625,0.069336,...,0.06396484,-0.1064453,-0.1435547,-0.04223633,0.02404785,-0.1689453,-0.08886719,-0.08056641,0.06494141,0.0612793


In [None]:
df.head(10000).to_csv("word2vec_4.csv")
df.head(100000).to_csv("word2vec_5.csv")
df.head(500000).to_csv("word2vec_6.csv")
df.head(1000000).to_csv("word2vec_7.csv")