In [2]:
import gzip
import os
from urllib.request import urlretrieve
import numpy as np
import sparse
import pandas as pd
%matplotlib inline

def download_files():
    "Download the file if they don't already exist."
    for filename in ['nips.tns.gz',
                     'mode-1-papers.map.gz',
                     'mode-2-authors.map.gz',
                     'mode-3-words.map.gz',
                     'mode-4-years.map.gz']:
        url = 'https://s3.us-east-2.amazonaws.com/frostt/frostt_data/nips/' + filename
        if not os.path.exists(filename):
            urlretrieve(url, filename)

            
def load_data(path):
    """Load the sparse tensor dataset at `path`"""
    values = []
    coords = []
    with gzip.open(path, 'rb') as f:
        for line in f:
            data = line.strip().split(b' ')
            coords.append([int(i) - 1 for i in data[:-1]])
            values.append(float(data[-1]))
    coords = np.array(coords, dtype=np.int32).T
    values = np.array(values, dtype=np.float64)
    return sparse.COO(coords, data=values)


def load_mode(path):
    """Load the axis information for the mode file at `path`"""
    with gzip.open(path, 'rt') as f:
        return pd.Series(f.read().splitlines())

In [3]:
download_files()

In [4]:
values = []
coords = []

In [5]:
with gzip.open('nips.tns.gz', 'rb') as f:
    for line in f:
        data = line.strip().split(b' ')
        coords.append([int(i) - 1 for i in data[:-1]])
        values.append(float(data[-1]))

In [6]:
data

[b'2482', b'2079', b'13967', b'17', b'2']

In [7]:
coords = np.array(coords, dtype=np.int32).T
values = np.array(values, dtype=np.float64)

In [9]:
len(coords)

4

In [34]:
coords.shape

(4, 9304827)

In [26]:
len(values)

9304827

In [35]:
values.shape

(9304827,)

In [27]:
tensor = sparse.COO(coords, data=values)

In [37]:
print(tensor)

<COO: shape=(2482, 2862, 14036, 17), dtype=float64, nnz=3101609, fill_value=0.0>


In [36]:
tensor.nbytes / 1e9  

0.074438616