-
Notifications
You must be signed in to change notification settings - Fork 4
/
loader.py
117 lines (97 loc) · 3.8 KB
/
loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""============================================================================
Dataset loading functions.
============================================================================"""
from datasets.dataset import Dataset
from GPy import kern
import numpy as np
import pandas as pd
from scipy.special import (expit as logistic,
logsumexp)
from sklearn.datasets import make_s_curve
# -----------------------------------------------------------------------------
def load_dataset(rng, name, emissions):
    """Given a dataset string, returns data and possibly true generative
    parameters.
    """
    # Dispatch table from dataset name to its loader function. An unknown
    # name raises `KeyError`, same as the original lookup.
    loaders = {
        'bridges' : load_bridges,
        'congress': load_congress,
        's-curve' : gen_s_curve
    }
    loader = loaders[name]
    # Only the synthetic s-curve generator needs the RNG and emissions model;
    # the real-data loaders take no arguments.
    if name != 's-curve':
        return loader()
    return loader(rng, emissions)
# -----------------------------------------------------------------------------
def load_bridges():
    """Load NYC bridges dataset:
    https://data.cityofnewyork.us/Transportation/
    Bicycle-Counts-for-East-River-Bridges/gua4-p9wg

    Returns
    -------
    Dataset
        A labeled real-data `Dataset` with observations `Y` and `labels`.
    """
    # Plain string literal: the original used an f-string with no
    # placeholders (ruff F541).
    data = np.load('datasets/bridges.npy', allow_pickle=True)
    # `np.save` on a dict produces a 0-d object array; `[()]` unwraps it
    # back into the stored dict.
    data = data[()]
    Y = data['Y']
    labels = data['labels']
    return Dataset('bridges', True, Y, labels=labels)
# -----------------------------------------------------------------------------
def load_congress():
    """Congress 109 data:
    https://github.com/jgscott/STA380/blob/master/data/congress109.csv
    https://github.com/jgscott/STA380/blob/master/data/congress109members.csv

    Returns
    -------
    Dataset
        A labeled real-data `Dataset`: word counts `Y` per member and a
        binary party label per member.

    Raises
    ------
    ValueError
        If the two CSV files do not describe the same number of members.
    """
    # Plain string literals: the originals were f-strings with no
    # placeholders (ruff F541).
    df1 = pd.read_csv('datasets/congress109.csv')
    df2 = pd.read_csv('datasets/congress109members.csv')
    # Validate with an explicit exception rather than `assert`, which is
    # stripped when Python runs with `-O`.
    if len(df1) != len(df2):
        raise ValueError('congress109.csv and congress109members.csv must '
                         'have the same number of rows.')
    # Ensure same ordering.
    df1 = df1.sort_values(by='name')
    df2 = df2.sort_values(by='name')
    # Drop the leading `name` column; remaining columns are integer counts.
    Y = df1.values[:, 1:].astype(int)
    # Binary party label: 0 for 'R', 1 for everyone else — vectorized
    # equivalent of the original per-element list comprehension.
    labels = np.where(df2.party.values == 'R', 0, 1)
    return Dataset('congress109', True, Y, labels=labels)
# -----------------------------------------------------------------------------
# Datasets with synthetic latent spaces.
# -----------------------------------------------------------------------------
def gen_s_curve(rng, emissions):
    """Generate synthetic data from the dataset's generating process.

    Parameters
    ----------
    rng : numpy.random generator/state
        Source of randomness for all sampling; using it (rather than the
        global `np.random`) keeps the data reproducible for a fixed seed.
    emissions : str
        One of 'bernoulli', 'gaussian', 'multinomial', 'negbinom',
        'poisson'.

    Returns
    -------
    Dataset
        Synthetic `Dataset` with observations `Y`, latent positions `X`,
        GP maps `F`, kernel `K`, dispersion `R` (negbinom only), and the
        manifold coordinate `t`.

    Raises
    ------
    ValueError
        If `emissions` is not one of the supported models.
    """
    N = 500  # number of observations
    J = 100  # number of emission dimensions
    D = 2    # latent dimensionality

    # Generate latent manifold.
    # -------------------------
    X, t = make_s_curve(N, random_state=rng)
    # Drop the "depth" axis so the latent space is the 2-D S-shape.
    X = np.delete(X, obj=1, axis=1)
    X = X / np.std(X, axis=0)
    # Sort by position along the curve so `t` is monotone.
    inds = t.argsort()
    X = X[inds]
    t = t[inds]

    # Generate kernel `K` and latent GP-distributed maps `F`.
    # -------------------------------------------------------
    K = kern.RBF(input_dim=D, lengthscale=1).K(X)
    F = rng.multivariate_normal(np.zeros(N), K, size=J).T

    # Generate emissions using `F` and/or `K`.
    # ----------------------------------------
    if emissions == 'bernoulli':
        P = logistic(F)
        Y = rng.binomial(1, P).astype(np.double)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'gaussian':
        # Bug fix: the original drew noise from the global `np.random`,
        # ignoring the caller's `rng` and breaking reproducibility.
        Y = F + rng.normal(0, scale=0.5, size=F.shape)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'multinomial':
        C = 100
        # Row-wise softmax, numerically stabilized via logsumexp.
        pi = np.exp(F - logsumexp(F, axis=1)[:, None])
        Y = np.zeros(pi.shape)
        for n in range(N):
            Y[n] = rng.multinomial(C, pi[n])
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    elif emissions == 'negbinom':
        P = logistic(F)
        # Per-dimension dispersion parameters 1..J.
        R = np.arange(1, J + 1, dtype=float)
        Y = rng.negative_binomial(R, 1 - P)
        return Dataset('s-curve', False, Y, X, F, K, R, t)
    elif emissions == 'poisson':
        theta = np.exp(F)
        Y = rng.poisson(theta)
        return Dataset('s-curve', False, Y, X, F, K, None, t)
    else:
        # Explicit error instead of `assert`, which vanishes under `-O`.
        raise ValueError(f'Unknown emissions model: {emissions!r}')