-
Notifications
You must be signed in to change notification settings - Fork 77
/
zscoretrans.py
144 lines (139 loc) · 4.89 KB
/
zscoretrans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
import scipy
import numpy as np
import scipy.stats
def cdf( d ):
'''
Input: (d) iterable, a data set
Output: (f) NumPy array with (bins) rows and two columns
the first column are values in the range of (d)
the second column are CDF values
alternatively, think of the columns as the
domain and range of the CDF function
(finv) inverse of (f)
---------------------------------------------------------
Calculate the cumulative distribution function
and its inverse for a given data set
'''
# the number of data points
N = float( len( d ) )
# sorted array of data points
xs = np.sort( d )
# array of unique data points
xu = np.unique( xs )
# number of unique data points
U = len( xu )
# initialize an array of U zeros
cdf = np.zeros((U))
# for each unique data point..
for i in range( U ):
# count the number of points less than
# this point, and then divide by the
# total number of data points
cdf[i] = len( xs[ xs < xu[i] ] ) / N
# f : input value --> output percentage describing
# the number of points less than the input scalar
# in the modeled distribution; if 5 is 20% into
# the distribution, then f[5] = 0.20
f = np.vstack((xu,cdf)).T
# inverse of f
# finv : input percentage --> output value that
# represents the input percentage point of the
# distribution; if 5 is 20% into the distribution,
# then finv[0.20] = 5
finv = np.fliplr(f)
return f, finv
def fit( d ):
'''
Input: (d) NumPy array with two columns,
a domain and range of a mapping
Output: (f) function that interpolates the mapping d
----------------------------------------------------
This takes a mapping and returns a function that
interpolates values that are missing in the original
mapping, and maps values outside the range* of the
domain (d) to the maximum or minimum values of the
range** of (d), respectively.
----------------------------------------------------
*range - here, I mean "maximum minus minimum"
**range - here I mean the output of a mapping
'''
x, y = d[:,0], d[:,1]
def f(t):
# return the minimum of the range
if t <= x.min():
return y[ np.argmin(x) ]
# return the maximum of the range
elif t >= x.max():
return y[ np.argmax(x) ]
# otherwise, interpolate linearly
else:
intr = scipy.interpolate.interp1d( x, y )
return intr(t)
return f
def to_norm( data ):
'''
Input (data) 1D NumPy array of observational data
Output (z) 1D NumPy array of z-score transformed data
(inv) inverse mapping to retrieve original distribution
'''
# look at the dimensions of the data
dims = data.shape
# if there is more than one dimension..
if len( dims ) > 1:
# take the third column of the second dimension
z = data[:,2]
# otherwise just use data as is
else:
z = data
# grab the number of data points
N = len( z )
# grab the cumulative distribution function
f, inv = cdf( z )
# h will return the cdf of z
# by interpolating the mapping f
h = fit( f )
# ppf will return the inverse cdf
# of the standard normal distribution
ppf = scipy.stats.norm(0,1).ppf
# for each data point..
for i in range( N ):
# h takes z to a value in [0,1]
p = h( z[i] )
# ppf takes p (in [0,1]) to a z-score
z[i] = ppf( p )
# convert positive infinite values
posinf = np.isposinf( z )
z = np.where( posinf, np.nan, z )
z = np.where( np.isnan( z ), np.nanmax( z ), z )
# convert negative infinite values
neginf = np.isneginf( z )
z = np.where( neginf, np.nan, z )
z = np.where( np.isnan( z ), np.nanmin( z ), z )
# if the whole data set was passed, then add the
# transformed variable and recombine with data
if len( dims ) > 1:
z = np.vstack(( data[:,:2].T, z )).T
return z, inv
def from_norm( data, inv ):
'''
Input: (data) NumPy array of zscore data
(inv) mapping that takes zscore data back
to the original distribution
Output: (z) Data that should conform to the
distribution of the original data
'''
# convert to a column vector
d = data.ravel()
# create an interpolating function
# for the inverse cdf, mapping zscores
# back to the original data distribution
h = fit( inv )
# convert z-score data to cdf values in [0,1]
f = scipy.stats.norm(0,1).cdf( d )
# use inverse cdf to map [0,1] values to the
# original distribution, then add the mu and sd
z = np.array( [ h(i) for i in f ] )
# reshape the data
z = np.reshape( z, data.shape )
return z