In [1]:
%config IPCompleter.greedy=True
%matplotlib inline

In [146]:
import pandas as pd, numpy as np, os, sys
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from IPython.display import display, HTML
from IPython.core.debugger import Tracer
from IPython.core.debugger import Pdb

font = {'size'   : 18}
matplotlib.rc('font', **font)

def plotHorizontalBars(series, xlabel, title):
    xpos = np.arange(len(series.index), 0, -1)
    plt.barh(xpos, series, align='center', alpha=0.5, color='g')
    plt.grid(axis='x')
    plt.yticks(xpos, series.index)
    plt.xlabel(xlabel)
    plt.title(title)

def plotVerticalBars(series, ylabel, title):
    xpos = np.arange(len(series.index), 0, -1)
    plt.bar(xpos, series, align='center', alpha=0.5, color='g')
    plt.grid(axis='y')
    plt.xticks(xpos, series.index)
    plt.ylabel(ylabel)
    plt.title(title)

def greater_than_dist(x, y, binSize, minx=None, maxx=None):
    """
    x and y are pd Series
    
    both x is numeric and y is categorical. returns y's probabilities in each x bin
    """
    assert(y is not None)

    vals = np.sort(y.unique())
    cols = [x.name] + vals.astype(np.str).tolist()
    vals = pd.Series(np.zeros(vals.shape[0]), index=vals)
    
    df = pd.concat([x, y], axis=1, ignore_index=True)
    df.columns = ['x', 'y']
    minx = df.x.min() - 1e-12 if minx is None else minx
    maxx = df.x.max() if maxx is None else maxx

    df = df[(df.x >= minx) & (df.x < maxx)]
    bins = np.arange(minx, maxx, binSize)
    
    rs = np.array([])
    for bin_min in bins:
        z = df[df.x >= bin_min]
        probs = z.y.value_counts(normalize=True)
        probs = vals.align(probs, fill_value=0.0)[1].sort_index()
        probs = [bin_min] + probs.tolist()
        rs = np.append(rs, probs)
    rs = rs.reshape(-1, vals.shape[0]+1)
    rs = pd.DataFrame(rs, columns=cols)
    return rs

In [16]:
from sklearn import preprocessing

x = np.array([[1, -2, -2], [3, 0, 0], [0, 1, -1]])
x = x.astype(np.float32)
print(x)
print()
print(x.mean(axis=0))
print(x.std(axis=0))
print()
a = x-x.mean(axis=0)
b = a / x.std(axis=0)
print(a)
print(b)

[[ 1. -2. -2.]
 [ 3.  0.  0.]
 [ 0.  1. -1.]]

[ 1.3333334  -0.33333334 -1.        ]
[1.2472192 1.2472192 0.8164966]

[[-0.33333337 -1.6666666  -1.        ]
 [ 1.6666666   0.33333334  1.        ]
 [-1.3333334   1.3333334   0.        ]]
[[-0.26726127 -1.3363061  -1.2247448 ]
 [ 1.3363061   0.26726124  1.2247448 ]
 [-1.069045    1.069045    0.        ]]


In [11]:
scaled = preprocessing.scale(x, axis=0)
print(scaled)
print(scaled.mean(axis=0))
print(scaled.std(axis=0))

[[-0.2672612  -1.3363061  -1.2247448 ]
 [ 1.3363061   0.26726118  1.2247448 ]
 [-1.069045    1.069045    0.        ]]
[-3.973643e-08  0.000000e+00  0.000000e+00]
[0.9999999 0.9999999 0.9999999]




In [28]:
scaled = preprocessing.normalize(x, norm='l1', axis=0)
print(scaled)
scaled = preprocessing.normalize(x, norm='l2', axis=0)
print(scaled)

[[ 0.25       -0.6666667  -0.6666667 ]
 [ 0.75        0.          0.        ]
 [ 0.          0.33333334 -0.33333334]]
[[ 0.31622776 -0.8944272  -0.8944272 ]
 [ 0.94868326  0.          0.        ]
 [ 0.          0.4472136  -0.4472136 ]]


In [29]:
# L1-norm: normalize with the sum
a = np.array([1,3,0])
l1 = np.sum(a)
print(l1)
print(a/l1)

# L2=norm: add up sqaure, then sqrt the sum. Use that to normalize
l2 = np.sqrt(np.sum(np.square(a)))
print(l2)
print(a/l2)

4
[0.25 0.75 0.  ]
3.1622776601683795
[0.31622777 0.9486833  0.        ]


In [36]:
a = preprocessing.MinMaxScaler(feature_range=(-10, 10)).fit_transform(x) # always axis=0
print(a)

[[ -3.3333335 -10.        -10.       ]
 [ 10.          3.333333   10.       ]
 [-10.         10.          0.       ]]


In [37]:
a = preprocessing.Binarizer(threshold=0.5).transform(x)
print(x)
print()
print(a)

[[ 1. -2. -2.]
 [ 3.  0.  0.]
 [ 0.  1. -1.]]

[[1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


In [38]:
from numpy import nan
y = np.array([nan, 0, 3, 2, 9, -8, 1, nan, 1, 5, 2, 4, 7, 6, -1]).reshape(-1, 3)
print(y)

[[nan  0.  3.]
 [ 2.  9. -8.]
 [ 1. nan  1.]
 [ 5.  2.  4.]
 [ 7.  6. -1.]]


In [44]:
from sklearn.preprocessing import Imputer
a1 = y[:, 1]
print(a1)
print(np.nanmedian(a1))

a = Imputer(strategy='median').fit_transform(y)
print(a)

[ 0.  9. nan  2.  6.]
4.0
[[ 3.5  0.   3. ]
 [ 2.   9.  -8. ]
 [ 1.   4.   1. ]
 [ 5.   2.   4. ]
 [ 7.   6.  -1. ]]


In [57]:
z = np.random.rand(1000)
bins = 13
z = z * bins # to make bins unity, so densities are normalized
a = np.histogram(z, bins=bins, range=(0, bins))
print(a)
a = np.histogram(z, bins=bins, range=(0, bins), normed=True)
print(a)
a = np.histogram(z, bins=bins, range=(0, bins), density=True)
print(a)

(array([84, 80, 81, 84, 69, 70, 77, 80, 73, 67, 69, 84, 82]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13.]))
(array([0.084, 0.08 , 0.081, 0.084, 0.069, 0.07 , 0.077, 0.08 , 0.073,
       0.067, 0.069, 0.084, 0.082]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13.]))
(array([0.084, 0.08 , 0.081, 0.084, 0.069, 0.07 , 0.077, 0.08 , 0.073,
       0.067, 0.069, 0.084, 0.082]), array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13.]))


In [63]:
z = np.sort(np.random.rand(10))
print(z)
bins = 10
z *= bins
z1 = [ z[z>x].shape[0] for x in range(bins) ]
print(z1)

[0.01023458 0.02336912 0.14638387 0.23242962 0.23479087 0.64852594
 0.64963265 0.90113063 0.90738677 0.97610273]
[10, 8, 7, 5, 5, 5, 5, 3, 3, 3]


In [144]:
samples = 80000
n_vals = 5
yv = [chr(ord('a')+i) for i in range(n_vals)]

x = range(samples)
y = np.array(yv * (samples // n_vals)); np.random.shuffle(y)
a = pd.DataFrame(np.column_stack((x, y)), columns=['x-axis', 'y-axis'])
a['x-axis'] = a['x-axis'].astype('int64')
display(a.head(10))
print(a.shape)

bin_size = samples // 10
rs = greater_than_dist(a['x-axis'], a['y-axis'], bin_size)
rs['x-axis'] = (rs['x-axis']+1e-7).astype('int64')
display(rs)

Unnamed: 0,x-axis,y-axis
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e
5,5,c
6,6,b
7,7,c
8,8,a
9,9,c


(80000, 2)


Unnamed: 0,x-axis,a,b,c,d,e
0,0,0.19999,0.200003,0.200003,0.200003,0.200003
1,8000,0.199781,0.199975,0.199503,0.200336,0.200406
2,16000,0.199925,0.200113,0.199534,0.20005,0.200378
3,24000,0.198789,0.20145,0.199486,0.200968,0.199307
4,32000,0.197275,0.201317,0.200129,0.201004,0.200275
5,40000,0.19738,0.20063,0.20128,0.20088,0.19983
6,48000,0.197131,0.199787,0.2026,0.199819,0.200663
7,56000,0.19655,0.19805,0.205425,0.201342,0.198633
8,64000,0.195762,0.198637,0.204138,0.202763,0.1987
9,72000,0.201275,0.19615,0.205151,0.198775,0.19865


In [124]:
x = a['x-axis']
y = a['y-axis']
xname, yname = x.name, y.name
print(xname, yname)

vals = np.sort(y.unique())
cols = ['bin'] + vals.astype(np.str).tolist()
print(cols)
vals = pd.Series(np.zeros(vals.shape[0]), index=vals)
print(vals)

df = pd.concat([x, y], axis=1, ignore_index=True)
df.columns = ['x', 'y']
print(df)

minx = maxx = None
binSize = 4
if minx is None:
    minx = df.x.min() - 1e-12
if maxx is None:
    maxx = df.x.max()
print(minx, maxx)

df = df[(df.x >= minx) & (df.x < maxx)]
bins = np.arange(minx, maxx, binSize)
print('bins:', bins)


x-axis y-axis
['bin', 'a', 'b', 'c', 'd', 'e']
a    0.0
b    0.0
c    0.0
d    0.0
e    0.0
dtype: float64
     x  y
0    0  c
1    1  d
2    2  b
3    3  b
4    4  d
5    5  a
6    6  e
7    7  b
8    8  c
9    9  d
10  10  b
11  11  c
12  12  e
13  13  a
14  14  a
15  15  d
16  16  a
17  17  c
18  18  e
19  19  e
-1e-12 19
bins: [-1.0e-12  4.0e+00  8.0e+00  1.2e+01  1.6e+01]


In [125]:
rs = np.array([])
for i, bin_min in enumerate(bins):
    z = df[df.x >= bin_min]
    probs = z.y.value_counts(normalize=True).sort_index()
    probs = vals.align(probs)[1]
    probs[pd.isnull(probs)] = 0.0
    probs = [bin_min] + probs.tolist()
    rs = np.append(rs, probs)
rs = rs.reshape(-1, vals.shape[0]+1)
rs = pd.DataFrame(rs, columns=cols)
print(rs)

            bin         a         b         c         d         e
0 -1.000000e-12  0.210526  0.210526  0.210526  0.210526  0.157895
1  4.000000e+00  0.266667  0.133333  0.200000  0.200000  0.200000
2  8.000000e+00  0.272727  0.090909  0.272727  0.181818  0.181818
3  1.200000e+01  0.428571  0.000000  0.142857  0.142857  0.285714
4  1.600000e+01  0.333333  0.000000  0.333333  0.000000  0.333333
