In [None]:
import math
import numpy as np
import scipy
import scipy.stats
from scipy.stats import gamma, norm, lognorm
import pandas as pd
import altair as alt
from detail.altairdf import altairDF
# Switch Altair to the renderer registered under the name "notebook".
alt.renderers.enable("notebook")

In [None]:
# FICO score ranges paired with a percentage figure per range; the figures
# are written in percent and stored as fractions (divided by 100).
fico = pd.DataFrame(
    {
        "percentage": [
            pct / 100
            for pct in (4.7, 6.8, 8.5, 10.0, 13.2, 17.1, 19.0, 20.7)
        ],
        "range": [
            "300-499", "500-549", "550-599", "600-649",
            "650-699", "700-749", "750-799", "800-850",
        ],
    }
)
fico

In [None]:
# Bar chart of the "percentage" column against "range"; the y axis uses the
# "%" number format.
(
    alt.Chart(fico)
    .mark_bar()
    .encode(
        x="range",
        y=alt.Y("percentage", axis=alt.Axis(format="%")),
    )
    .properties(width=600, title="FICO scores, 2017")
)

In [None]:
# Gamma-shaped query-latency density on [0, 2].
#
# The curve used to be gamma.pdf(5 * x, 1.5): that squeezes the x axis by a
# factor of 5 without rescaling the height, so the plotted "pdf" integrated
# to 1/5 instead of 1. Passing scale=1/5 keeps exactly the same shape while
# producing a properly normalised density.
plotdf = altairDF(
    np.linspace(0, 2, num=100).tolist(),
    [lambda x: gamma.pdf(x, 1.5, scale=1 / 5)],
    labels=["1. normal"],
    ycol="pdf",
    xcol="query latency",
)
alt.Chart(plotdf).mark_line().encode(x="query latency", y="pdf", color="color")

In [None]:
# Two gamma-shaped query-latency densities on [0, 2], drawn on the same axes.
#
# The curves used to be gamma.pdf(5*x, 1.5) and gamma.pdf(3*x, 3). Scaling
# the argument without scaling the height left them integrating to 1/5 and
# 1/3 respectively, so their relative heights on a shared "pdf" axis were
# distorted. Using the scale parameter keeps each shape and normalises both
# curves to unit area.
plotdf = altairDF(
    np.linspace(0, 2, num=100).tolist(),
    [
        lambda x: gamma.pdf(x, 1.5, scale=1 / 5),
        lambda x: gamma.pdf(x, 3, scale=1 / 3),
    ],
    labels=["1. normal", "2. latest"],
    ycol="pdf",
    xcol="query latency",
)
alt.Chart(plotdf).mark_line().encode(x="query latency", y="pdf", color="color")

In [None]:
# Normal(0, 1) density on [-4, 4] with three vertical rules layered on top.
# The rule positions come from the calculated fields a1=0.5, a2=3 and a3=4.
plotdf = altairDF(
    np.linspace(-4, 4, num=100).tolist(),
    [lambda x: norm.pdf(x, loc=0, scale=1)],
    labels=["gaussian"],
    ycol="pdf",
)
base = alt.Chart().mark_line().encode(x="x", y="pdf", color="color")
rule1, rule2, rule3 = (
    alt.Chart().mark_rule(color=shade).encode(x=field)
    for shade, field in (("green", "a1:Q"), ("orange", "a2:Q"), ("red", "a3:Q"))
)
(
    alt.layer(base, rule1, rule2, rule3, data=plotdf)
    .transform_calculate(a1="0.5", a2="3", a3="4")
    .properties(width=600, height=200)
)

In [None]:
# Normal(0, 1) PDF and CDF over [-4, 4], one line per function.
plotdf = altairDF(
    np.linspace(-4, 4, num=100).tolist(),
    [
        lambda x: norm.pdf(x, loc=0, scale=1),
        lambda x: norm.cdf(x, loc=0, scale=1),
    ],
    labels=["gaussian", "gaussian CDF"],
    ycol="pdf/cdf",
)
(
    alt.Chart(plotdf)
    .mark_line()
    .encode(x="x", y="pdf/cdf", color="color")
    .properties(width=600, height=200)
)

In [None]:
def _gaussian_frame(lo, hi):
    """Build the Normal(0, 1) pdf frame on 100 evenly spaced points in [lo, hi]."""
    return altairDF(
        np.linspace(lo, hi, num=100).tolist(),
        [lambda x: norm.pdf(x, 0, 1)],
        labels=["gaussian"],
        ycol="pdf",
    )


# Full curve on [-4, 4] plus separate frames for the two tails, [-4, -2] and
# [2, 4], which are drawn as filled areas under the line.
plotdf, t1df, t2df = (
    _gaussian_frame(lo, hi) for lo, hi in ((-4, 4), (-4, -2), (2, 4))
)
base = alt.Chart(plotdf).mark_line().encode(x="x", y="pdf", color="color")
t1, t2 = (
    alt.Chart(tail).mark_area().encode(x="x", y="pdf", color="color")
    for tail in (t1df, t2df)
)
alt.layer(base, t1, t2).properties(width=600, height=200)

In [None]:
# Gaussian, exponential and gamma densities compared on [0, 2].
#
# The "exponential" curve used to be gamma.pdf(2*x, 1) = exp(-2x), which is
# only half the height of the rate-2 exponential density 2*exp(-2x) and
# integrates to 1/2. Next to the normalised gaussian and gamma curves that
# under-drew it. Expressing the rate through scale=1/2 gives the true density.
plotdf = altairDF(
    np.linspace(0, 2, num=100).tolist(),
    [
        lambda x: norm.pdf(x, 1, .3),
        lambda x: gamma.pdf(x, 1, scale=1 / 2),
        lambda x: gamma.pdf(x, 1.25),
    ],
    labels=["gaussian", "exponential", "gamma"],
    ycol="pdf",
    xcol="x",
)
alt.Chart(plotdf).mark_line().encode(x="x", y="pdf", color="color")

In [None]:
# Equal-weight average of a Normal(1.5, 0.3) density and a gamma density with
# shape 1.25, plotted on [0, 3].
plotdf = altairDF(
    np.linspace(0, 3, num=100).tolist(),
    [lambda x: (norm.pdf(x, loc=1.5, scale=.3) + gamma.pdf(x, 1.25)) / 2],
    labels=["mystery!"],
    ycol="pdf",
    xcol="x",
)
alt.Chart(plotdf).mark_line().encode(x="x", y="pdf", color="color")

In [None]:
# Shape parameters handed to the two gamma components.
a1, a2 = 1, 7
# Weights applied to the first and second component respectively.
w1, w2 = 0.5, 0.5

def mixsamp():
    """Draw one sample from a mixture of two gamma distributions.

    A single uniform variate picks the component: values up to and including
    ``w1`` select the gamma with shape ``a1``, anything larger selects the
    gamma with shape ``a2``. One draw from the chosen gamma is returned.
    """
    pick = scipy.stats.uniform.rvs(size=1)[0]
    shape = a1 if pick <= w1 else a2
    return scipy.stats.gamma.rvs(shape, size=1)[0]

def mixcdf(x):
    """Return the weighted sum of the two gamma CDFs evaluated at ``x``.

    The gamma CDF with shape ``a1`` is weighted by ``w1`` and the one with
    shape ``a2`` by ``w2``.
    """
    first_part = w1 * scipy.stats.gamma.cdf(x, a1)
    second_part = w2 * scipy.stats.gamma.cdf(x, a2)
    return first_part + second_part

# Sketch some data sampled from this distribution with a t-digest
from detail.tdigest import TDigest

# mixsamp() calls scipy.stats rvs without a random_state, so it draws from
# NumPy's global generator. Seeding it here makes the sampled data, and every
# plot derived from the sketch, reproducible under Restart & Run All.
np.random.seed(20180401)

sketch = TDigest(compression=0.1)
# Feed the samples through a generator rather than materialising a
# 100000-element list that is only iterated once.
for p in (mixsamp() for _ in range(100000)):
    sketch.update(p)



In [None]:
# Evaluate the sketch's CDF and the exact mixture CDF on the same grid: unit
# steps (np.arange's default) from sketch.cdfi(0) up to, but excluding,
# sketch.cdfi(1). The two series are stacked in one long frame and told apart
# by the "src" column.
xvals = np.arange(sketch.cdfi(0), sketch.cdfi(1)).tolist()
df = pd.DataFrame(
    {
        "x": xvals * 2,
        "cdf": list(map(sketch.cdf, xvals)) + list(map(mixcdf, xvals)),
        "src": ["tdigest"] * len(xvals) + ["cdf"] * len(xvals),
    }
)
alt.Chart(df).mark_line().encode(x="x", y="cdf", color="src")

In [None]:
# Exact mixture CDF on [0, 25], drawn as a short blue strip.
plotdf = altairDF(
    np.linspace(0, 25, num=100).tolist(),
    [mixcdf],
    ["weird cdf"],
    ycol="cdf",
)
(
    alt.Chart(plotdf)
    .mark_line(color="blue")
    .encode(x="x", y="cdf")
    .properties(height=100)
)

In [None]:
# CDF reported by the t-digest sketch on [0, 25], drawn as a short orange strip.
plotdf = altairDF(
    np.linspace(0, 25, num=100).tolist(),
    [sketch.cdf],
    ["sketch cdf"],
    ycol="cdf",
)
(
    alt.Chart(plotdf)
    .mark_line(color="darkorange")
    .encode(x="x", y="cdf")
    .properties(height=100)
)

In [None]:
# Sketch CDF and exact mixture CDF overlaid on [0, 25].
plotdf = altairDF(
    np.linspace(0, 25, num=100).tolist(),
    [sketch.cdf, mixcdf],
    ["2. tdigest sketch", "1. weird cdf"],
    ycol="cdf",
)
(
    alt.Chart(plotdf)
    .mark_line()
    .encode(x="x", y="cdf", color="color")
    .properties(height=350)
)

In [None]:
# 10000 independent draws from the two-component gamma mixture.
weirddata = [mixsamp() for _ in range(10000)]

In [None]:
# Fit a normal distribution to the mixture samples (fit returns loc, scale)
# and overlay its CDF on the exact mixture CDF.
p1, p2 = scipy.stats.norm.fit(weirddata)
plotdf = altairDF(
    np.linspace(0, 25, num=100).tolist(),
    [mixcdf, lambda x: scipy.stats.norm.cdf(x, loc=p1, scale=p2)],
    ["1. weird cdf", "2. gaussian sketch"],
    ycol="cdf",
)
(
    alt.Chart(plotdf)
    .mark_line()
    .encode(x="x", y="cdf", color="color")
    .properties(height=350)
)

In [None]:
# Fit a single gamma distribution to the mixture samples (fit returns shape,
# loc, scale) and overlay its CDF on the exact mixture CDF.
p1, p2, p3 = scipy.stats.gamma.fit(weirddata)
plotdf = altairDF(
    np.linspace(0, 25, num=100).tolist(),
    [mixcdf, lambda x: scipy.stats.gamma.cdf(x, p1, loc=p2, scale=p3)],
    ["1. weird cdf", "2. gamma sketch"],
    ycol="cdf",
)
(
    alt.Chart(plotdf)
    .mark_line()
    .encode(x="x", y="cdf", color="color")
    .properties(height=350)
)

In [None]:
# Gaussian kernel density estimate of the mixture samples. Its CDF at x is
# approximated by integrating the estimate from -10 up to x, and is overlaid
# on the exact mixture CDF over [-2, 25].
kdesketch = scipy.stats.gaussian_kde(weirddata)
plotdf = altairDF(
    np.linspace(-2, 25, num=100).tolist(),
    [mixcdf, lambda x: kdesketch.integrate_box_1d(-10, x)],
    ["1. weird cdf", "2. kde sketch"],
    ycol="cdf",
)
(
    alt.Chart(plotdf)
    .mark_line()
    .encode(x="x", y="cdf", color="color")
    .properties(height=350)
)

In [None]:
# Exact mixture CDF on [0, 25] at full height, drawn in blue.
plotdf = altairDF(
    np.linspace(0, 25, num=100).tolist(),
    [mixcdf],
    ["weird cdf"],
    ycol="cdf",
)
(
    alt.Chart(plotdf)
    .mark_line(color="blue")
    .encode(x="x", y="cdf")
    .properties(height=350)
)

In [None]:
# Four draws from the standard normal distribution.
norm.rvs(size=4)

In [None]:
# Largest of one million standard-normal draws. ndarray.max() reduces the
# array in compiled code, whereas the builtin max() iterates over a million
# NumPy scalars one at a time in Python.
scipy.stats.norm.rvs(size=1000000).max()

In [None]:
# Normal(0, 1) density over the wider interval [-5, 5].
plotdf = altairDF(
    np.linspace(-5, 5, num=100).tolist(),
    [lambda x: norm.pdf(x, loc=0, scale=1)],
    labels=["gaussian"],
    ycol="pdf",
)
(
    alt.Chart(plotdf)
    .mark_line()
    .encode(x="x", y="pdf", color="color")
    .properties(width=600, height=200)
)

In [None]:
def _normal_cdf_to_power(exponent):
    """Return a function mapping x to norm.cdf(x, 0, 1) raised to ``exponent``."""
    return lambda x: math.pow(norm.cdf(x, 0, 1), exponent)


# Normal(0, 1) CDF next to the same CDF raised to the powers 10, 1000 and
# 1000000 (labelled "EVD ss=..."), with a black vertical rule at x = 5.
plotdf = altairDF(
    np.linspace(-3, 6, num=100).tolist(),
    [lambda x: norm.cdf(x, 0, 1)]
    + [_normal_cdf_to_power(size) for size in (10, 1000, 1000000)],
    labels=["CDF", "EVD ss=10", "EVD ss=1000", "EVD ss=1000000"],
    ycol="cdf",
)
base = (
    alt.Chart()
    .mark_line()
    .encode(x="x", y="cdf", color="color")
    .properties(width=600, height=200)
)
rule1 = alt.Chart().mark_rule(color="black").encode(x="a1:Q")
(
    alt.layer(base, rule1, data=plotdf)
    .transform_calculate(a1="5")
    .properties(width=600, height=200)
)