In [None]:
# SETUP
from datascience import *
from prob140 import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import pylab
import math
from scipy import stats
from scipy import misc
from client.api.assignment import load_assignment
autograder = load_assignment('main.ok')

In [None]:
def sampleFromUniform():
    """Draw one number from numpy's uniform generator, echo it, and return it.

    The draw is printed so the value that feeds a plot is visible in the
    cell output next to the figure.
    """
    draw = np.random.rand()
    print("Sampled:", draw)
    return draw

def search(x_limits, cdf, u, tol=0.01, max_iter=200):
    """Bisect ``x_limits`` for a point whose cdf value is within ``tol`` of ``u``.

    Parameters
    ----------
    x_limits : pair (low, high)
        Interval to search.
    cdf : callable
        Function of one number; the bisection treats it as non-decreasing.
    u : number
        Target cdf value.
    tol : float, optional
        Accept a midpoint ``m`` as soon as ``|u - cdf(m)| < tol``.
    max_iter : int, optional
        Upper bound on the number of halvings.

    Returns
    -------
    The first midpoint that meets the tolerance. If no point of the interval
    can meet it (for example ``u`` lies above the cdf at the right end), the
    last midpoint examined is returned, which sits at the end of the interval
    closest to the target, instead of recursing without end.
    """
    lo, hi = x_limits
    mid = (lo + hi) / 2
    for _ in range(max_iter):
        mid = (lo + hi) / 2
        diff = u - cdf(mid)
        if np.abs(diff) < tol:
            return mid
        # The interval can no longer be split in floating point: stop here.
        if mid == lo or mid == hi:
            break
        if diff < 0:
            hi = mid
        else:
            lo = mid
    return mid

def plotAxes(cdfTable):
    """Draw the unit interval as a vertical line marked at 0 and at each cdf value.

    The cdf values are read from the last column of ``cdfTable``.
    """
    cdf_values = list(cdfTable.column(cdfTable.num_columns - 1))
    marks = [0] + cdf_values

    # Only the vertical axis carries information here, so hide the x-axis.
    plt.gca().axes.get_xaxis().set_visible(False)
    plt.yticks(marks)
    plt.ylim(-0.1, 1.1)
    plt.xlim(-0.02, 1)

    # The unit interval itself, with a dot at every marked height.
    plt.plot([0, 0], [0, 1], color="k", lw=3)
    plt.scatter([0] * len(marks), marks, s=55, color="k")

def plotDiscreteCDF(cdfTable, u=None):
    """Plot the step-function cdf described by a distribution table.

    Parameters
    ----------
    cdfTable : table exposing ``column(i)`` and ``num_columns``
        Column 0 is read as the possible values and the last column as the
        cdf evaluated at those values.
    u : number, optional
        A height on the vertical axis. When given, red arrows run across from
        ``u`` to the graph and then down to the first possible value whose
        cdf is at least ``u``.
    """
    values = cdfTable.column(0)
    # Pad on the left with a point 2 units below the smallest value, where
    # the cdf is 0, so the graph starts with a flat segment at height 0.
    values = np.append(values[0] - 2, values)

    cum = cdfTable.column(cdfTable.num_columns - 1)
    cum = np.append(0, cum)

    for i in range(len(values) - 1):
        # Flat piece up to the next value, then the dashed jump at that value.
        plt.plot([values[i], values[i+1]], [cum[i], cum[i]], color="darkblue")
        plt.plot([values[i+1], values[i+1]], [cum[i], cum[i+1]], ls="--", color="darkblue")

    plt.scatter(values, cum, s=50, color="darkblue")

    # Flat segment at height 1 to the right of the largest value.
    plt.plot([values[-1], values[-1] + 2], [1, 1], color="darkblue")

    plt.xlim(values[0], values[-1] + 2)
    plt.ylim(-0.1, 1.1)
    plt.xlabel('$x$')
    plt.ylabel('CDF at $x$')
    plt.title('Graph of CDF')

    if u is not None:
        # Look only at the real values (skip the padding point at position 0)
        # so that u = 0 maps to the smallest value rather than to the padding.
        reached = np.flatnonzero(u <= cum[1:])
        # If u exceeds every cdf entry (e.g. the final cumulative sum is a
        # hair below 1 from rounding), fall back to the largest value.
        pos = reached[0] + 1 if len(reached) else len(values) - 1
        index = values[pos]
        height = u

        # Horizontal arrow from the left edge to the graph, then a vertical
        # arrow from the graph down to the horizontal axis.
        plt.plot([values[0], (index+values[0])/2], [height, height], marker='>', color='red', lw=1)
        plt.plot([(index+values[0])/2, index], [height, height], color='red', lw=1)
        plt.plot([index, index], [height, height/2], marker="v", color="red", lw=1)
        plt.plot([index, index], [0, height/2], color="red", lw=1)

def plotContinuousCDF(x_limits, cdf, u=None):
    """Plot a continuous cdf over an interval, optionally tracing a height ``u``.

    Parameters
    ----------
    x_limits : pair (low, high)
        Interval of x over which the cdf is drawn.
    cdf : callable
        Function that is applied to an array of x values and to single
        numbers, returning the cumulative probability.
    u : number, optional
        A height on the vertical axis. When given and attained by the cdf
        somewhere on ``x_limits``, red arrows run across from ``u`` to the
        curve and then down to the corresponding x. A height the cdf does not
        reach on the interval is skipped, because the bisection in ``search``
        has no point to converge to.
    """
    x = np.linspace(*x_limits, 100)
    plt.plot(x, cdf(x), color="darkblue")
    plt.xlabel('$x$')
    plt.ylabel('CDF at $x$')
    plt.title('Graph of CDF')

    if u is not None and cdf(x_limits[0]) <= u <= cdf(x_limits[1]):
        index = search(x_limits, cdf, u)
        height = u

        # Horizontal arrow from the left edge to the curve, then a vertical
        # arrow from the curve down to the horizontal axis.
        plt.plot([x_limits[0], (index+x_limits[0])/2], [height, height], marker='>', color='red', lw=1)
        plt.plot([(index+x_limits[0])/2, index], [height, height], color='red', lw=1)
        plt.plot([index, index], [height, height/2], marker="v", color="red", lw=1)
        plt.plot([index, index], [0, height/2], color="red", lw=1)

    plt.xlim(*x_limits)
        
import ipywidgets as widgets

from ipywidgets import interact
from IPython.display import display



def unit_interval_to_discrete(cdfTable):
    """Show the discrete cdf plot with a slider controlling the height ``u``.

    ``cdfTable`` is passed through to ``plotDiscreteCDF``; the slider runs
    from 0 to 1 in steps of 0.02 and starts at 0.5.
    """
    def plot(u):
        plotDiscreteCDF(cdfTable, u)

    slider = widgets.FloatSlider(value=0.5, min=0, max=1, step=0.02, description='u')
    interact(plot, u=slider)


def unit_interval_to_continuous(x_limits, cdf):
    """Show the continuous cdf plot with a slider controlling the height ``u``.

    Parameters
    ----------
    x_limits : pair (low, high)
        Interval of x over which the cdf is drawn.
    cdf : callable
        The cdf to plot and invert.

    The arrows are drawn only when the slider value is a height the cdf
    attains on ``x_limits``, that is, when it lies between the cdf at the left
    end and the cdf at the right end. Other slider values show the bare curve.
    """
    uniform_slider2 = widgets.FloatSlider(value=0.5, min=0, max=1, step=0.02, description='u')

    @interact(u=uniform_slider2)
    def plot(u):
        # Compare u with probabilities (cdf at the x-limits), not with the
        # x-limits themselves: u lives on the vertical axis.
        if cdf(x_limits[0]) <= u <= cdf(x_limits[1]):
            plotContinuousCDF(x_limits, cdf, u)
        else:
            plotContinuousCDF(x_limits, cdf)

# Lab 9 #
Simulation is important for understanding and developing techniques that help us analyze data. Earlier in the term you saw `simulate_path` for simulating Markov Chains; Data 8 uses `proportions_from_distribution` to simulate multinomial variables; and so on.

In this lab we will see how to simulate random variables with a specified distribution. 

### Part 1. The Basic Idea ###
We will develop the method by considering examples of increasing complexity. 

Our starting point is a distribution on just four values. 

Suppose $X$ has the distribution displayed in the table `distribution`. 

In [None]:
# Possible values of X, listed in increasing order.
values = make_array(-2, 1, 4, 7)
# Probability attached to the corresponding entry of `values`.
prob = make_array(0.3, 0.1, 0.2,0.4)
# Running total of `prob`: the cdf of X at each possible value.
cdf = np.cumsum(prob)

In [None]:
# Distribution table for X. "CDF" is deliberately the last column:
# plotAxes and plotDiscreteCDF read the final column as the cdf.
distribution = Table().with_columns(
    "Value", values,
    "Probability", prob,
    "CDF", cdf)
distribution

Our initial goal is to generate one value of $X$.

### a) ###
The graphic below shows the four `CDF` values on the unit interval.

Imagine a number picked uniformly at random from the unit interval. That is, let $U$ be a random variable that has the uniform distribution on $(0, 1)$, and suppose you mark the value of $U$ on the unit interval shown in the graph.

In [None]:
plotAxes(distribution)

Find:

(i) $P(U < 0.3)$

(ii) $P(0.3 < U < 0.4)$

(iii) $P(0.4 < U < 0.6)$

(iv) $P(0.6 < U < 1)$


*Provide your answer and reasoning in this Markdown cell.*

### b) ###
Starting with a uniform $(0, 1)$ random variable $U$, propose a method of generating a value of $X$. 

Your method should take $U$ as its input and return one of the four possible values as output, in such a way that for each $i = -2, 1, 4, 7$, the chance of returning the value $i$ is $P(X = i)$.

Just describe your method in words. No formula or code is needed.

*Provide your answer and reasoning in this Markdown cell.*

### Part 2. Visualizing the Method ###
The method `plotDiscreteCDF` takes as its argument a distribution table that has a `CDF` column as the last column, and plots the cdf.

Run the cell below to get a graph of the cdf of the random variable $X$ in Part 1.

In [None]:
plotDiscreteCDF(distribution)

### a) ###
Let $F_X$ be the cdf of $X$. What is the value of $F_X(1.27)$, and is that the value the graph shows?

*Provide your answer and reasoning in this Markdown cell.*

### b) ###
At what points $x$ does the cdf have a jump? For each point $x$ at which there is a jump, find the size of the jump in terms of the distribution of $X$.

*Provide your answer and reasoning in this Markdown cell.*

### c) ###
Look at the graph above and look also at the graphic depicting the unit interval in Part 1. Which column or columns of `distribution` were needed to draw the graphic in Part 1, and which were needed to draw the graph of the cdf above?

*Provide your answer and reasoning in this Markdown cell.*

### d) ###
Run the cell below. The function `unit_interval_to_discrete` takes a distribution table as its argument (again with the cdf as the final column) and displays an animation of a method that takes a number on the unit interval and returns one of the values of $X$. How is the method that it is using related to the one you proposed in Part 1?

In [None]:
unit_interval_to_discrete(distribution)

*Provide your answer and reasoning in this Markdown cell.*

### e) ###
The method `plotDiscreteCDF` also takes a second argument which is a number between 0 and 1. You can generate that number uniformly at random by using `sampleFromUniform()`.

Run the cell below a few times. How is it related to the method you proposed in Part 1 for generating a value of $X$?

In [None]:
plotDiscreteCDF(distribution, sampleFromUniform())

### Part 3. Extension to Continuous Distributions ###
Now suppose you want to generate a random variable $X$ that has a specified continuous distribution, and that you are given a uniform $(0, 1)$ random number to start with. 

### a) ###
The function `plotContinuousCDF` plots the cdf of a continuous variable. Its first two arguments are:
- an interval (a, b) over which to draw the cdf
- a cdf function that takes in a value and returns the cumulative probability at that value

Here is the cdf of the exponential distribution with $\lambda = 0.5$ and hence expectation = SD = 2.

In [None]:
lamb = 0.5

def expon_cdf(x):
    """Return 1 - exp(-lamb * x), the exponential cdf with rate ``lamb`` at ``x``.

    ``lamb`` is read from the enclosing module each time the function is called.
    """
    survival = np.exp(-lamb * x)
    return 1 - survival

plotContinuousCDF((0, 8), expon_cdf)

Suppose you are given one uniform $(0, 1)$ random number and are asked to generate a random variable $T$ that has the exponential distribution shown above. Based on Parts 1 and 2, propose a method for doing this.

In this part of the lab you don't have to prove that the method works. Just propose it.

*Provide your answer and reasoning in this Markdown cell.*

### b) ###
Run the cell below. The animation is analogous to the one in Part 2: it takes a plotting interval and a continuous cdf function as its arguments, and demonstrates a method for picking a number on the positive real line starting with a value on the unit interval that is the vertical axis. How is its method related to the one you proposed in (a)?

In [None]:
unit_interval_to_continuous((0, 8), expon_cdf)

#SOLUTION

### c) ###
The method `plotContinuousCDF` takes a third argument which is a number between 0 and 1. As before, you can generate that number uniformly at random by using `sampleFromUniform()`.

Run the cell below a few times. How is it related to the method you proposed in (a) for generating a value of a random variable $T$ that has the displayed exponential distribution?

In [None]:
plotContinuousCDF((0, 8), expon_cdf, sampleFromUniform())

### d) ###
Now suppose you have a uniform $(0, 1)$ random number and want to generate a value of a random variable $Z$ that has the standard normal distribution. Demonstrate a method for doing this by making appropriate changes to the code in (c).

### Part 4. Proof that the Method Works ###
Suppose you are trying to generate a random variable that has a continuous, strictly increasing cdf $F$, and suppose that you will use the method you proposed in Part 3. In this part of the lab you will describe that method mathematically and prove that it works.

The method:
- Start with a random variable $U$ that has the uniform $(0, 1)$ distribution.
- Return the value $g(U)$ for a function $g$ that you should define based on the target cdf $F$ and your proposal in Part 3.
- The claim that you have to prove is that the random variable $X = g(U)$ has cdf $F$.

### a) ###
In terms of $F$, what is your function $g$?

*Provide your answer and reasoning in this Markdown cell.*

### b) ###
For any number $\alpha$ between 0 and 1, what is $P(U \le \alpha)$?

*Provide your answer and reasoning in this Markdown cell.*

### c) ###
Let $F_X$ be the cdf of $X$. Show that $F_X = F$, the target cdf.

We'll start you off on the proof. Use as many more lines as you need. Your final expression should be $F(x)$.

\begin{align*}
F_X(x) &= P(X \le x) \\
&= P(g(U) \le x) \\
&= \ldots \\
&= \ldots \\
&= F(x)
\end{align*}

*Provide your answer and reasoning in this Markdown cell.*

### d) ###
Let $U$ be uniform on $(0, 1)$. Find a function $g$ such that the random variable $g(U)$ has the exponential distribution with parameter $\lambda$. Double check your answer by applying the change of variable formula for densities.

*Provide your answer and reasoning in this Markdown cell.*

### e) ###
Let $U$ be uniform on $(0, 1)$. Let $\Phi$ be the standard normal cdf. Find a function $g$ in terms of $\Phi$ so that the random variable $g(U)$ has the standard normal distribution. You don't have to double check this one by using the change of variable formula.

*Provide your answer and reasoning in this Markdown cell.*

### f) ###
Let $U$ be uniform on $(0, 1)$. Find a function $g$ such that the random variable $g(U)$ has the same distribution as the radius of a point picked at random from the unit disc.

*Provide your answer and reasoning in this Markdown cell.*

### Part 5. Empirical Verification that the Method Works ###

### a) ###
Create a table that is called `sim` for simulation and consists of one column called `uniform` that contains 100,000 uniform $(0, 1)$ random numbers. The call `stats.uniform.rvs(size=100000)` will generate an array of the random numbers.

In [None]:
# Number of uniform (0, 1) random numbers to simulate.
N = 100000
# To be completed: replace ... with an array of N uniform (0, 1) random numbers.
u = ...
# One-column table of the draws; the column label is the lowercase "uniform".
sim = Table().with_column("uniform", u)

### b) ###
Augment `sim` with three columns, each a transformation of the column `uniform`.
- `Exponential 1/2`: Transform `uniform` by the function $g$ in your answer to Part 4(d) in the case $\lambda = 1/2$.
- `Standard Normal`: Transform `uniform` by the function $g$ in your answer to Part 4(e).
- `radius`: Transform `uniform` by the function $g$ in your answer to Part 4(f).

In [None]:

# To be completed: the function g from Part 4(d) with lambda = 1/2.
def g_exp(y):
    return ...

# To be completed: the function g from Part 4(e).
def g_norm(y):
    return ...

# To be completed: the function g from Part 4(f).
def g_rad(y):
    return ...
    
# To be completed: `sim` augmented with the three transformed columns.
sim = ...

### c) ###
Draw a histogram of each of the four columns of `sim`. Does each one follow the distribution it is supposed to? Answer this question with attention to detail, such as histogram balance points, min, max, height, and so on. You don't need to use all of the measures in all cases, but try to avoid rough answers like, "It looks kind of exponential, so the method works."

In [None]:

#SOLUTIONONLY
sim.select("radius").hist(bins=50)

Since uniform $(0, 1)$ random numbers are central to all simulations, their quality is very important for the accuracy and reliability of simulations. Testing and assessing uniform random number generators is serious business, because random number generators don't really produce random numbers. They follow deterministic processes that produce results that have properties that resemble those of random numbers. That is why they are called Pseudo Random Number Generators or PRNGs. [Python uses the Mersenne Twister](https://docs.python.org/3.6/library/random.html), one of the most tested and reliable PRNGs. SciPy uses the [Mersenne Twister for RandomState](https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.RandomState.html#numpy.random.RandomState) and draws from a large number of discrete and continuous distributions. Take a look at the list on the RandomState page and see how many you can recognize.

In [None]:
_ = autograder.grade('q1')

In [None]:
# For your convenience, you can run this cell to run all the tests at once!
# Only files named q*.py in tests/ are treated as tests, and they are graded
# in sorted order, so stray files cannot yield bogus question names and the
# order of the output is the same on every run.
import os
_ = [
    autograder.grade(os.path.splitext(test_file)[0])
    for test_file in sorted(os.listdir("tests"))
    if test_file.startswith('q') and test_file.endswith('.py')
]

In [None]:
import gsExport
gsExport.generateSubmission()