In [139]:
import datetime
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from ipywidgets import widgets
import chart_studio.plotly as py

### Generating data for sin wave

First we will generate some data to plot the sin wave

x vector (x axis) = 100 points from 0 to 1 

t vector (y axis) = apply sin(2 * pi * x) to the previous generated points

It is important to think the t vector as a function broadcasted on the x vector

In [140]:
#Example of broadcasting
x = np.matrix([1,2,3]) # shape (1,3)
t = 2*x                # Broadcast multiplication of 2 on vector x
t

matrix([[2, 4, 6]])

In [145]:
# Create random data with numpy
np.random.seed(1)
N = 10
x = np.linspace(0, 1, N)
t = np.sin(2*np.pi*x)
x,t

(array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
        0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ]),
 array([ 0.00000000e+00,  6.42787610e-01,  9.84807753e-01,  8.66025404e-01,
         3.42020143e-01, -3.42020143e-01, -8.66025404e-01, -9.84807753e-01,
        -6.42787610e-01, -2.44929360e-16]))

### Plotting sin wave in plotly

Now that we have some data we can plot it!!!

I went to a few different tutorials to find out how to plot what I want.

The first place you should look is here https://plot.ly/python/line-charts/ which gives the basics on making plotly graph objects so we can add cool widgets in later. Basically what we are doing is making a scatter plot of the 100 points then putting it in line mode so it looks smooth. Changing color was a little more difficult but this page https://plot.ly/python/marker-style/ gives good examples

In [146]:
sinwave = go.Scatter(x=x, y=t,
                    mode='lines',
                    name='sinwave',
                    marker=dict(
                        color='green',
                        size=20,
                        line=dict(color='green',width=2)
                            )
                        )

fig.add_trace(sinwave)
data = [sinwave]

py.iplot(data, filename = 'SinSample', auto_open=False)

### Sampling points from the Sin wave with Gaussian noise

Now we want to sample points from our sin wave but add some Gaussian/Normal noise to it. This is important because everything you measure in the real world has some noise or error to it.

As we move along the x axis, we want to use the value of t in our sin wave as the mean for the normal distribution function.


This is a great introduction to histograms

https://help.plot.ly/histogram/

You can get more information on the normal distribution here

https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.random.normal.html

### Example of normal distribution from one value of t

We will start off simple and take one value of t as input for mean of the normal distribution. 

Lets take a value near the middle. 

The standard deviation (sigma) is set to 0.1 

We will sample 1000 points

In [158]:
#If you look at the plot above the sin wave is around 0 half way through
middle = int((N - 1)/2)
mid_x = x[middle] # Half of points. 
mid_t = t[middle] # Half of points. 
mid_t

0.3420201433256689

In [148]:
G_N = 1000
mu, sigma = mid_t, 0.1 # mean and standard deviation

G = np.random.normal(mu, sigma, G_N)

In [149]:
t_hist = go.Histogram(x=G,
                      name='t_hist')

data_t =[t_hist]

fig = go.Figure(data=data_t)
py.iplot(data_t, filename = 't_hist', auto_open=False)

### Plotting the normal curve over the normal histogram

We will now do something a little more complicated... 

lets calculate the amount in the bins by hand and plot these as a curve over the histogram. 

This is called the normal curve

In [159]:
bins = 20
G_min = np.min(G) # Smallest value of gauss
G_max = np.max(G) # Get max  value of gauss
G_range = G_max - G_min        # Gaussian range

bin_length = G_range / bins     # Get length of bins
G_min

0.0366437052830384

In [160]:
b = 0
b_start = G_min
b_stop  = G_min + bin_length

bin_counts = []
bin_starts = []
while b < bins:                    # Loop over bins
    b_count = 0
    for g in G:                    # Loop over gaussian points
        if b_start <= g < b_stop:
            b_count += 1
            
    bin_counts.append(b_count)
    bin_starts.append(b_start)
    b_start = b_start + bin_length
    b_stop  = b_stop + bin_length
    b += 1
    

In [161]:
bin_counts

[4, 6, 15, 22, 41, 62, 102, 121, 153, 133, 122, 102, 54, 36, 18, 6, 0, 1, 1, 0]

### Plotting the normal curve over the histogram

Below you will see the histogram function modified by our calculated bin_length.

As well as the count of our 20 bins as dots on the normal curve. 

Our bin amounts don't match up exactly with the plotly plot but it is enough to get the point across.

The take home is that when we sample from a normal distribution we are more likely to get values near the middle because they have higher counts.

In [162]:
t_hist = go.Histogram(x=G,
                      name='t_hist',
                      xbins=dict(
                        start=G_min,
                        end=G_max,
                        size=bin_length
                    ))

normal_curve = go.Scatter(x=bin_starts, y=bin_counts,
                    mode='lines+markers',
                    name='normal curve',
                    marker=dict(
                        color='purple',
                        size=20,
                        line=dict(color='red',width=2)
                            )
                        )


data_t_normal_curve =[t_hist, normal_curve]

# fig = go.Figure(data=data_t_normal_curve)
py.iplot(data_t_normal_curve, filename = 't_hist_normalcurve', auto_open=False)

### Plot of gaussian noise on middle value of Sin wave

We will now plot the gaussian noise on the sin wave. Essentially the plot above is rotated to the right

In [191]:
# Making a vector of our middle value. just for plotting
mid_x_vector = mid_x * np.ones(G_N)

# Now we can add in the points for the normal curve that we had above
# I am dividing each count by the max count and then dividing
# that by 10 to make it look good on the plot
bin_counts_normalized = (bin_counts/np.max(bin_counts)) / 10

#Now lets add that to mid_x so we will see the distribution in the correct place on x axis
mid_x_bin_counts_normalized = (mid_x * np.ones(bins)) + bin_counts_normalized

In [192]:
sinwave = go.Scatter(x=x, y=t,
                    mode='lines',
                    name='sinwave',
                    marker=dict(
                        color='green',
                        size=20,
                        line=dict(color='green',width=2)
                            )
                        )



normal_dist = go.Scatter(x=mid_x_vector, y=G,
                    mode='markers',
                    name='normal dist',
                    opacity=0.50,
                    marker=dict(
                        color='blue',
                        size=2,
                        line=dict(color='red',width=2)
                            )
                        )

normal_curve = go.Scatter(x=mid_x_bin_counts_normalized, y=bin_starts,
                    mode='lines+markers',
                    name='normal curve (20 bins)',
                    marker=dict(
                        color='purple',
                        size=10,
                        line=dict(color='red',width=2)
                            )
                        )



data = [sinwave, normal_dist, normal_curve]

py.iplot(data, filename = 'SinSample', auto_open=False)

In [10]:
df = pd.read_csv(
    'https://raw.githubusercontent.com/yankev/testing/master/datasets/nycflights.csv')
df = df.drop(df.columns[[0]], axis=1)

In [11]:
month = widgets.IntSlider(
    value=1.0,
    min=1.0,
    max=12.0,
    step=1.0,
    description='Month:',
    continuous_update=False
)

use_date = widgets.Checkbox(
    description='Date: ',
    value=True,
)

container = widgets.HBox(children=[use_date, month])

textbox = widgets.Dropdown(
    description='Airline:   ',
    value='DL',
    options=df['carrier'].unique().tolist()
)

origin = widgets.Dropdown(
    options=list(df['origin'].unique()),
    value='LGA',
    description='Origin Airport:',
)


# Assign an empty figure widget with two traces
trace1 = go.Histogram(x=df['arr_delay'], opacity=0.75, name='Arrival Delays')
trace2 = go.Histogram(x=df['dep_delay'], opacity=0.75, name='Departure Delays')
g = go.FigureWidget(data=[trace1, trace2],
                    layout=go.Layout(
                        title=dict(
                            text='NYC FlightDatabase'
                        ),
                        barmode='overlay'
                    ))

In [12]:
def validate():
    if origin.value in df['origin'].unique() and textbox.value in df['carrier'].unique():
        return True
    else:
        return False


def response(change):
    if validate():
        if use_date.value:
            filter_list = [i and j and k for i, j, k in
                           zip(df['month'] == month.value, df['carrier'] == textbox.value,
                               df['origin'] == origin.value)]
            temp_df = df[filter_list]

        else:
            filter_list = [i and j for i, j in
                           zip(df['carrier'] == 'DL', df['origin'] == origin.value)]
            temp_df = df[filter_list]
        x1 = temp_df['arr_delay']
        x2 = temp_df['dep_delay']
        with g.batch_update():
            g.data[0].x = x1
            g.data[1].x = x2
            g.layout.barmode = 'overlay'
            g.layout.xaxis.title = 'Delay in Minutes'
            g.layout.yaxis.title = 'Number of Delays'


origin.observe(response, names="value")
textbox.observe(response, names="value")
month.observe(response, names="value")
use_date.observe(response, names="value")

In [13]:


container2 = widgets.HBox([origin, textbox])
widgets.VBox([container,
              container2,
              g])



VBox(children=(HBox(children=(Checkbox(value=True, description='Date: '), IntSlider(value=1, continuous_update…