In [None]:
# Course library; the star import is what brings Table and make_array into scope below.
from datascience import *
import numpy as np
import warnings
# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings("ignore")

# Render figures inline, directly beneath the cell that produces them.
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
# Draw an edge around every patch so neighbouring bars are visually separated.
plots.rcParams["patch.force_edgecolor"] = True

## Review
- Conducted at https://pollev.com/jeremysanchez during class time

## New material

### Every variable has a distribution

In [None]:
# Read the movie data from a CSV file (path is relative to the notebook) and preview six rows.
top_movies = Table.read_table('top_movies_2017.csv')
top_movies.show(6)

In [None]:
# One row per distinct studio, with a 'count' column of how many movies it has in the table.
studio_distribution = top_movies.group('Studio')

In [None]:
# Preview the first six studios together with their counts.
studio_distribution.show(6)

### Visualizing distributions

#### **Task:** Visualize the distribution of studios responsible for the highest grossing movies as of 2017.

In [None]:
# Horizontal bar chart with one bar per studio, drawn in the table's current row order.
studio_distribution.barh('Studio')

In [None]:
# Rank the studios from most to fewest movies before plotting, so the
# dominant studios appear together at the top of the chart.
(
    studio_distribution
    .sort('count', descending=True)
    .barh('Studio')
)
print("Five studios are largely responsible for the highest grossing movies")

_____

### Use binning for numerical distributions

#### **Task**: Visualize the distribution of how long the highest grossing movies as of 2017 have been out (in years, measured as of 2022).

In [None]:
# Ages are measured relative to 2022, not the 2017 in the dataset's file name.
# The year is pinned to a named constant (rather than left as a bare literal or
# read from the clock) so the computed ages are explicit and identical on every re-run.
REFERENCE_YEAR = 2022
ages = REFERENCE_YEAR - top_movies.column('Year')

In [None]:
# Attach the ages as an 'Age' column; with_column returns a table, so the name is rebound to it.
top_movies = top_movies.with_column('Age', ages)

In [None]:
# Preview just the two columns of interest for the first six movies.
(
    top_movies
    .select('Title', 'Age')
    .show(6)
)

In [None]:
# Smallest and largest age: the range that any choice of bins has to cover.
(ages.min(), ages.max())

- If you want to make equally sized bins, `np.arange()` is a great tool to help you.

In [None]:
# Equal-width, 10-year bins. np.arange excludes its `stop` value, so the original
# np.arange(0, 110, 10) ended at 100 and would silently leave out any movie older
# than 100 years. Deriving the stop from the data guarantees the last bin edge is
# at least as large as the oldest age, so every movie is drawn.
top_movies.hist('Age',
                bins = np.arange(0, max(top_movies.column('Age')) + 10, 10),
                unit = 'Year')

- Otherwise, you can choose your own bins, and they do not need to be equally wide. The bins below are simply the ones we chose for this example.


In [None]:
# Hand-picked bin edges in years; each consecutive pair of values delimits one bin.
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 101)

In [None]:
# Count how many ages fall in each custom bin. Later cells rely on the resulting
# 'bin' and 'Age count' columns.
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

**Note:** The last row of the table (bin $101$) has a count of $0$.
 - This is because $101$ is only the right endpoint of the last bin (the one that starts at $65$). It is not the left endpoint of any bin.

### Introducing the histogram

In [None]:
# Histogram of ages over the hand-picked, unequal-width bins, with the unit labelled as years.
top_movies.hist('Age', bins = my_bins, unit = 'Year')

#### **Discussion Question (1 min)**: Compare the bins $[25, 40)$ and $[40, 65)$. 

- Which one has more movies?
- Which one is more crowded?

____

#### **Task**: Find the height of the $[40,65)$ bin in the histogram above.

$$\text{height} = \frac{\text{percent}}{\text{width}}$$

Add a column containing the percent of movies that fall in each bin (this is the **area** of that bin's bar).

In [None]:
# Percent of all movies that fall in each bin, i.e. the area of that bin's bar.
# The denominator is the table's actual row count instead of a hard-coded 200,
# so the percentages stay correct if the dataset is ever updated.
binned_data = binned_data.with_column(
    'Percent',
    100 * binned_data.column('Age count') / top_movies.num_rows)

In [None]:
# Preview the first three bins, now including the 'Percent' column.
binned_data.show(3)

In [None]:
# Pull out the single 'Percent' value for the bin whose left endpoint is 40.
percent = (
    binned_data
    .where('bin', are.equal_to(40))
    .column('Percent')
    .item(0)
)

In [None]:
# The bin runs from 40 to 65, so it is 25 years wide.
width = 65-40
# height = area / width: percent of movies per year within this bin.
height = percent / width

In [None]:
# Display the computed height (percent per year) of the bin from 40 to 65.
height

#### **Task**: Find the heights of the (rest of the) bins.

$$\text{height} = \frac{\text{percent}}{\text{width}}$$

In [None]:
# Keep every row except the final one: that row's 'bin' value is only the right
# endpoint of the last bin, so it has no width or height of its own.
height_table = binned_data.take(np.arange(binned_data.num_rows)[:-1])
height_table

In [None]:
# Width of each bin = next edge minus this edge, which yields one fewer value
# than there are edges.
bin_widths = binned_data.column('bin')[1:] - binned_data.column('bin')[:-1]

In [None]:
# Display the widths; there is one per bin, i.e. one fewer than the number of bin edges.
bin_widths

In [None]:
# Attach each bin's width; the widths array has exactly one entry per remaining row.
height_table = height_table.with_column('Width', bin_widths)
height_table

In [None]:
# Apply height = percent / width to every bin at once via element-wise array division.
height_table = height_table.with_column(
    'Height',
    height_table.column('Percent') / height_table.column('Width'),
)

In [None]:
# Display the finished table: count, percent, width and height for every bin.
height_table

To check our work one last time:

In [None]:
# Redraw the histogram over the same custom bins so the computed heights can be
# compared against the bars.
top_movies.hist('Age', bins = my_bins, unit = 'Year')