In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Categorical Distributions

## Bar Charts ##

In [None]:
# Highest grossing movies as of 2017
# From http://www.boxofficemojo.com/alltime/adjusted.htm
# Load the CSV (expected next to this notebook) into a datascience Table
# and display it as the cell's output.
top_movies = Table.read_table('top_movies_2017.csv')
top_movies

In [None]:
# Express the adjusted gross in millions of dollars (3 decimals) so the
# numbers are easier to read, then remove the two raw dollar columns.
# Note: this cell reassigns top_movies and drops the column it reads, so it
# can only be run once per load of the data; re-run the loading cell first.
adjusted_gross = top_movies.column('Gross (Adjusted)')
millions = np.round(adjusted_gross / 1000000, 3)
with_millions = top_movies.with_column('Millions', millions)
top_movies = with_millions.drop('Gross', 'Gross (Adjusted)')
top_movies

In [None]:
# Keep only the top 30 movies: order all rows by 'Millions' from largest to
# smallest, then take the first 30 row positions (0 through 29).
indices = np.arange(30)
ranked_by_millions = top_movies.sort('Millions', descending=True)
top_30 = ranked_by_millions.take(indices)
# show() with no argument displays every row of the table.
top_30.show()

In [None]:
# TODO: Let's try plotting Year versus Millions for the top 30 movies. Does this make sense?
# NOTE(review): the original prompt asked for a line plot, but the call below
# draws a scatter plot (one point per movie) -- confirm which was intended.
top_30.scatter('Year', 'Millions')

In [None]:
# TODO: Let's try barh instead
# to look at a categorical variable (the movie's Title) versus a numerical
# variable (Millions): one horizontal bar per movie in top_30.
top_30.barh('Title', 'Millions')

The visualization above gives us a clear picture of which movie (a categorical variable) made the most millions (a numerical variable). But is it a distribution? Does it give us an idea of the proportion of individuals in each category?
Let's look at another column, Studio: how many movies were produced by each of the studios?

In [None]:
# TODO: use group to look at the studio distribution in the whole data set.
# group('Studio') yields one row per studio with a 'count' column holding
# the number of movies from that studio.
studio_distribution = top_movies.group('Studio')
# Only the last expression of a cell is displayed automatically, so the
# table has to be shown explicitly when more code follows it.
studio_distribution.show()
# Let's try barh() again, with studios ordered from most to fewest movies.
# `descending` expects a boolean; the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would sort descending too).
studio_distribution.sort('count', descending=True).barh('Studio')


In [None]:
# Sanity check that this is a distribution: if every movie is counted in
# exactly one studio group, the counts must add up to the number of movies.
studio_counts = studio_distribution.column('count')
studio_counts.sum()

# Numerical Distributions

In [None]:
# How old is each movie? Age is measured in years relative to 2022 and
# stored in a new 'age' column.
release_years = top_movies.column('Year')
age = 2022 - release_years
top_movies = top_movies.with_column('age', age)
top_movies


In [None]:
# Range of ages in the table, displayed as a (youngest, oldest) pair.
ages = top_movies.column('age')
np.min(ages), np.max(ages)

In [None]:
# TODO: let's look at a basic histogram of age. What do you see?
# No bins are passed, so hist chooses its own; unit labels the axes in years.
top_movies.hist('age', unit= 'Year')

## Binning

In [None]:
# Use make_array to create my_bins: the bin edges (in years of age) for
# five unequal-width bins 0-5, 5-10, 10-40, 40-65 and 65-102.
my_bins = make_array(0,5,10,40,65,102)
my_bins

In [None]:
# Create a table of age counts using .bin(): one row per edge in my_bins,
# with the number of movies whose age falls in the bin starting at that edge.
binned_movies = top_movies.bin('age', bins = my_bins)
binned_movies

In [None]:
# Verify again that every movie is represented exactly once: the counts
# (column index 1 of binned_movies) should add up to the number of movies.
bin_counts = binned_movies.column(1)
sum(bin_counts)

In [None]:
# Try equally spaced bins for Age

In [None]:
#Does the sum of the count add up correctly? Are all movies just represented once?

## Histograms

In [None]:
# Let's make our first histogram!
# Default bins (none are specified); unit labels the axes in years.
top_movies.hist('age', unit= 'Year')

In [None]:
# Let's try equally spaced bins: edges every 5 years from 0 up to 105.
# normed=False is passed through to hist unchanged.
five_year_edges = np.arange(0, 110, 5)
top_movies.hist('age', bins=five_year_edges, normed=False, unit='Year')

In [None]:
# Let's try not specifying any bins!
# With no bins argument, hist chooses the bin edges itself. (The call
# previously passed bins=my_bins, which contradicted this cell's purpose.)
top_movies.hist('age', unit='Year')

In [None]:
#Let's go back to uneven bins

# Add a column containing what percent of movies are in each bin



In [None]:
#Does it add up to 100%?

## Height ##

### Question: What is the height of the [40, 65) bin?

In [None]:
# Histogram of age using the unequal-width bins in my_bins; the height of
# the [40, 65) bar is what the following cells work out by hand.
top_movies.hist('age', bins = my_bins, unit = 'Year')

In [None]:
# Display the binned counts again for reference in the steps below.
binned_movies

In [None]:
# Step 1: Calculate the % of movies in each bin, including [40, 65)
# (use the binned_movies table).
# The last row of binned_movies is dropped: it only records the right edge
# of the final bin, so there are one fewer bins than rows.
# Divide by the actual number of movies instead of a hard-coded 200 so the
# percentages stay correct if the data set changes.
bin_counts = binned_movies.take(np.arange(binned_movies.num_rows - 1)).column('age count')
percent = 100 * bin_counts / top_movies.num_rows
percent

In [None]:
# Step 2: Calculate the width of every bin (the 40-65 bin among them) as
# the gap between each pair of consecutive bin edges.
bin_edges = binned_movies.column('bin')
width = bin_edges[1:] - bin_edges[:-1]
width

In [None]:
# Step 3: Area of rectangle = height * width
#         --> height = percent / width
# Each bar's area is the percent of movies in its bin, so dividing
# element-wise by the bin width gives the height in percent per year.
height = percent/width
height

In [None]:
# Check: the per-bin percentages should add up to 100 if every movie's age
# falls inside one of the bins.
sum(percent)