In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Census

In [None]:
# From Lecture 6: build the 2010 vs. 2015 US census comparison table.

# As of Feb 2018, this census file is online here:
# data = 'http://www2.census.gov/programs-surveys/popest/datasets/2010-2015/national/asrh/nc-est2015-agesex-res.csv'

# We will use a local copy instead.
data = 'nc-est2015-agesex-res.csv'

full_census_table = Table.read_table(data)

# Keep SEX, AGE and the two population columns at positions 4 and 9; they are
# relabeled '2010' and '2015' on the next line.
partial = full_census_table.select(['SEX', 'AGE', 4, 9])
us_pop = partial.relabeled(2, '2010').relabeled(3, '2015')

# Refer to the freshly relabeled columns by label rather than by position.
ratio = us_pop.column('2015') / us_pop.column('2010')
census = us_pop.with_columns(
        'Change', us_pop.column('2015') - us_pop.column('2010'),
        'Total Growth', ratio - 1,
        # Five years separate the two columns, so the fifth root of the ratio
        # is the equivalent constant yearly growth factor.
        'Annual Growth', ratio ** (1/5) - 1)

# Attach number / percent formatters to the population and growth columns.
census.set_format(['2010', '2015', 'Change'], NumberFormatter)
census.set_format(['Total Growth', 'Annual Growth'], PercentFormatter)

In [None]:
# Table of the US population by age.
# NOTE(review): presumably SEX code 0 is the both-sexes total and AGE 999 is
# the all-ages summary row -- confirm against the census file documentation.
by_age = (
    census
    .where('SEX', are.equal_to(0))
    .drop('SEX')
    .where('AGE', are.below(999))
)
by_age

In [None]:
# Line plot of the 2010 and 2015 columns against AGE.
by_age.select(['AGE', '2010', '2015']).plot('AGE')

In [None]:
# Line plot of the 2010 -> 2015 change against AGE.
by_age.select(['AGE', 'Change']).plot('AGE')

In [None]:
# Ages ordered from the largest to the smallest change.
by_age.sort('Change', descending=True)

In [None]:
# Birth year of someone who was 68 in 2010 (evaluates to 1942).
2010-68 # Bombing of Pearl Harbor was 12/7/1941 (month/day/year)

In [None]:
# Birth year of someone who was 68 in 2015 (evaluates to 1947).
2015-68 # Bombing of Nagasaki was 8/9/1945 (month/day/year)

In [None]:
# Display the rows for ages above 95.
(by_age
    .where('AGE', are.above(95))
    .show())

## Movies

In [None]:
# Read the actors data from the local CSV file, then display the table.
actors = Table.read_table("actors.csv")
actors

In [None]:
# Scatter plot: number of movies (x) against total gross (y).
actors.scatter("Number of Movies", "Total Gross")

In [None]:
# Scatter plot: number of movies (x) against average gross per movie (y).
actors.scatter("Number of Movies", "Average per Movie")

Let's find the outlier: which actor appears in fewer than 10 movies?

In [None]:
# Rows whose movie count is strictly below 10.
actors.where("Number of Movies", are.below(10))

Now let's remove the outlier from the table, and make a scatter plot of the other actors

In [None]:
# Keep every actor with at least 10 movies. The outlier is the actor with
# fewer than 10, so the complement must include actors with exactly 10 movies;
# the strict are.above(10) would silently drop those rows as well.
no_outlier = actors.where('Number of Movies', are.above_or_equal_to(10))
no_outlier.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Rows whose movie count is strictly above 50.
actors.where("Number of Movies", are.above(50))

In [None]:
# Read the per-year movie data from the local CSV file, then display it.
movies = Table.read_table("movies_by_year.csv")
movies

In [None]:
# Line plot of the number of movies for each year.
movies.plot("Year", "Number of Movies")

In [None]:
# Restrict to years after 1999, then plot the number of movies per year.
century_21 = movies.where(
    "Year",
    are.above(1999),
)
century_21.plot("Year", "Number of Movies")

In [None]:
# Line plot of total gross per year for the years after 1999.
century_21.plot("Year", "Total Gross")

In [None]:
# Look up the row for 2009 (a plain value in where() means "equal to").
century_21.where("Year", 2009)

## Categorical Distributions

In [None]:
# Read the top-movies data and attach a number formatter to columns 2 and 3.
top = Table.read_table("top_movies.csv")
top.set_format(
    [2, 3],
    NumberFormatter,
)
top

What are the top 5 movies by gross? Make a bar chart.

In [None]:
# Sort by gross explicitly so that "top 5" does not depend on the row order
# of the CSV file, then keep the first five rows.
top5 = top.sort("Gross", descending=True).take(np.arange(5))
top5.barh("Title", "Gross")

How large is the adjusted gross of the top 5 movies by Gross? Make a bar chart.

In [None]:
# Bar chart of adjusted gross for the same five movies held in top5.
top5.barh('Title', 'Gross (Adjusted)')

What are the top 5 movies by adjusted gross? Make a bar chart.

In [None]:
# Rank ALL movies by adjusted gross before taking five rows. Sorting top5
# would only reorder the five movies with the highest unadjusted gross, which
# are not necessarily the five with the highest adjusted gross.
(top
    .sort("Gross (Adjusted)", descending=True)
    .take(np.arange(5))
    .barh("Title", "Gross (Adjusted)"))

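We can also draw bars for every other column in a table at once by passing only the column of categories to `barh`. This works for tables of two or more columns; when there are several value columns, the bars for each category are drawn side by side on the same axes (an overlaid, grouped bar chart), not stacked.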

In [None]:
# With only a category column given, barh plots every remaining column.
top5.select(['Title', 'Gross']).barh('Title')

In [None]:
# Same chart with both gross columns drawn for each title.
top5.select(['Title', 'Gross', 'Gross (Adjusted)']).barh('Title')

We can use the `group` method to answer the question: How many movies did each studio make?

In [None]:
# One row per distinct Studio, with the number of rows for that studio.
studios = top.group("Studio")
studios

Make a bar chart to show which studios made the most movies.

In [None]:
# Bar chart of studios, ordered from most to fewest movies.
(studios
    .sort("count", descending=True)
    .barh("Studio"))