In [None]:
# Star import: presumably the source of the unqualified names used below
# (Table, are, NumberFormatter, PercentFormatter).
from datascience import *
import numpy as np

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to all subsequent figures.
plots.style.use('fivethirtyeight')

## Census

In [None]:
# From Lecture 6: build a table comparing the 2010 and 2015 populations.

# As of Jan 2017, this census file is online here:
data = 'http://www2.census.gov/programs-surveys/popest/datasets/2010-2015/national/asrh/nc-est2015-agesex-res.csv'

# A copy can be accessed here in case census.gov moves the file:
# data = 'http://inferentialthinking.com/notebooks/nc-est2015-agesex-res.csv'

full_census_table = Table.read_table(data)

# Keep SEX, AGE and the columns at positions 4 and 9 of the raw file, then
# relabel those two columns '2010' and '2015'.
partial = full_census_table.select(['SEX', 'AGE', 4, 9])
us_pop = partial.relabeled(2, '2010').relabeled(3, '2015')

# Number of years between the two columns; used to annualize the growth.
years_elapsed = 2015 - 2010
pop_2010 = us_pop.column('2010')
pop_2015 = us_pop.column('2015')
ratio = pop_2015 / pop_2010

census = us_pop.with_columns(
        'Change', pop_2015 - pop_2010,
        'Total Growth', ratio - 1,
        # Geometric mean: the constant yearly rate giving the same total growth.
        'Annual Growth', ratio ** (1 / years_elapsed) - 1)

# Formatters only affect how values are displayed. The last call is the
# cell's final expression, so the formatted table is what gets rendered.
census.set_format(['2010', '2015', 'Change'], NumberFormatter)
census.set_format(['Total Growth', 'Annual Growth'], PercentFormatter)

In [None]:
# Keep only the SEX == 0 rows with AGE below 999, then drop the SEX column,
# which is constant after the filter.
# NOTE(review): 999 is presumably a sentinel AGE for a totals row - confirm.
by_age = (
    census
    .where('SEX', are.equal_to(0))
    .where('AGE', are.below(999))
    .drop('SEX')
)
by_age

In [None]:
# Line plot of the '2010' and '2015' columns against 'AGE'.
by_age.select('AGE', '2010', '2015').plot('AGE')

In [None]:
# Line plot of the 'Change' column (2015 minus 2010) against 'AGE'.
by_age.select('AGE', 'Change').plot('AGE')

In [None]:
# Rows ordered from the largest 'Change' to the smallest.
by_age.sort('Change', descending=True)

In [None]:
# Birth year of someone aged 68 in 2010.
# NOTE(review): 68 is presumably an age picked from the sorted table above - confirm.
2010-68 # For reference: the bombing of Pearl Harbor was 12/7/1941

In [None]:
# Birth year of someone aged 68 in 2015.
# NOTE(review): 68 is presumably an age picked from the sorted table above - confirm.
2015-68 # For reference: the bombing of Nagasaki was 8/9/1945

In [None]:
# Rows for ages strictly above 95. show() displays the whole result rather
# than the truncated default preview.
ages_above_95 = by_age.where('AGE', are.above(95))
ages_above_95.show()

## Movies

In [None]:
# Load the actors table from a CSV in the working directory and display it.
actors = Table.read_table('actors.csv')
actors

In [None]:
# Scatter plot: 'Number of Movies' on the horizontal axis, 'Total Gross' on the vertical axis.
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Same horizontal axis, now against 'Average per Movie'.
actors.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Rows with strictly fewer than 10 movies.
actors.where('Number of Movies', are.below(10))

In [None]:
# Remove the low-count rows picked out by the `are.below(10)` filter in the
# previous cell. `are.above(10)` is strict and would also discard any actor
# with exactly 10 movies, so use the exact complement (>= 10) instead.
no_outlier = actors.where('Number of Movies', are.above_or_equal_to(10))
no_outlier.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Rows with strictly more than 50 movies.
actors.where('Number of Movies', are.above(50))

In [None]:
# Load the per-year movies table from a CSV in the working directory and display it.
movies = Table.read_table('movies_by_year.csv')
movies

In [None]:
# Line plot of 'Number of Movies' against 'Year'.
movies.plot('Year', 'Number of Movies')

In [None]:
# Keep rows whose 'Year' is strictly above 1999 (2000 onward if years are
# whole numbers), then re-plot the yearly movie count for that range.
century_21 = movies.where('Year', are.above(1999))
century_21.plot('Year', 'Number of Movies')

In [None]:
# Line plot of 'Total Gross' against 'Year' over the same filtered range.
century_21.plot('Year', 'Total Gross')

In [None]:
# Row(s) for 2009. Passing a bare value to where() is shorthand for an
# equality predicate.
century_21.where('Year', 2009)

## Categorical Distributions

In [None]:
# Load the top-movies table from a CSV in the working directory and display it.
top = Table.read_table('top_movies.csv')
top

In [None]:
# First five rows of `top` (row indices 0 through 4).
top5 = top.take(np.arange(5))
# Horizontal bar chart: one bar per value of column 0, with length from column 2.
top5.barh(0, 2)

In [None]:
# Same chart, with bar lengths taken from column 3 instead of column 2.
top5.barh(0, 3)

In [None]:
# Sort by column 3 (largest first) before plotting so the bars appear in order.
top5.sort(3, descending=True).barh(0, 3)

In [None]:
# With only columns 0 and 2 selected, barh needs just the category label:
# the remaining column supplies the bar lengths.
top5.select(0, 2).barh('Title')

In [None]:
# With two value columns selected (2 and 3), both are drawn for each title.
top5.select(0, 2, 3).barh('Title')

In [None]:
# One row per distinct 'Studio', with a 'count' column giving the number of
# rows of `top` for that studio.
studios = top.group('Studio')
studios

In [None]:
# Bar chart of the per-studio counts, ordered from the largest count to the smallest.
studios.sort('count', descending=True).barh('Studio')