In [None]:
# The star import brings the `datascience` names used below (Table, are, ...) into scope
from datascience import *
import numpy as np

# Render figures inline in the notebook output
%matplotlib inline
import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' style sheet to every chart drawn after this point
plots.style.use('fivethirtyeight')

## Lecture 7 ##

## Census ##

In [None]:
# Load the census estimates CSV (path is relative to the working directory) into a Table
full = Table.read_table('nc-est2019-agesex-res.csv')

In [None]:
# Narrow the census table to sex, age, and the two yearly estimates we compare
partial = full.select(
    'SEX',
    'AGE',
    'POPESTIMATE2014',
    'POPESTIMATE2019',
)

In [None]:
# Shorten the population-estimate column labels to just their year
simple = (
    partial
    .relabeled('POPESTIMATE2014', '2014')
    .relabeled('POPESTIMATE2019', '2019')
)
simple

## Line Plots ##

In [None]:
# Remove the age totals: the total rows appear to be coded as AGE 999,
# so keep only the rows whose AGE is below 999
no_999 = simple.where('AGE', are.below(999))

In [None]:
# Our first chart: the 2019 estimate plotted against age.
# NOTE(review): SEX code 0 looks like the both-sexes-combined rows — confirm against the data dictionary.
overall = no_999.where('SEX', are.equal_to(0))
overall.plot('AGE', '2019')

In [None]:
# ^^ That plot should be labeled! Here are 3 ways to label it:

In [None]:
# US Population  <--- Option 1: just add a comment saying what the plot shows

overall.plot('AGE', '2019')

In [None]:
overall.plot('AGE', '2019')
print('US Population')  # <--- Option 2: print out what it is alongside the chart

In [None]:
overall.plot('AGE', '2019')
plots.title('US Population');    # <--- Option 3: put a title on the chart itself (OPTIONAL; not needed for Data 8)

## Males vs Females ##

In [None]:
# Split the table by sex so male and female counts can be compared age by age.
# Once a table holds a single sex code, the SEX column carries no information, so drop it.
males = no_999.where('SEX', are.equal_to(1)).drop('SEX')
females = no_999.where('SEX', are.equal_to(2)).drop('SEX')

In [None]:
# One row per age, with the 2019 male and female counts side by side.
# The columns are paired by position, so this relies on `males` and `females`
# listing their ages in the same order.
pop_2019 = (
    Table()
    .with_column('Age', males.column('AGE'))
    .with_column('Males', males.column('2019'))
    .with_column('Females', females.column('2019'))
)
pop_2019

In [None]:
# Overlay the male and female counts against age.
# The two series are named explicitly because a later cell rebinds pop_2019 with an
# extra 'Percent female' column; re-running this cell after that one would otherwise
# draw the percentage on the same axes as the raw counts.
pop_2019.plot('Age', ['Males', 'Females'])

In [None]:
# Percent female at each age: females / (males + females) * 100
total = pop_2019.column('Females') + pop_2019.column('Males')
pct_female = (pop_2019.column('Females') / total) * 100
pct_female

In [None]:
# Round to 3 decimal places so that the percentages are easier to read
pct_female = np.round(pct_female, decimals=3)
pct_female

In [None]:
# Add female percent to our table
# (pop_2019 is rebound to the table returned by with_column, which carries the
# new 'Percent female' column)
pop_2019 = pop_2019.with_column('Percent female', pct_female)
pop_2019

In [None]:
# Percent female as a function of age, with the y-axis range left to the plotting defaults
pop_2019.plot('Age', 'Percent female')

In [None]:
# ^^ Look at the y-axis! Trend is not as dramatic as you might think

# Same plot again, but with the y-axis pinned to the full 0-100 percent range
pop_2019.plot('Age', 'Percent female')
plots.ylim(0, 100);  # Optional for Data 8

## Scatter Plots ##

In [None]:
# Actors and their highest grossing movies
# (columns used below: 'Number of Movies', 'Total Gross', 'Average per Movie')
actors = Table.read_table('actors.csv')
actors

In [None]:
# One point per row of `actors`: number of movies (x) against total gross (y)
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Same x-axis, but now against the average gross per movie instead of the total
actors.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Pick out the rows whose average gross per movie is above 400
# NOTE(review): the unit is presumably millions of dollars — confirm against actors.csv
actors.where('Average per Movie', are.above(400))

## Bar Charts ##

In [None]:
# Highest grossing movies as of 2017
# (columns used below: 'Title', 'Year', 'Gross (Adjusted)')
top_movies = Table.read_table('top_movies_2017.csv')
top_movies

In [None]:
# The 10 movies with the largest adjusted gross.
# Sort explicitly rather than trusting the row order of the CSV file, so the result
# really is the top 10 by 'Gross (Adjusted)' even if the file is ordered differently.
top10_adjusted = top_movies.sort('Gross (Adjusted)', descending=True).take(np.arange(10))
top10_adjusted

In [None]:
# Express the adjusted gross in millions of dollars (3 decimal places) for readability
millions = np.round(
    top10_adjusted.column('Gross (Adjusted)') / 1_000_000,
    decimals=3,
)
top10_adjusted = top10_adjusted.with_column('Millions', millions)
top10_adjusted

In [None]:
# A line plot doesn't make sense here: don't do this!
# Each row is a separate movie, so joining the points by Year draws a "trend"
# between unrelated films instead of tracking one quantity over time.
top10_adjusted.plot('Year', 'Millions')

In [None]:
# A horizontal bar chart fits categorical data: one bar per movie title,
# with bar length given by the 'Millions' column
top10_adjusted.barh('Title', 'Millions')

Exercise: Generate the chart shown in the slides: a bar chart of the age (number of years since release) of each of the 10 highest-grossing movies, ranked by non-adjusted gross.