In [None]:
# Course libraries: the datascience package (Table, make_array, are, ...) and NumPy.
from datascience import *
import numpy as np

# Render figures inside the notebook and apply the 'fivethirtyeight' plot style.
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lists

Recall that arrays must have data all of the same type.

In [None]:
# An array built from three integers.
a = make_array(1, 2, 3)
a

In [None]:
# Mixing numbers with a string: inspect the displayed result to see how the
# elements are stored once they share an array.
b = make_array(1, 2, 'hello')
b

In [None]:
# Mixing integers with a float: inspect the displayed result to see the
# common type the elements end up with.
c = make_array(1, 2, 2.3)
c

Lists are like arrays, but can contain data of different types.  A list starts with a '[' and ends with a ']'.

In [None]:
# A list literal holding two integers and a string side by side.
d = [1, 2, 'hello']
d

In [None]:
# Unlike arrays, lists do not support elementwise arithmetic: `list + int`
# raises a TypeError. Catch it so the notebook still runs top to bottom,
# and show the error message as this cell's output.
e = [1, 2, 3]
try:
    f = e + 10
except TypeError as error:
    f = 'TypeError: ' + str(error)
f

## Rows

In [None]:
# Start with an empty table that has four column labels and no rows.
rc = Table(['Name', 'Park', 'Type', 'Num Rides'])
rc

In [None]:
# Add one row; the list supplies a value for each of the four columns, in order.
# NOTE: `rc` is reassigned from itself, so re-running this cell adds the row again.
rc = rc.with_row(['The Phoenix', 'Knoebels', 'Wooden', 47])
rc

In [None]:
# Chain two with_row calls to append two more rows in one statement.
rc = (
    rc
    .with_row(['Twister', 'Knoebels', 'Wooden', 12])
    .with_row(['Impulse', 'Knoebels', 'Steel', 2])
)
rc

In [None]:
# Keep only the rows whose 'Type' is exactly 'Wooden'.
rc.where('Type', are.equal_to('Wooden'))

In [None]:
# Column index 3 is 'Num Rides'; keep the rows where it equals 2.
rc.where('Num Rides', are.equal_to(2))

In [None]:
# Column index 3 is 'Num Rides'; keep the rows where it is above 10.
rc.where('Num Rides', are.above(10))

In [None]:
# take with a single integer: select the row at index 0.
rc.take(0)

In [None]:
# take with an array of indices: np.arange(0,3,2) is the array [0, 2].
rc.take(np.arange(0,3,2))

In [None]:
# take with a plain list of indices: the rows at indices 0 and 2.
rc.take([0,2])

In [None]:
# Print the built-in documentation for `are`, the source of predicates such as are.above.
help(are)

#### Discussion

In [None]:
# Load the salary data from a CSV file in the working directory and show the first 20 rows.
nba = Table.read_table('nba_salaries.csv')
nba.show(20)

In [None]:
# Give the columns at index 0 and index 3 shorter labels.
nba = (
    nba
    .relabeled(0, 'NAME')
    .relabeled(3, 'SALARY')
)
nba

Create an array containing the names of all point guards (PG) who make more than $15M/year

Give a table containing the name, team, and salary of all players whose name contains the letter 'i', whose team contains the letter 'o', and whose salary was less than $1M/year

What was the average salary?

## Census

In [None]:
# As of Feb 2018, this census file is online here:
# data = 'http://www2.census.gov/programs-surveys/popest/datasets/2010-2015/national/asrh/nc-est2015-agesex-res.csv'
# NOTE(review): the URL above names the 2015 estimates, while the local file
# read below is named for the 2014 estimates -- confirm which one is intended.

# We will use a local copy instead

full = Table.read_table('NC-EST2014-AGESEX-RES.csv')
full

In [None]:
# Rows where the column at index 1 equals 0.
# Presumably index 1 is the AGE column -- confirm against the CSV header.
full.where(1, 0)

In [None]:
# Keep four columns: two chosen by label and two chosen by position.
partial = full.select('SEX', 'AGE', 4, 8)
partial

In [None]:
# Shorten the two population-estimate labels to just their years.
# The first is renamed by its label, the second by its column index.
simple = (
    partial
    .relabeled('POPESTIMATE2010', '2010')
    .relabeled(3, '2014')
)
simple

In [None]:
# Pull out the AGE column as an array to see which values it contains.
partial.column("AGE")

What does AGE 999 mean?

In [None]:
# Look at the rows carrying the special AGE value 999.
partial.where('AGE', 999)

In [None]:
# For comparison, the rows where AGE is 0.
partial.where('AGE',0)

Which age groups have populations that are changing fastest?

In [None]:
# Add two derived columns:
#   Change -- the 2014 estimate minus the 2010 estimate
#   Growth -- the annual growth rate over the 4 years from 2010 to 2014
census = simple.with_columns(
    'Change', simple.column('2014') - simple.column('2010'),
    'Growth', (simple.column('2014') / simple.column('2010')) ** 0.25 - 1,
)
census

In [None]:
# Display formatting only: growth as a percentage, the counts as formatted numbers.
# set_format changes how `census` is displayed from here on.
census.set_format('Growth', PercentFormatter)
census.set_format(['2010', '2014', 'Change'], NumberFormatter)

How do the age distributions of men and women differ?

In [None]:
# Split by SEX code (1 and 2), leaving out the special AGE 999 rows in both tables.
males = census.where('SEX', are.equal_to(1)).where('AGE', are.below(999))
females = census.where('SEX', are.equal_to(2)).where('AGE', are.below(999))

In [None]:
# Peek at the first row of the males table.
males.take(0)

In [None]:
# The five rows of `females` with the largest 2014 values.
females.sort('2014', descending=True).show(5)

In [None]:
# The five rows of `males` with the largest 2014 values.
males.sort('2014', descending=True).show(5)

In [None]:
# Plot the 2014 female and male counts against age on one set of axes.
# Assumes `females` and `males` list the same ages in the same row order -- TODO confirm.
Table().with_columns(
    'Age in 2014', females.column('AGE'),
    'Females', females.column('2014'),
    'Males', males.column('2014'),
).plot('Age in 2014')

In [None]:
# Plot the female-to-male ratio of the 2014 counts at each age.
# Assumes `females` and `males` list the same ages in the same row order -- TODO confirm.
Table().with_columns(
    'Age in 2014', females.column('AGE'),
    'Ratio', females.column('2014') / males.column('2014'),
).plot('Age in 2014')

In [None]:
# The same ratio plot, restricted to ages selected by are.between(20,50).
Table().with_columns(
    'Age in 2014', females.column('AGE'),
    'Ratio', females.column('2014') / males.column('2014'),
).where('Age in 2014', are.between(20,50)).plot('Age in 2014')

In [None]:
# Population by age: take the SEX == 0 rows, drop the now-constant SEX column,
# and leave out the special AGE 999 rows.
by_age = (
    census
    .where('SEX', are.equal_to(0))
    .drop('SEX')
    .where('AGE', are.below(999))
)
by_age

In [None]:
# Line plot of the 2010 and 2014 columns against AGE.
by_age.select("AGE","2010","2014").plot("AGE")

In [None]:
# Line plot of the Change column against AGE.
by_age.select("AGE","Change").plot("AGE")

In [None]:
# Zoom in on the Change plot for the ages selected by are.between(60,75).
by_age.select("AGE", "Change").where("AGE", are.between(60,75)).plot("AGE")

In [None]:
# Order the ages from largest Change to smallest.
by_age.sort("Change", descending=True)

In [None]:
# Four ages of interest, stored as an array so arithmetic applies to each element.
# Presumably the ages at the top of the sorted table above -- confirm against its output.
age_range = make_array(67,64,66,65)

In [None]:
# Subtract each age from 2010 (elementwise array arithmetic).
2010-age_range

In [None]:
# Subtract each age from 2014 (elementwise array arithmetic).
2014-age_range

## Movies

In [None]:
# Load the actors data from a CSV file in the working directory.
actors = Table.read_table('actors.csv')
actors

In [None]:
# Line plot of 'Total Gross' against 'Number of Movies'; compare with the scatter plot below.
actors.plot('Number of Movies', 'Total Gross')

In [None]:
# Scatter plot of the same two columns: one point per row of `actors`.
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Scatter plot of 'Average per Movie' against 'Number of Movies'.
actors.scatter('Number of Movies', 'Average per Movie')

Let's find the outlier: which actor appears in fewer than 10 movies?

In [None]:
# Rows with fewer than 10 movies.
actors.where('Number of Movies', are.below(10))

Now let's remove the outlier from the table, and make a scatter plot of the other actors

In [None]:
# The outlier was identified as the row with fewer than 10 movies, so the
# complement is "10 or more". Using are.above(10) here would also discard any
# actor with exactly 10 movies, who is not the outlier.
no_outlier = actors.where('Number of Movies', are.above_or_equal_to(10))
no_outlier.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Rows with more than 50 movies.
actors.where('Number of Movies', are.above(50))

In [None]:
# Load the per-year movie data from a CSV file in the working directory.
movies = Table.read_table('movies_by_year.csv')
movies

In [None]:
# Line plot of 'Number of Movies' against 'Year'.
movies.plot('Year', 'Number of Movies')

In [None]:
# Keep only the rows with Year above 1999, then redraw the plot for those years.
century_21 = movies.where('Year', are.above(1999))
century_21.plot('Year', 'Number of Movies')

In [None]:
# Line plot of 'Total Gross' against 'Year' over the same restricted range.
century_21.plot('Year', 'Total Gross')

In [None]:
# Look up the single year 2009; a bare value in `where` means "equal to".
century_21.where('Year', 2009)

## Categorical Distributions

In [None]:
# Load the top-movies data and apply number formatting to the columns at indices 2 and 3.
# Presumably those are the two gross columns -- confirm against the CSV header.
top = Table.read_table('top_movies.csv')
top.set_format([2, 3], NumberFormatter)
top

What are the top 5 movies by gross? Make a bar chart.

What are the top 5 movies by adjusted gross? Make a bar chart.

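We can also plot a bar chart of every numeric column in a table by naming the column that supplies the bar labels. This works for tables of two or more columns; when there are several numeric columns, each label gets one bar per column, drawn together in a single chart.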

In [None]:
# `top5` is not assigned in any of the visible cells above, so this cell would
# raise a NameError on a fresh kernel. Build it here: the five rows of `top`
# with the largest 'Gross'.
# NOTE(review): sorting by 'Gross' (not the adjusted gross) is an assumption -- confirm.
top5 = top.sort('Gross', descending=True).take(np.arange(5))
top5.select('Title', 'Gross').barh('Title')

In [None]:
# With two numeric columns selected, barh draws a bar for each of them per title.
top5.select('Title', 'Gross', 'Gross (Adjusted)').barh('Title')

We can use the `group` method to answer the question: How many movies did each studio make?

In [None]:
# Group the rows of `top` by the 'Studio' column.
studios = top.group('Studio')
studios

Make a bar chart to show which studios made the most movies.

How the `colleges` table was made: given a table of undergrads, call `ugrads.group('STUDENT PROGRAM')`.

In [None]:
# Load the colleges table from a CSV file in the working directory.
colleges=Table.read_table('colleges.csv')
colleges

How, then, do we make a bar chart with the college that has the most students first?