In [None]:
from datascience import *
import numpy as np

import matplotlib
matplotlib.use('Agg', warn=False)
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Table.interactive_plots()

## Functions review

In [None]:
def middle(a, b, c):
    """Return the second largest (the median) of the three numbers a, b, c.

    The middle value is picked out by sorting rather than computed as
    ``a + b + c - max - min``. The arithmetic version loses precision with
    floats (for example middle(1e20, 1, 2) came out as -1.0), whereas sorting
    always returns one of the three arguments unchanged.
    """
    return sorted((a, b, c))[1]

In [None]:
def f_to_kelvin(t):
    """Convert a Fahrenheit temperature (a number) to Kelvin, returned as a float.

    The value is first converted to Celsius, then offset by 273.15.
    Example: f_to_kelvin(32) is 273.15.
    """
    celsius = (t - 32) * (5/9)
    return celsius + 273.15

def f_to_kelvin_string(t):
    """Convert a Fahrenheit temperature given as a string to Kelvin as a float.

    The string is parsed with float() and handed to f_to_kelvin, so the
    conversion formula lives in exactly one place.
    Raises ValueError if t cannot be parsed as a number.
    """
    return f_to_kelvin(float(t))

In [None]:
## Printing vs returning

def inclusive_range(start, stop, step):
    """Return an array start, start + step, ... that may reach stop but never passes it.

    Unlike np.arange, the end point is included when it falls exactly on a
    step: inclusive_range(1, 5, 1) is [1, 2, 3, 4, 5]. When stop is not a
    whole number of steps from start, the array ends at the last value that is
    still within the range: inclusive_range(0, 10, 3) is [0, 3, 6, 9]. (The
    previous np.arange(start, stop + step, step) overshot to 12 in that case.)

    step may be negative for a descending range and must be nonzero. If stop
    lies in the opposite direction from step, the result is empty.
    """
    # Number of whole steps that fit between start and stop. The small
    # tolerance keeps a quotient such as 0.3 / 0.1 == 2.9999999999999996
    # from dropping the end point.
    whole_steps = int(np.floor((stop - start) / step + 1e-9))
    # A negative count means stop is on the wrong side of start: no elements.
    count = max(whole_steps + 1, 0)
    return start + step * np.arange(count)

## Functions with tables

In [None]:
# Data source: Bureau of Labor Statistics (BLS)
# https://www.bls.gov/news.release/atus.t01.htm#tus_tu_nr1.f.1
# The CSV path is relative, so it is resolved against the notebook's working directory.
time_use = Table.read_table("time_use.csv")
time_use.show()

In [None]:
## Rows for a bar chart of the average hours per day for each activity:
## keep only the "Female" rows, and of those only the "Subtotal" rows
fem_subtotal = (
    time_use
    .where("Gender", are.equal_to("Female"))
    .where("Type", are.equal_to("Subtotal"))
)
fem_subtotal
#fem_subtotal.barh("Activity", "Average hours per day")

In [None]:
## Use the data above (the female subtotal rows) to find each activity's percentage
## of the total hours. Summing over the whole `time_use` table instead would put
## every gender and every row type into the denominator.
## Doing this by hand is kind of annoying: the table name has to be replaced
## in two places every time the data changes.
fem_subtotal.column("Average hours per day") / sum(fem_subtotal.column("Average hours per day")) * 100

In [None]:
## A function will create the table for us!

def with_percents(tbl, column_for_counts):
    """Return tbl with an extra "Percent of <column_for_counts>" column.

    Each entry of the new column is that row's share of the total of
    column_for_counts, expressed as a percentage.
    """
    counts = tbl.column(column_for_counts)
    total = sum(counts)
    return tbl.with_columns("Percent of " + column_for_counts, 100 * counts / total)

# We can include functions within other functions

def barh_percents(tbl, column_for_labels, column_for_counts):
    """Draw a horizontal bar chart of each row's percentage of column_for_counts.

    The percentages come from with_percents, and the bars are labelled with
    the values in column_for_labels. Nothing is returned.
    """
    augmented = with_percents(tbl, column_for_counts)
    percent_label = "Percent of " + column_for_counts
    augmented.barh(column_for_labels, percent_label)

In [None]:
# Only the last expression of a cell is displayed automatically, and
# barh_percents returns nothing, so the table has to be shown explicitly
# or its result is silently discarded.
with_percents(fem_subtotal, "Average hours per day").show()
barh_percents(fem_subtotal, "Activity", "Average hours per day")

In [None]:
## Functions as values

def f(s):
    """Return each entry of s as a percentage of the total of s, rounded to 2 decimals."""
    shares = s / sum(s)
    return np.round(shares * 100, 2)

#f(make_array(5, 10, 15))
#f
#str(f)

## Applying functions to columns

In [None]:
# Load the word list. The trailing comment keeps an alternative `encoding`
# argument on hand, presumably for a CSV that is not UTF-8 (TODO confirm).
words = Table.read_table("words.csv")#, encoding='iso8859')
words

In [None]:
# https://en.wikipedia.org/wiki/Grimm%27s_law
def consonant_shift(text):
    """Apply a few Grimm's-law consonant shifts to text and return the new string.

    These are just a few of the ways consonants shifted from Proto-Indo-European
    to early Germanic (which then came into English).
    Examples:  Pater -> Father, Pod -> Foot, Tres -> Three, Decem -> Ten, Qod -> What

    Only the lowercase letters listed below are replaced.
    """
    # Applied strictly in this order: "t" -> "th" runs before "d" -> "t", so a
    # "t" that came from a "d" is not shifted a second time.
    shifts = (("t", "th"), ("p", "f"), ("d", "t"), ("qo", "who"))
    shifted = text
    for old, new in shifts:
        shifted = shifted.replace(old, new)
    return shifted

In [None]:
## Using apply: run consonant_shift on the "Proto Indo-European" column
## and keep the results in `shifted`
shifted = words.apply(consonant_shift, "Proto Indo-European")
shifted

In [None]:
# Add the shifted spellings to the word table as a "Consonant-shifted" column.
with_shift = words.with_columns("Consonant-shifted", shifted)
with_shift

In [None]:
## How did some words shift from "p" to "f"?
## Look at the rows whose Proto Indo-European form contains "pat".
with_shift.where("Proto Indo-European", are.containing("pat"))

In [None]:
# Note that the simple rules sometimes (often) don't match what really happened:
#   in qatwrpods the actual change was t -> d, not the t -> th that consonant_shift applies.
with_shift.where("Proto Indo-European", are.containing("pod"))

In [None]:
# Rows whose Proto Indo-European form is exactly "ad" (consonant_shift turns it into "at").
with_shift.where("Proto Indo-European", are.equal_to("ad"))

## `group`

In [None]:
## Data from the DOJ/FBI crime reporting statistics, read from us_crime.csv
crime = Table.read_table("us_crime.csv")
crime

In [None]:
## Let's look at some of the table. Which areas had the highest population?
## Sort the rows by "Population", largest first.
crime.sort("Population", descending=True)

In [None]:
## We want to analyze states only, so do some cleaning:
## drop the rows whose "State" is "United States-Total".
states = crime.where("State", are.not_equal_to("United States-Total"))
states

In [None]:
## What we're used to: group by a single column to count entries.
## How many entries do we have for each year? (Group by "State" instead to count per state.)
states.group("Year")

In [None]:
## Now pass `sum` as the second argument to total the other columns within each group:
## this gives the total number of crimes for each state in the dataset.
states.group("State", sum)

In [None]:
## The same totals, grouped by year. Notice the empty "State sum" column,
## presumably because "State" holds names, which cannot be meaningfully summed.
states.group("Year", sum)

In [None]:
## Graph violent crime per capita, over time
by_year = states.group("Year", sum)
# Look the two columns up by label rather than by position (3 and 2), so the
# ratio cannot silently change meaning if the CSV's column order changes.
# group(..., sum) labels each aggregated column "<original label> sum".
by_year = by_year.with_columns(
    "Violent crime per capita",
    by_year.column("Violent crime total sum") / by_year.column("Population sum"),
)
by_year

In [None]:
# Yearly total of violent crimes, summed over all states.
by_year.plot("Year", "Violent crime total sum")

In [None]:
# The same trend per capita, which removes the effect of population growth on the totals.
by_year.plot("Year", "Violent crime per capita")