In [None]:
# make sure to run this cell
from datascience import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'fivethirtyeight' plot style for every figure in this notebook.
plt.style.use('fivethirtyeight')
# Hide FutureWarning messages so they do not clutter cell output.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Household Composition

1. [Household Size](#size)
2. [Age Composition](#age)
3. [Sex & Age Composition](#sexage)

Use our class survey files to practice generating variables related to household composition.

For your reference, the relationship codes are:

- Head = 1
- Spouse = 2
- Son/daughter = 3
- Mother/father = 4
- Other relative = 5

### Household Size <a id='size'></a>

In [None]:
# Read the household-level survey file into a Table and display it.
survey = Table.read_table('Household Survey 2019.csv')
survey

In [None]:
# Read the household roster file into a Table and display it.
roster = Table.read_table('Household Roster 2019.csv')
roster

In [None]:
# Preview: group the roster by household ID to count the rows listed under each ID.
roster.group('HH ID')

In [None]:
# Keep the per-household row counts; the count is the household size.
size = roster.group('HH ID')
size

In [None]:
# Rename the 'count' column produced by group() to 'HH size'.
# Table.relabel renames the column in place, so the guard keeps this cell safe
# to re-run: an unguarded second call would raise because 'count' is gone.
if 'count' in size.labels:
    size = size.relabel('count', 'HH size')
size

In [None]:
# Attach the household size to every roster row by matching on household ID.
roster = roster.join('HH ID', size)
roster

### Age composition <a id='age'></a>

For each household, determine what percentage of its members are in each age group.
First, make a new column indicating which age group each person falls into.

In [None]:
# Assign every person an age-group code:
#   1.0 = under 10, 2.0 = 10-19, 3.0 = 20-55, 4.0 = 56 and over.
# These are the lower edges of groups 2-4. The second edge is 20 (not 19) so
# that 19-year-olds land in the group that is later labelled '10-19'.
age_bin_edges = [10, 20, 56]

# np.digitize returns how many edges each age has reached (0-3); adding 1.0
# turns that into the group code. The codes are deliberately floats so the
# pivot columns are labelled '1.0' ... '4.0', which the relabel cells expect.
# NOTE(review): a missing age (NaN) falls into group 4, as it did with the
# original if/elif chain -- confirm the roster has no missing ages.
categories = np.digitize(roster.column('Age'), age_bin_edges) + 1.0

roster = roster.with_column('Categories', categories)
roster

Pivot to find out how many people in each household are in each age group.

In [None]:
# One row per household, one column per age-group code; each cell is the
# number of roster rows with that household ID and code.
Category_small = roster.pivot(columns='Categories', rows='HH ID')
Category_small

Rename the pivot columns with descriptive age-group labels.

In [None]:
# Replace the numeric age-group codes in the column labels with readable names.
age_group_labels = {
    '1.0': 'Under 10',
    '2.0': '10-19',
    '3.0': '20-55',
    '4.0': '56+',
}
for old_label, new_label in age_group_labels.items():
    # Table.relabel renames in place and raises if the old label is missing.
    # Skipping absent labels keeps the cell safe to re-run and tolerates an
    # age group that has no members in the data.
    if old_label in Category_small.labels:
        Category_small = Category_small.relabel(old_label, new_label)
Category_small

Convert these counts to percentages: for each age group, add a column giving its share of the household. Use the HH size variable we created as the denominator.

In [None]:
# Bring the per-household age-group counts onto every roster row.
roster = roster.join('HH ID', Category_small)

# For each age group, add a 'Percent <group>' column: the group's count as a
# percentage of the household size.
for group in ('Under 10', '10-19', '20-55', '56+'):
    share = roster[group] / roster['HH size']
    roster['Percent ' + group] = share * 100

In [None]:
# Display the first 10 rows to check the new percentage columns.
roster.show(10)

In [None]:
# Count roster rows for each value of the sex column (per its label: F=1, M=0).
roster.group('Sex F=1; M=0')

### Sex and age composition <a id='sexage'></a>

Let's take this one step further: determine how many are in each age group by sex.

In [None]:
# Split the roster by sex. Passing a plain value to where() keeps the rows
# whose column equals that value (per the column label: M=0, F=1).
males = roster.where("Sex F=1; M=0", 0)
females = roster.where("Sex F=1; M=0", 1)

In [None]:
# Count males in each age group, one row per household.
Category_small_m = males.pivot('Categories', 'HH ID')

# Rename whichever age-group columns are present. The pivot only creates a
# column for a code that occurs among males (the original data had no boys
# under 10), so absent codes are skipped rather than raising an error.
male_group_labels = {
    '1.0': 'males Under 10',
    '2.0': 'males 10-19',
    '3.0': 'males 20-55',
    '4.0': 'males 56+',
}
for old_label, new_label in male_group_labels.items():
    if old_label in Category_small_m.labels:
        Category_small_m = Category_small_m.relabel(old_label, new_label)
Category_small_m

In [None]:
# Count females in each age group, one row per household.
Category_small_f = females.pivot('Categories', 'HH ID')

# Rename whichever age-group columns are present. The pivot only creates a
# column for a code that occurs among females (the original data had no girls
# under 10), so absent codes are skipped rather than raising an error.
female_group_labels = {
    '1.0': 'females Under 10',
    '2.0': 'females 10-19',
    '3.0': 'females 20-55',
    '4.0': 'females 56+',
}
for old_label, new_label in female_group_labels.items():
    if old_label in Category_small_f.labels:
        Category_small_f = Category_small_f.relabel(old_label, new_label)
Category_small_f

In [None]:
# Add the male age-group counts, then the female ones, matching on household ID.
# NOTE(review): Table.join appears to keep only IDs present in both tables, so
# a household with no males (or no females) would drop out here -- confirm.
roster_m = roster.join('HH ID', Category_small_m)
roster_mf = roster_m.join('HH ID', Category_small_f)
roster_mf

In [None]:
# Optional: express each of these counts as a percent of household size, e.g.
# roster_mf = roster_mf.with_column(
#     'Percent females 10-19',
#     roster_mf.column('females 10-19') / roster_mf.column('HH size') * 100)

Or try this method! Which configuration do you like best?

In [None]:
# Configuration 1: one column per household ID, one row per
# (sex, age-group code) combination; cells hold counts of people.
doublepivot = roster.pivot(columns='HH ID', rows=['Sex F=1; M=0', 'Categories'])
doublepivot.show()

In [None]:
# Configuration 2: one column per sex value, one row per
# (household ID, age-group code) combination; cells hold counts of people.
doublepivot = roster.pivot(columns='Sex F=1; M=0', rows=['HH ID', 'Categories'])
doublepivot.show()

For these types of joins, will we ever need the left join?