In [None]:
# run this cell
from datascience import *
from pandas import read_stata
import numpy as np

import matplotlib
matplotlib.use('Agg', warn=False)
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In this exercise we will join data to learn about the education of teen mothers and their children in urban Colombia from the 2016 ELCA survey.  Upload the roster data.

In [None]:
# Read the roster CSV into a datascience Table.  The path is relative, so the
# file must sit in the notebook's working directory.
filename = 'Colombia Roster.csv'
colombia_roster = Table.read_table(filename)
# Display the table as the cell output.
colombia_roster

First, we need to figure out who were teen mothers.  For each mother on the roster, keep the oldest child. (use motherid)  
This requires a number of steps: figure out who is the oldest child and then link that child's age to the mother's observation 

Keep only the variables hhid, mother_id and age. Rename mother_id as indivID and rename age as childage.  Combine hhid & indivID into a unique ID for the mother.

In [None]:
# Build the child-level table: household ID, the child's mother ID (renamed
# to 'indivID' so it lines up with the mother's own ID) and the child's age
# (renamed to 'childage').
# `relabeled` returns a new table, so the result is assigned explicitly
# instead of relying on `relabel` mutating the table in place with its
# return value discarded.
children = (
    colombia_roster
    .select('hhid', 'mother_id', 'age')
    .relabeled('mother_id', 'indivID')
    .relabeled('age', 'childage')
    # Keep only rows whose mother ID is above 0.
    # NOTE(review): presumably non-positive values mean the mother is not on
    # the roster — confirm against the survey codebook.
    .where('indivID', are.above(0))
)
children

Create a mother id so we can group by this to find the oldest age

In [None]:
# Combine two columns of `children` into one numeric key per mother: the
# first column is multiplied by 1000 and the second is added to it.
# TODO(exercise): replace both '...' placeholders with column labels from
# `children` (the household ID and the mother's individual ID).
# NOTE(review): the factor 1000 presumably assumes within-household IDs stay
# below 1000 so keys cannot collide — confirm.
mother_unique_id = children.column('...') * 1000 + children.column('...')
mother_unique_id

Now we don't need hhid or indivID

In [None]:
# Attach the combined mother key, then keep only the two columns needed to
# find each mother's oldest child.
children = (
    children
    .with_column('mother_uniqueid', mother_unique_id)
    .select('childage', 'mother_uniqueid')
)
children

Find the oldest age of each mother.

In [None]:
# One row per mother key; the built-in `max` is applied to the remaining
# column(s), i.e. the age of that mother's oldest child.
# NOTE(review): the aggregated column is expected to be labeled
# 'childage max' (label + function name) — confirm in the output below.
oldest_children = children.group('mother_uniqueid', max)
oldest_children

Link this to the mother.  Open the roster data set again, create the unique_indivID by combining hhID & indivID, and join the data sets.

In [None]:
# Build a person-level key for every roster row using the same
# "first * 1000 + second" formula as the mother key, so the two can be matched.
# TODO(exercise): replace both '...' placeholders with the roster's household
# ID column and the person's own individual ID column.
unique_id = colombia_roster.column('...') * 1000 + colombia_roster.column('...')
unique_id

In [None]:
# Store the same person key under two labels: 'unique_id' identifies the
# person, while 'mother_uniqueid' gives the upcoming join a column whose
# label matches the key in `oldest_children`.
colombia_roster = (
    colombia_roster
    .with_column('unique_id', unique_id)
    .with_column('mother_uniqueid', unique_id)
)
colombia_roster

In [None]:
# Join only keeps individuals who have children in the data
mothers = colombia_roster.join('mother_uniqueid', oldest_children)
mothers

Use Mother's age and age of oldest child to calculate mother's age at first birth.

In [None]:
# Age at first birth = mother's current age minus the age of her oldest child.
# TODO(exercise): replace the '...' placeholders with the mother's age column
# and the oldest-child age column brought in by the join (presumably
# 'childage max' — confirm the label in the joined table).
age_first_birth = mothers.column('...') - mothers.column('...')
mothers = mothers.with_column('age first birth', age_first_birth)
mothers

Make a histogram of mother's age at first birth.  Remove crazy outliers.

In [None]:
# TODO(exercise): plot a histogram of the 'age first birth' column of
# `mothers`, after filtering out implausible values.
...

Make a new "TeenMom" variable indicating women who had their oldest child before age 20. Save this data table. (We probably missed some women who were teen mothers - why?)  

In [None]:
# Flag each mother: 'Yes' when her age at first birth is under 20, otherwise 'No'.
teen_mom = [
    'Yes' if first_birth_age < 20 else 'No'
    for first_birth_age in mothers.column('age first birth')
]
mothers = mothers.with_column('Teen Mom', teen_mom)
mothers

What percentage of women were teen mothers?

In [None]:
# TODO(exercise): replace '...' with the label of the teen-mother flag column
# on `mothers` so the rows in each category are counted; the percentage
# follows from those counts.
mothers.group('...')

How does the level of education of women who were teen mothers compare to that of non-teen mothers?

In [None]:
# TODO(exercise): filter `mothers` to one group (column label and value go in
# the two where() placeholders), then group by the education column.
# NOTE(review): the education column's label is not shown in this notebook —
# look it up in the roster table.
mothers.where('...', '...').group('...')

In [None]:
# TODO(exercise): spare cell — presumably for repeating the comparison above
# for the other group of mothers; confirm with the exercise instructions.
...

Now let's see if children of teen mothers have worse education than children of women who were not teen mothers.  We will need to merge in the TeenMom variable to the child data. Drop unique_motherID and rename unique_indivID.  Save only the variables unique_indivID and TeenMom in this small table.

In [None]:
# Small lookup table of mother attributes, keyed by 'mother_uniqueid', to be
# merged onto the child rows later.
mother_info = mothers.select('mother_uniqueid', 'Teen Mom', 'age')
# TODO(exercise): replace '...' with the column to rename to 'mother age'
# (presumably 'age', so it does not clash with the child's own age — confirm).
mother_info = mother_info.relabel('...', 'mother age')
mother_info

In [None]:
# this will be our children with mother info table, but the mother info is not yet joined.
children_with_mother_info = colombia_roster.drop('mother_uniqueid')
children_with_mother_info = children_with_mother_info.where('mother_id', are.above(0))
children_with_mother_info

Create a mother unique ID

In [None]:
# Recreate 'mother_uniqueid' on the child rows, this time pointing at each
# child's mother, using the same "first * 1000 + second" key formula.
# TODO(exercise): replace both '...' placeholders with the household ID
# column and the child's mother ID column.
mother_uniqueid = children_with_mother_info.column('...') * 1000 + children_with_mother_info.column('...')
children_with_mother_info = children_with_mother_info.with_column('mother_uniqueid', mother_uniqueid )
children_with_mother_info

Join Mother info to the Child

In [None]:
# Attach the mother's columns from `mother_info` to each child row, matching
# on the shared 'mother_uniqueid' key.
children_with_mother_info = children_with_mother_info.join('mother_uniqueid', mother_info)
children_with_mother_info

What percentage of children had mothers who were teen mothers?  Only consider children under age 19.

In [None]:
# TODO(exercise): template line, not runnable as written.  Fill in the
# where(...) filter for children under 19 and replace the trailing ... with
# the grouping step that counts children by their mother's teen-mother flag.
children_with_mother_info.where(...)...

Let's examine the education of the children.  Only keep the ones currently attending.

In [None]:
# Keep only children who are currently attending school.
# TODO(exercise): replace '...' with the column whose value is above 0 for
# children currently attending.
# NOTE(review): the column label is not shown in this notebook — confirm it
# in the roster table.
enrolled_children = children_with_mother_info.where('...', are.above(0))
enrolled_children

Find the average grade for each age.

In [None]:
# Average every column within each group, then keep only the age and the
# averaged grade.
# TODO(exercise): replace the group() placeholder with the grouping column
# (the select on the same line expects an 'age' column in the result).
mean_grade_enrolled_children = enrolled_children.group('...', np.mean).select('age', 'current_grade mean')
# Shorter label for the averaged grade column.
mean_grade_enrolled_children = mean_grade_enrolled_children.relabel('current_grade mean', 'mean grade')
# TODO(exercise): replace '...' with the column that must be below 19
# (presumably 'age', to restrict to school-age children — confirm).
mean_grade_enrolled_children = mean_grade_enrolled_children.where('...', are.below(19))
mean_grade_enrolled_children

Join that mean grade to the child & their actual grade.

In [None]:
# Attach the age-specific 'mean grade' to every enrolled child by matching on
# 'age' in both tables.  The averages table was already filtered by the
# below-19 condition in the previous cell, so children with no matching age
# row are not kept by the join.
enrolled_children = enrolled_children.join('age', mean_grade_enrolled_children, 'age')
enrolled_children

Calculate if the child is delayed by more than 1 year for their age.

In [None]:
# Gap between a child's grade (plus an allowance) and the mean grade for
# their age; negative values are treated as delayed in the next cell.
# TODO(exercise): replace the bare ... with the allowance to add
# (presumably 1, matching "delayed by more than 1 year" — confirm).
delayed_edu = (enrolled_children.column('current_grade') + ...) - enrolled_children.column('mean grade')
delayed_edu

In [None]:
# Mark a child as delayed ('Yes') when the grade gap is negative, else 'No'.
delayed = ['Yes' if grade_gap < 0 else 'No' for grade_gap in delayed_edu]
enrolled_children = enrolled_children.with_column('delayed', delayed)
enrolled_children

Are children of teen moms more or less likely to be enrolled?

In [None]:
# TODO(exercise): filter `enrolled_children` to one group of mothers (column
# label and value go in the two where() placeholders), then group by the
# outcome column to count children in each category.
# NOTE(review): `enrolled_children` contains only currently-attending
# children, so the outcome compared here is presumably the 'delayed' flag —
# confirm against the question above.
enrolled_children.where('...', '...').group('...')