In [3]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('max_colwidth', None)

plt.style.use('fivethirtyeight')
sns.set()
sns.set_context("talk")
%matplotlib inline
pd.set_option('display.max_columns', None)

# Lab [Number]: Mastering Metrics Recreation

In this lab, we'll be recreating the findings from this [paper](http://assets.press.princeton.edu/chapters/s10363.pdf). The original analysis was done using a statistical software package called Stata. You can view the original Stata code [here](http://www.masteringmetrics.com/wp-content/uploads/2020/04/NHIS2009_hicompare_v2.do). In this notebook, we'll be doing the same analysis using `python` and `pandas` instead.

## Load Data
Load the `NHIS2009_clean.dta` Stata data file into a pandas DataFrame.  

In [4]:
# Run this cell to load our sample data.
#
# The Stata file is located via the NHIS_DATA_PATH environment variable; when
# that is unset, it is expected in the notebook's working directory. This keeps
# the notebook runnable on any machine rather than only where one user's
# absolute Windows path happens to exist.
data_path = Path(os.environ.get("NHIS_DATA_PATH", "NHIS2009_clean.dta"))
if not data_path.is_file():
    raise FileNotFoundError(
        f"Could not find {data_path}. Place NHIS2009_clean.dta next to this "
        "notebook or set the NHIS_DATA_PATH environment variable to its location."
    )
data_string = str(data_path)  # original variable name kept in case later cells refer to it

# convert_missing=True keeps Stata's missing-value markers (displayed as ".")
# instead of replacing them with NaN. Per the pandas documentation, columns
# containing such markers are returned with object dtype.
df = pd.read_stata(data_path, convert_missing=True)
df.head()

Unnamed: 0,year,inc1,inc2,inc3,inc4,inc5,inc6,inc7,inc8,serial,hhweight,pernum,perweight,sampweight,age,marstat,sex,famsize,relate,racenew,educ,educrec1,empstat,incfam07on,health,uninsured,age2,fml,nwhite,hi,yedu,empl,hlth,inc,incmp,brooks,marradult,marradult_empl,adltempl,hi_hsb1
0,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,1,4931,1,7795.0,11057.0,20,Never married,Male,1 person,Householder,Black/African American,"Some college, no degree",1 to 3 years of college,Working for pay at job/business,"$0 - $34,999",Very Good,Not covered,400.0,0.0,1.0,0.0,14,1.0,4.0,19282.931641,17500.0,1.0,0.0,0.0,0.0,.
1,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,3,7871,1,8938.0,22029.0,29,Married,Female,4 people,Householder,White,"Some college, no degree",1 to 3 years of college,Not in labor force,"$0 - $34,999",Very Good,Not covered,841.0,1.0,0.0,0.0,14,0.0,4.0,19282.931641,17500.0,1.0,1.0,0.0,1.0,.
2,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,3,7871,2,7602.0,20363.0,10,NIU,Female,4 people,Child (bio/adopt/in-law/step/foster) of householder,White,Grade 4,Grade 4,NIU,"$0 - $34,999",Excellent,Covered,100.0,1.0,0.0,1.0,4,0.0,5.0,19282.931641,17500.0,0.0,0.0,0.0,1.0,.
3,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,3,7871,3,8594.0,0.0,4,NIU,Female,4 people,Child (bio/adopt/in-law/step/foster) of householder,White,NIU,NIU,NIU,"$0 - $34,999",Excellent,Covered,16.0,1.0,0.0,1.0,.,0.0,5.0,19282.931641,17500.0,0.0,0.0,0.0,1.0,.
4,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,3,7871,4,8967.0,0.0,35,Married,Male,4 people,Spouse,White,Grade 11,Grade 11,Working for pay at job/business,"$0 - $34,999",Very Good,Not covered,1225.0,0.0,0.0,0.0,11,1.0,4.0,19282.931641,17500.0,1.0,1.0,1.0,1.0,0


Let's start by taking a look at the data to see if there are any issues to fix before we begin our analysis. 
 * Hint: There is a built-in `pandas` tool that will help us do this. Check out how to use this tool [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html).

In [10]:
# Build the summary-statistics table (count, mean, std, min, quartiles, max)
# using the tool from the hint, then display it as the cell's output.
# Only the columns pandas treats as numeric appear in the default summary.
df_summary = df.describe()
df_summary

Unnamed: 0,year,inc1,inc2,inc3,inc4,inc5,inc6,inc7,inc8,serial,hhweight,pernum,perweight,sampweight,age2,fml,nwhite,hi,empl,hlth,inc,incmp,brooks,marradult,marradult_empl,adltempl
count,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0,80634.0
mean,2009.47229,19292.210938,41708.820312,61106.566406,86013.53125,167919.546875,25375.941406,109663.078125,70811.601562,20422.576345,2882.747861,2.240172,3418.693888,3558.650222,1740.557861,0.51637,0.254024,0.825917,0.442084,3.884143,68465.328125,64391.335938,0.586142,0.374482,0.24192,0.818389
std,0.472091,9.27561,29.468706,3.59512,27.765621,75.035095,22.388733,62.218945,23.039238,11907.151576,1975.252937,1.395689,2350.206434,5924.825923,1770.217041,0.49995,0.435293,0.379184,0.496557,1.052505,54488.144531,48474.695312,0.492567,0.484127,0.428243,0.863409
min,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,1.0,724.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19282.931641,17500.0,0.0,0.0,0.0,0.0
25%,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,10047.0,1501.0,1.0,1870.0,0.0,256.0,0.0,0.0,1.0,0.0,3.0,19282.931641,17500.0,0.0,0.0,0.0,0.0
50%,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,20350.0,2441.0,2.0,2853.0,0.0,1156.0,1.0,0.0,1.0,0.0,4.0,61102.972656,62500.0,1.0,0.0,0.0,1.0
75%,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,30757.0,3433.75,3.0,4165.0,5465.0,2704.0,1.0,1.0,1.0,1.0,5.0,85985.78125,87500.0,1.0,1.0,0.0,2.0
max,2009.0,19282.931641,41679.34375,61102.972656,85985.78125,167844.53125,25398.326172,109725.273438,70834.640625,41177.0,26014.0,18.0,31688.0,121699.0,7225.0,1.0,1.0,1.0,1.0,5.0,167844.53125,150000.0,1.0,1.0,1.0,2.0
