In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to the plots drawn in this notebook
plt.style.use('ggplot')
# Show floats with 4 decimal places. The fully-qualified option name is used
# because the bare key 'precision' is ambiguous (and raises OptionError) on
# pandas versions that register more than one option ending in 'precision'.
pd.set_option('display.precision', 4)

In [2]:
# Load the survey extract. DataFrame.from_csv has been removed from pandas;
# read_csv with index_col=0 and parse_dates=True reproduces its defaults.
df = pd.read_csv('atus_00005.csv', index_col=0, parse_dates=True)

# Sleep time multiplied by the main weight column
df['weighted_sleep'] = df.BLS_PCARE_SLEEP * df.WT06

# Turn the numeric SEX codes 1/2 into readable labels
df['SEX'] = df['SEX'].replace([1, 2], ['Male', 'Female'])

# NOTE(review): the original cell called .astype('category') on EMPSTAT and SEX
# without assigning the result, so neither column ever changed dtype. Those
# no-op calls are dropped here; the dtypes are deliberately left as they were
# because a categorical SEX key could change the shape of the groupby results
# computed further down. Assign the conversion explicitly if it is wanted.

# Bin AGE into left-closed intervals [15, 20), [20, 25), ... [65, 90).
# The original passed right='False' (a non-empty string, hence truthy), which
# silently produced right-closed bins and left age 15 outside every bin.
df['AGE'] = pd.cut(df['AGE'], [15, 20, 25, 35, 45, 55, 65, 90], right=False)

In [3]:
# Read state_id.txt as two comma-separated string columns: codes, then names
num, name = np.loadtxt('state_id.txt', dtype=str, delimiter=',', unpack=True)

# Swap every numeric code found in STATEFIP for the matching state name
state_codes = num.astype(np.int64)
df['STATEFIP'] = df['STATEFIP'].replace(state_codes, name)

In [4]:
def wavg(group, name):
    """Return the weighted average of BLS_PCARE_SLEEP for one group.

    Parameters
    ----------
    group : DataFrame-like
        Must provide a 'BLS_PCARE_SLEEP' column and the weight column ``name``.
    name : str
        Label of the column holding the weights.

    Returns
    -------
    The sum of value * weight divided by the sum of the weights.
    """
    weights = group[name]
    weighted_total = (group['BLS_PCARE_SLEEP'] * weights).sum()
    return weighted_total / weights.sum()


In [5]:
# Group the records by YEAR, METAREA and SEX
group_keys = ['YEAR', 'METAREA', 'SEX']
grouped = df.groupby(group_keys)


In [6]:
# Error calculation with replicate weights: every replicate weight column is
# needed (the original note says there are 160 of them).
# Collect the positions of the columns whose label contains 'RWT06_' ...
all_columns = df.columns.values
indices = [pos for pos in range(len(all_columns)) if 'RWT06_' in all_columns[pos]]
# ... and keep just those labels
column_names = all_columns[indices]

In [7]:
# Weighted mean of BLS_PCARE_SLEEP per group, using the main weight column
meanData = grouped.apply(wavg, 'WT06')

# Squared deviation of each replicate-weight estimate from the main estimate.
# The loop variable is deliberately not called `name`: the original loop did
# that and thereby overwrote the module-level `name` array of state names
# loaded from state_id.txt.
data = [
    (grouped.apply(wavg, weight_col) - meanData) ** 2
    for weight_col in column_names
]

# Scale the summed squared deviations by 4/160 and take the square root.
# NOTE(review): 4/160 presumably is the replicate-variance factor for 160
# replicate weights -- confirm against the survey documentation.
# NOTE(review): the final /60 presumably converts minutes to hours, while
# meanData above is left unscaled -- confirm the intended units.
error = np.sqrt(sum(data) * (4. / 160)) / 60

In [8]:
# Preview the first rows of the weighted group means
meanData.head()


YEAR  METAREA  SEX   
2005  0        Female    522.270
               Male      511.201
      460      Female    537.926
               Male      686.769
      1121     Female    481.474
dtype: float64

In [9]:
# Preview the first rows of the replicate-weight error estimates
# (these were divided by 60 when computed; meanData was not)
error.head()

YEAR  METAREA  SEX   
2005  0        Female    0.034
               Male      0.046
      460      Female    0.114
               Male      3.786
      1121     Female    0.192
dtype: float64