# American Time Use Survery (ATUS) data

[Reference document](https://www.bls.gov/tus/atusintcodebk18.pdf)

[Activity lexicode reference](https://www.bls.gov/tus/lexiconwex2018.pdf)

In [48]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce
%matplotlib notebook
sns.set()

_data_folder = os.path.join(os.getcwd(), 'data')

In [19]:
df_summary = pd.read_csv(
    os.path.join(_data_folder, 'atussum-2018', 'atussum_2018.dat'))
df_summary.head()

Unnamed: 0,TUCASEID,TUFINLWGT,TRYHHCHILD,TEAGE,TESEX,PEEDUCA,PTDTRACE,PEHSPNON,GTMETSTA,TELFS,...,t181501,t181599,t181601,t181801,t189999,t500101,t500103,t500105,t500106,t500107
0,20180101180006,9456372.0,-1,42,1,40,1,2,2,3,...,0,0,0,0,0,0,0,0,0,0
1,20180101180021,4010486.0,-1,58,1,39,1,2,1,5,...,0,0,0,0,0,0,0,0,0,0
2,20180101180025,35193940.0,-1,65,1,37,2,2,1,5,...,0,0,0,0,0,0,0,0,0,0
3,20180101180054,2251398.0,-1,72,1,40,2,2,1,5,...,0,0,0,0,0,4,0,0,0,0
4,20180101180060,1029459.0,-1,66,2,40,2,2,2,2,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df_response = pd.read_csv(
    os.path.join(_data_folder, 'atusresp-2018', 'atusresp_2018.dat'))
df_response.head()

Unnamed: 0,TUCASEID,TULINENO,TUYEAR,TUMONTH,TEABSRSN,TEERN,TEERNH1O,TEERNH2,TEERNHRO,TEERNHRY,...,TXSPEMPNOT,TXSPUHRS,TXTCC,TXTCCTOT,TXTCOC,TXTHH,TXTNOHH,TXTO,TXTOHH,TXTONHH
0,20180101180006,1,2018,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,0,0,-1,-1,-1,-1,-1
1,20180101180021,1,2018,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,0,0,-1,-1,-1,-1,-1
2,20180101180025,1,2018,1,-1,-1,-1,-1,-1,-1,...,0,0,-1,0,0,-1,-1,-1,-1,-1
3,20180101180054,1,2018,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,0,0,-1,-1,-1,-1,-1
4,20180101180060,1,2018,1,14,-1,-1,1300,-1,1,...,-1,-1,-1,0,0,-1,-1,-1,-1,-1


## Data analysis
### Work hours for employed

In [21]:
df_employed = df_summary[
    True
    & (df_summary['TELFS'] == 1) # Employed
    & (df_summary['TRDPFTPT'] == 1)  # full-time
    & (df_summary['TEHRUSLT'] > 0)  # remove noise
]

fns = ['count', np.mean, np.std]
df_employed.groupby('TESEX').agg({'TEHRUSLT': fns})

Unnamed: 0_level_0,TEHRUSLT,TEHRUSLT,TEHRUSLT
Unnamed: 0_level_1,count,mean,std
TESEX,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,2271,45.8679,9.376096
2,1990,43.046231,7.360765


In [22]:
df_employed['TEHRUSLT'].agg(['count', np.mean, np.std]).to_frame().T

Unnamed: 0,count,mean,std
TEHRUSLT,4261.0,44.550106,8.609534


In [49]:
sns.distplot(df_employed[df_employed['TESEX'] == 1]['TEHRUSLT'] / 1., 
             kde=False, bins=50, label='Male')
sns.distplot(df_employed[df_employed['TESEX'] == 2]['TEHRUSLT'] / 1., 
             kde=False, bins=50, label='Female')
plt.legend()
plt.xlabel('Hours worked per week')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

### How about a specific industry?

In [7]:
# business
df_w_resp = df_employed.merge(
    df_response[['TUCASEID', 'TRMJIND1']], how='left', on='TUCASEID')
df_business = df_w_resp[df_w_resp['TRMJIND1'].isin([9,])]
df_business.groupby('TESEX').agg({'TEHRUSLT': fns})

Unnamed: 0_level_0,TEHRUSLT,TEHRUSLT,TEHRUSLT
Unnamed: 0_level_1,count,mean,std
TESEX,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,346,45.309249,7.969127
2,260,43.296154,6.993156


In [8]:
df_business['TEHRUSLT'].agg(['count', np.mean, np.std]).to_frame().T

Unnamed: 0,count,mean,std
TEHRUSLT,606.0,44.445545,7.625284


### Activity hours for employed
#### Sleep (all employed)

In [9]:
sleeping_cols = [x for x in df_summary.columns if x.startswith('t0101')]
df_sleep = df_employed.join(
    df_employed[sleeping_cols].sum(axis=1).to_frame('total_sleep'))

def mean_fn(x): return np.mean(x) / 60.
def std_fn(x): return np.std(x / 60.)
df_sleep.groupby('TESEX').agg({'total_sleep': 
                               ['count', mean_fn, std_fn]})

Unnamed: 0_level_0,total_sleep,total_sleep,total_sleep
Unnamed: 0_level_1,count,mean_fn,std_fn
TESEX,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,2271,8.575892,2.048032
2,1990,8.69,2.128587


In [10]:
# note the unit difference -- here we use minutes
df_sleep['total_sleep'].agg(['count', np.mean, np.std]).to_frame().T

Unnamed: 0,count,mean,std
total_sleep,4261.0,517.750997,125.223735


#### Sleep (business)

In [11]:
df_sleep.merge(df_business[['TUCASEID']], how='inner', on=['TUCASEID'])\
    .groupby('TESEX').agg({'total_sleep': 
                           ['count', mean_fn, std_fn]})

Unnamed: 0_level_0,total_sleep,total_sleep,total_sleep
Unnamed: 0_level_1,count,mean_fn,std_fn
TESEX,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,346,8.471773,1.760404
2,260,8.694808,2.120231


In [12]:
# note the unit difference -- here we use minutes
df_sleep.merge(df_business[['TUCASEID']], how='inner', on=['TUCASEID'])\
    ['total_sleep'].agg(['count', np.mean, np.std]).to_frame().T

Unnamed: 0,count,mean,std
total_sleep,606.0,514.047855,115.668389


#### Housework (all employed)

In [13]:
housework_cols = [x for x in df_summary.columns 
                          if (x.startswith('t0201') 
                              or x.startswith('t0202')
                              or x.startswith('t0203'))]
df_housework = df_employed.join(
    df_employed[housework_cols].sum(axis=1).to_frame('total_housework'))
df_housework.groupby('TESEX').agg({'total_housework': 
                                 ['count', mean_fn, std_fn]})

Unnamed: 0_level_0,total_housework,total_housework,total_housework
Unnamed: 0_level_1,count,mean_fn,std_fn
TESEX,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,2271,0.798892,1.326651
2,1990,1.510653,1.813749


In [14]:
# note the unit difference -- here we use minutes
df_housework['total_housework'].agg(['count', np.mean, np.std]).to_frame().T

Unnamed: 0,count,mean,std
total_housework,4261.0,67.878198,96.767855


#### Housework (business)

In [15]:
df_housework.merge(df_business[['TUCASEID']], how='inner', on=['TUCASEID'])\
    .groupby('TESEX').agg({'total_housework': 
                           ['count', mean_fn, std_fn]})

Unnamed: 0_level_0,total_housework,total_housework,total_housework
Unnamed: 0_level_1,count,mean_fn,std_fn
TESEX,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,346,0.748314,1.225232
2,260,1.642628,1.913935


In [16]:
# note the unit difference -- here we use minutes
df_housework.merge(df_business[['TUCASEID']], how='inner', on=['TUCASEID'])\
    ['total_housework'].agg(['count', np.mean, np.std]).to_frame().T

Unnamed: 0,count,mean,std
total_housework,606.0,67.920792,97.285625


#### Leisure and sport (all employed)

In [17]:
leisure_and_sport_cols = [x for x in df_summary.columns 
                          if x.startswith('t12') or x.startswith('t12')]
df_l_and_s = df_employed.join(
    df_employed[leisure_and_sport_cols].sum(axis=1).to_frame('total_leisure_and_sport'))
df_l_and_s.groupby('TESEX').agg({'total_leisure_and_sport': 
                                 ['count', mean_fn, std_fn]})

Unnamed: 0_level_0,total_leisure_and_sport,total_leisure_and_sport,total_leisure_and_sport
Unnamed: 0_level_1,count,mean_fn,std_fn
TESEX,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,2271,4.117342,3.189402
2,1990,3.564263,2.808569


In [24]:
# note the unit difference -- here we use minutes
df_l_and_s['total_leisure_and_sport'].agg(['count', np.mean, np.std]).to_frame().T

Unnamed: 0,count,mean,std
total_leisure_and_sport,4261.0,231.542361,181.828585


## Visualization

In [85]:
# only for ppl in the business industry
df_tmp = reduce(lambda x1, x2: pd.merge(x1, x2, how='left', on='TUCASEID'), 
                [df_business[['TUCASEID']], 
                 df_employed[['TUCASEID', 'TESEX', 'TEHRUSLT']],
                 df_housework[['TUCASEID', 'total_housework']]])
df_tmp.rename(columns={'TESEX': 'Gender',
                       'TEHRUSLT': 'Working and \nwork-related activities',
                       'total_housework': 'Housework activities'}, inplace=True)
df_tmp['Gender'] = df_tmp['Gender'].apply(lambda x: 'Male' if x == 1 else 'Female')
df_tmp['Working and \nwork-related activities'] = \
    df_tmp['Working and \nwork-related activities']
df_tmp['Housework activities'] = df_tmp['Housework activities'] / 60 * 7.

df_tmp_grouped = df_tmp.groupby('Gender').mean().reset_index()
df_bar = pd.melt(df_tmp_grouped, id_vars=['Gender'], 
                 value_vars=['Working and \nwork-related activities', 
                             'Housework activities'])
df_bar

Unnamed: 0,Gender,variable,value
0,Female,Working and \nwork-related activities,43.296154
1,Male,Working and \nwork-related activities,45.309249
2,Female,Housework activities,11.498397
3,Male,Housework activities,5.238198


In [88]:
f, ax = plt.subplots(figsize=(7, 2.4))
sns.barplot(x='value', y='variable', hue='Gender', data=df_bar, orient='h')
plt.xlabel('Hour')
plt.ylabel('')
plt.tight_layout()

<IPython.core.display.Javascript object>

In [89]:
sns.scatterplot(x='Working and \nwork-related activities', 
                y='Housework activities', hue='Gender', data=df_tmp)
plt.xlabel('Working and work-related activities')
plt.tight_layout()

<IPython.core.display.Javascript object>