In [83]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import seaborn as sns
from mizani.formatters import percent_format
import os
from plotnine import *
import numpy as np
import sys
import numpy as np
from datetime import datetime
from scipy.stats import norm
import statsmodels.api as sm
import statsmodels.formula.api as smf
from mizani import transforms

In [84]:
# Current script folder
current_path = os.getcwd()
dirname = "/".join(current_path.split("/")[:-2]) + "/"

# location folders
data_in = dirname + "da_data_repo/cps-earnings/clear/"
data_out = dirname + "da_case_studies/ch09-gender-age-earnings/"
output = dirname + "da_case_studies/ch09-gender-age-earnings/output/"
func = dirname + "da_case_studies/ch00-tech-prep/"
sys.path.append(func)

In [85]:
# Import the prewritten helper functions 
from py_helper_functions import *

In [86]:
data_all = pd.read_csv(data_in + "morg-2014-emp.csv")

In [87]:
#SELECT OCCUPATION
# keep only two occupation types: Market research analysts and marketing specialists 
#and Computer and Mathematical Occupations
data_all.loc[data_all["occ2012"]==735,"sample"]=1
data_all.loc[((data_all["occ2012"]>=1005) & (data_all["occ2012"]<=1240)),"sample"]=2
data_all.loc[data_all["sample"].isna(),"sample"]=0

In [88]:
data_all = data_all.loc[
    (data_all["sample"] == 1) | (data_all["sample"] == 2), :
].reset_index(drop=True)

In [89]:
data_all.head()

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,chldpres,prcitshp,state,ind02,occ2012,class,unionmme,unioncov,lfsr94,sample
0,33,731019430119001,January,AL,2992.1015,1538.46,40,43,2,,...,0,"Native, Born In US",63,"Electric power generation, transmission and di...",1006,"Private, For Profit",No,No,Employed-At Work,2.0
1,346,673032906039520,January,AK,411.5571,1346.15,40,39,3,,...,0,"Native, Born In US",94,Executive offices and legislative bodies (9211...,1030,Government - Local,No,No,Employed-At Work,2.0
2,651,207004430306994,January,AZ,3410.8853,2500.0,40,44,4,,...,0,"Foreign Born, US Cit By Naturalization",86,Computer systems design and related services (...,1020,"Private, For Profit",No,No,Employed-At Work,2.0
3,657,236096309400800,January,AZ,3916.3279,2500.0,40,43,4,,...,10,"Foreign Born, US Cit By Naturalization",86,Business support services (5614),1020,"Private, For Profit",No,No,Employed-At Work,2.0
4,724,914299270769003,January,AZ,5115.4707,1250.0,45,43,1,,...,0,"Native, Born In US",86,Computer systems design and related services (...,1020,"Private, For Profit",No,No,Employed-At Work,2.0


In [90]:
data_all["sample"].value_counts()

2.0    4740
1.0     281
Name: sample, dtype: int64

In [91]:
data_all["female"]=(data_all.sex==2).astype(int)
data_all["w"]=data_all["earnwke"]/data_all["uhours"]
data_all["lnw"]=np.log(data_all["w"])
data_all["agesq"]=np.power(data_all["age"],2)

In [92]:
i=1
data=data_all.loc[data_all["sample"]==i,:]
data.to_csv(data_out+"earnings_inference.csv",index=False)

In [93]:
#####################
#DISTRIBUTION OF EARNINGS
#######################
data.loc[:,["earnwke","uhours","w"]].describe()

Unnamed: 0,earnwke,uhours,w
count,281.0,281.0,281.0
mean,1206.18694,40.153025,29.061656
std,709.545222,10.325838,14.712495
min,40.0,5.0,7.25
25%,700.0,40.0,17.78825
50%,1096.15,40.0,25.95
75%,1538.0,40.0,37.019
max,2884.61,80.0,84.6


In [94]:
data.loc[data.w>=1,["earnwke","uhours","w"]].describe()

Unnamed: 0,earnwke,uhours,w
count,281.0,281.0,281.0
mean,1206.18694,40.153025,29.061656
std,709.545222,10.325838,14.712495
min,40.0,5.0,7.25
25%,700.0,40.0,17.78825
50%,1096.15,40.0,25.95
75%,1538.0,40.0,37.019
max,2884.61,80.0,84.6


In [95]:
data["female"].value_counts()

1    172
0    109
Name: female, dtype: int64

In [96]:
data.groupby(['occ2012', 'female']).size()

occ2012  female
735      0         109
         1         172
dtype: int64

In [97]:
##############################
#linear regressions
##############################

# First, look at them one by one

In [98]:
reg1=smf.ols(formula="lnw~female",data=data).fit()
reg1.summary(slim=True)

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.008
No. Observations:,281,F-statistic:,3.39
Covariance Type:,nonrobust,Prob (F-statistic):,0.0666

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.3149,0.048,69.004,0.000,3.220,3.409
female,-0.1131,0.061,-1.841,0.067,-0.234,0.008


In [99]:
reg2=smf.ols(formula="lnw~female",data=data).fit(cov_type="HC1")
reg2.summary(slim=True)

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.008
No. Observations:,281,F-statistic:,3.347
Covariance Type:,HC1,Prob (F-statistic):,0.0684

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.3149,0.049,67.810,0.000,3.219,3.411
female,-0.1131,0.062,-1.829,0.067,-0.234,0.008


In [100]:
reg3=smf.ols(formula="lnw~age",data=data).fit(cov_type="HC1")
reg3.summary(slim=True)

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.098
Model:,OLS,Adj. R-squared:,0.095
No. Observations:,281,F-statistic:,26.36
Covariance Type:,HC1,Prob (F-statistic):,5.31e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.7317,0.101,27.006,0.000,2.533,2.930
age,0.0135,0.003,5.135,0.000,0.008,0.019


In [101]:
reg4=smf.ols(formula="lnw~age+agesq",data=data).fit(cov_type="HC1")
reg4.summary(slim=True)

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.168
Model:,OLS,Adj. R-squared:,0.163
No. Observations:,281,F-statistic:,27.47
Covariance Type:,HC1,Prob (F-statistic):,1.29e-11

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1935,0.341,3.499,0.000,0.525,1.862
age,0.0962,0.018,5.360,0.000,0.061,0.131
agesq,-0.0010,0.000,-4.608,0.000,-0.001,-0.001


In [102]:
reg5=smf.ols(formula="lnw~lspline(age,[30,40])",data=data).fit(cov_type="HC1")
reg5.summary(slim=True)

0,1,2,3
Dep. Variable:,lnw,R-squared:,0.173
Model:,OLS,Adj. R-squared:,0.164
No. Observations:,281,F-statistic:,19.33
Covariance Type:,HC1,Prob (F-statistic):,2.07e-11

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.3834,0.369,3.753,0.000,0.661,2.106
"lspline(age, [30, 40])[0]",0.0624,0.014,4.559,0.000,0.036,0.089
"lspline(age, [30, 40])[1]",0.0165,0.010,1.680,0.093,-0.003,0.036
"lspline(age, [30, 40])[2]",-0.0026,0.006,-0.444,0.657,-0.014,0.009


In [73]:
#lowess not working, R code is the following:
#reg6 <- loess(lnw ~ age, data, control = loess.control(surface = "direct"))
#summary(reg6)

In [75]:
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML

In [103]:
Stargazer([reg1,reg2])

0,1,2
,,
,Dependent variable:lnw,Dependent variable:lnw
,,
,(1),(2)
,,
Intercept,3.315***,3.315***
,(0.048),(0.049)
female,-0.113*,-0.113*
,(0.061),(0.062)
Observations,281,281


In [104]:
Stargazer([reg3,reg4,reg5])

0,1,2,3
,,,
,Dependent variable:lnw,Dependent variable:lnw,Dependent variable:lnw
,,,
,(1),(2),(3)
,,,
Intercept,2.732***,1.193***,1.383***
,(0.101),(0.341),(0.369)
age,0.014***,0.096***,
,(0.003),(0.018),
agesq,,-0.001***,


In [118]:
##############################
# graphs
##############################
ggplot(data.loc[:,["age","lnw"]].dropna(), aes(x="age", y="lnw")) + geom_point(color=color[0])+ geom_smooth(
    method="loess", color=[1]
)  

AttributeError: 'int' object has no attribute 'lower'

In [109]:
+ scale_x_continuous(
    expand=(0.01, 0.01), limits=(20, 65), breaks=seq(20, 65, by=5)
) + scale_y_continuous(
    expand=(0.01, 0.01), limits=(1.5, 4.5), breaks=seq(1.5, 4.5, by=0.50)
) + labs(
    x="Age (years)", y="ln(earnings per hour)"
) + theme_bw()

AttributeError: 'int' object has no attribute 'lower'