/
PropensityScoreWeighting.R
343 lines (263 loc) · 15.7 KB
/
PropensityScoreWeighting.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
#----------------------------#
# Propensity Score Weighting #
# Author: Enrijeta Shino #
# Last Update: Jan 10, 2017 #
#----------------------------#
#-----------------------------------------------------------------------------------------------------#
# Replication code for "Using Propensity Score Matching to Determine the Efficacy of Secondary Career #
# Academies in Raising Educational Aspirations" by Jay W. Rojewski, In Heok Lee, Sinan Gemici #
#-----------------------------------------------------------------------------------------------------#
#load required survey library
require(survey)
#normalize the base year student sampling weight so that it sums to the sample size
ELS.data.imputed$bystuwt = ELS.data.imputed$bystuwt/mean(ELS.data.imputed$bystuwt)
#bystuwt this is the sampling weight
#imputed$bystuwt/mean divideing the weight by the mean it is called "the normalization" of the weight
#define design object that describes the sample characteristics
#the variable psu identifies the primary sampling units (cluster ids)
#the variable STRATA_ID identifies the strata ids
#the variable bystuwt identifies the base-year sampling weights for
#respondents of the 2002 and 2004 waves (Base year and 1st follow-up)
surveyDesign = svydesign(ids=~psu, strata=~STRAT_ID, weights=~bystuwt,
data = ELS.data.imputed, nest=T)
#svydesign defines the object
#since the data are stratified we can use multilevel analysis as well, but we're not going to becuase there #is no need for it in this case
#get weighted percentages of treated and untreated cases
#the treatment variable is "BYS33K" - Ever in career academy from BY Student Questionnaire
(treatmentTable = svymean(~BYS33K, surveyDesign)) #8.8% of cases received treatment
#0.08 of the pop received career academy treatment.
table(ELS.data.imputed$BYS33K)
mean(ELS.data.imputed$BYS33K)
#-------------------------------------------------
#obtain weights for estimating the ATT for propesnity scores obtained with logistic regression
ELS.data.imputed$weightATT = with(ELS.data.imputed,ifelse(BYS33K==1,
1, pScores/(1-pScores)))
with(ELS.data.imputed, by(weightATT,BYS33K,summary))
#"with" allows to put the name of the variables without adding the name of the dataset
#the minimum should never be 0
#obtain weights for estimating the ATE
ELS.data.imputed$weightATE = with(ELS.data.imputed,ifelse(BYS33K==1,
1/pScores, 1/(1-pScores)))
#"ifelse" adds the condition
with(ELS.data.imputed, by(weightATE,BYS33K,summary))
#we run the summary with this command
#this weight seems to be too strong(or extreme) the max for treated is equal to 145.1.
#this can happen due to misspecification, lack of the common support
#one way to check for extreme weight we have to check the covariate balance first
#one conseqeuence of the extreme weight due to misspecification is failure to get covariate balance
#and the standard errors get inflated, making it difficult for us to find a treatment effect
#======================================================
#STRATEGIES TO DEAL WITH EXTREME WEIGHTS
#TRUNCATION OPTION
#check the 99th percentile weight:
with(ELS.data.imputed,quantile(weightATE, 0.99))
#check how many observatios will have truncated weights
with(ELS.data.imputed,sum(weightATE>quantile(weightATE, 0.99)))
#truncate weights at the 99th percentile:
ELS.data.imputed$weightATETruncated = with(ELS.data.imputed,
ifelse(weightATE > quantile(weightATE, 0.99),
quantile(weightATE, 0.99),weightATE))
#use "ifelse" to include truncation
#check truncated weights
with(ELS.data.imputed, by(weightATETruncated,BYS33K,summary))
#USE STABILIZAED WEIGHTS
#obtain stabilized weights for the estimation of ATE
#need to calcultae the constants first
ELS.data.imputed$C=with(ELS.data.imputed,ifelse(BYS33K==1,pScores,1-pScores))
surveyDesign = svydesign(ids=~psu, strata=~STRAT_ID, weights=~bystuwt,
data = ELS.data.imputed, nest=T) #recreate survey design
(constants = svyby(~C, by=~BYS33K, design=surveyDesign, FUN=svymean))
#"surveyby" gives the constant
#double paranthesis (one in the beggining and oen in the end of the command) in the end of the command is a #trick to print the output otherwise it won't print anything
ELS.data.imputed$stabilizedWeightATE = ifelse(ELS.data.imputed$BYS33K==1,
constants[1,2]/ELS.data.imputed$C,
constants[2,2]/ELS.data.imputed$C)
#check stabilized weigts for extremeness
with(ELS.data.imputed, by(stabilizedWeightATE,BYS33K,summary))
#compare the distributions of weights and stabilized weights for the ATE
tiff("Chapter3_figure3-1.tif", res=600, compression = "lzw", height=6, width=15, units="in")
with(ELS.data.imputed,
hist(weightATE[BYS33K==1], density = 10, angle = 45, main="Propensity Score Weights for the ATE",
breaks=seq(0,150,by=1),
xlim=c(0,150), ylim=c(0,100), xlab="Shaded = IPTW | Gray = Stabilized IPTW") )
with(ELS.data.imputed,
hist(stabilizedWeightATE[BYS33K==1], col=gray(0.4,0.25), breaks=seq(0,150,by=1),
xlim=c(0,150), ylim=c(0,100),add=T) )
dev.off()
#=====================================================
#WEIGHTS WITH PROPENSITY SCORE
#obtain weights with propensity scores estimated with random forests
#obtain weights for estimating the ATT for propesnity scores obtained with logistic regression
ELS.data.imputed$weightATTRf = with(ELS.data.imputed,ifelse(BYS33K==1,
1, pScoresRf/(1-pScoresRf)))
#pScoresRf this is for the random forest
with(ELS.data.imputed, by(weightATTRf,BYS33K,summary))
#obtain weights for estimating the ATE
ELS.data.imputed$weightATERf = with(ELS.data.imputed,ifelse(BYS33K==1,
1/pScoresRf, 1/(1-pScoresRf)))
with(ELS.data.imputed, by(weightATERf,BYS33K,summary))
#======================================================
#obtain weights with propensity scores estimated with GBM
#obtain weights for estimating the ATT for propesnity scores obtained with logistic regression
ELS.data.imputed$weightATTGBM = with(ELS.data.imputed,ifelse(BYS33K==1,
1, pScoresGBM[,1]/(1-pScoresGBM[,1])))
#pScoresGBM weight for the generalized boosted model
with(ELS.data.imputed, by(weightATTGBM,BYS33K,summary))
#obtain weights for estimating the ATE
ELS.data.imputed$weightATEGBM = with(ELS.data.imputed,ifelse(BYS33K==1,
1/pScoresGBM[,1], 1/(1-pScoresGBM[,1])))
with(ELS.data.imputed, by(weightATEGBM,BYS33K,summary))
#we used only one imputed dataset for all the analysis above
#==========================================================
#obtain weights for multiple imputed datasets
library(mitools)
#add propensity score mean to imputed datasets
allImputations <- update(allImputations, weightATT = ifelse(BYS33K==1,
1, pScores/(1-pScores)))
#update gets all the imputed dataset and applies it to them
with(allImputations, by(weightATT,BYS33K,summary))
# this analysis ahs to be doublechecked because the values are exactly the same and that seems to be a #problem. The values should be similar but not equal
#======================================================
#EVALUATE COVARIATE BALANCE
#create final weights for the base year
#the final weigth is the product of the propensity score weight
#and the sampling weight
ELS.data.imputed$finalWeightBY <- with(ELS.data.imputed,bystuwt*weightATT)
#evaluate covariate balance for ATT
require(twang)
#twang is used gor GBM but it is also good for obtaining tables
balanceTable <- bal.stat(ELS.data.imputed, vars= covariateNames,
treat.var = "BYS33K",
w.all = ELS.data.imputed$finalWeightBY, get.ks=F, #w.all is the final weight
sampw = ELS.data.imputed$bystuwt, #sampw is the sampling weight
estimand="ATT", multinom=F)
#Table with results of balance evaluation. The columns are:
# tx.mn - The mean of the treatment group
# tx.sd - The standard deviation of the treatment group
# ct.mn - The mean of the control group
# ct.sd - The standard deviation of the control group
# std.eff.sz - The standardized effect size, (tx.mn-ct.mn)/tx.sd. (this should be less than -1, or -.05)
# stat - the t-statistic for numeric variables and the chi-square statistic for categorical variables
# p - the p-value for the test associated with stat (we cannot use this becuase it is affected by the #sample size)
balanceTable <- balanceTable$results
#calculate variance ratio (variance of the treated divided by the varaince of the control)
balanceTable$varRatio <- with(balanceTable, tx.sd^2/ct.sd^2)
#summarize the covariate balance quality
(summaryBalance <- rbind(
std.eff.sz = summary(abs(balanceTable$std.eff.sz)), #standardized effect sizes
varRatio = summary(balanceTable$varRatio) ))#variance ratios
#the max in the std.eff is equal to .01 which is very good
#==============================================================
#create a function to return balance summaries for this particular dataset
balanceSummarizer = function(data=ELS.data.imputed, samplingWeight="bystuwt",
PSWeight, treatment="BYS33K",
effect="ATT", covariateNames) {
finalWeight <- data[,samplingWeight]*data[,PSWeight]
#evaluate covariate balance for ATT
require(twang)
balance.table = bal.stat(data, vars= covariateNames,
treat.var = treatment,
w.all = finalWeight, get.ks=F,
sampw = data[,samplingWeight],
estimand=effect, multinom=F)
balance.table = balance.table$results
#calculate variance ratio
balance.table$varRatio <- with(balance.table, tx.sd^2/ct.sd^2)
#summarize the covariate balance quality
return(rbind(
std.eff.sz = summary(abs(balance.table$std.eff.sz)), #standardized effect sizes
varRatio = summary(balance.table$varRatio) ))#variance ratios
#close function
}
#this loop is a nice and elegant piece but not required for the newbeies in R
#====================================================================
#obtain summaries of covariate balances with different propensity scores
Table3.1 <- rbind(
logistic = balanceSummarizer(PSWeight="weightATT",covariateNames=covariateNames),
RF = balanceSummarizer(PSWeight="weightATTRf",covariateNames=covariateNames),
GBM = balanceSummarizer(PSWeight="weightATTGBM",covariateNames=covariateNames))
Table3.1 = data.frame(Table3.1,
index=rep(c("std.eff.sz","varRatio"),3), method=rep(c("Logistic","RF","GBM"),each=2))
write.csv(Table3.1, file="Table3-1.csv")
Table3.2 <- rbind(
logistic = balanceSummarizer(PSWeight="weightATE",covariateNames=covariateNames),
truncated = balanceSummarizer(PSWeight="weightATETruncated",covariateNames=covariateNames),
stabilized = balanceSummarizer(PSWeight="stabilizedWeightATE",covariateNames=covariateNames),
RF = balanceSummarizer(PSWeight="weightATERf",covariateNames=covariateNames),
GBM = balanceSummarizer(PSWeight="weightATEGBM",covariateNames=covariateNames))
Table3.2 = data.frame(Table3.2,
index=rep(c("std.eff.sz","varRatio"),5),
method=rep(c("Logistic","Truncated","Stabilized","RF","GBM"),each=2))
write.csv(Table3.2, file="Table3-2.csv")
#use different methods and compare them to find the one that does better
#===============================================================
#create a final weights for estimating the effect of career academy participation
#on income in the second folow up (2006)
ELS.data.imputed$finalWeight2006 = with(ELS.data.imputed,F2BYWT*weightATT)
#normalize the base-year to second folow-up (2006) final weight
ELS.data.imputed$finalWeight2006 = ELS.data.imputed$finalWeight2006/mean(ELS.data.imputed$finalWeight2006)
#check distribution of final weights
summary(ELS.data.imputed$finalWeight2006)
#===========================================================
#ESTIMATE THE ATT
#re-create the survey design including the final weight for 2006
surveyDesign2006 <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~finalWeight2006,
data = ELS.data.imputed, nest=T)
#we do this to clarify the weight
#this is multilevel data but we don't want to use multilevel modeling, "psu" is the cluster
#this creates the survey design
#create replicate weights for bootstrapping
surveyDesign2006Boot = as.svrepdesign(surveyDesign2006, type=c("bootstrap"),replicates=1000)
#this one will create the same design but with a 1000 bootrstaps
#we could also substitute "bootstrap" with "jackknife"
#obtain ATT as weighted mean differences.
(weightedMeans=svyby(formula=~F2ERN5P2,by=~BYS33K,design=surveyDesign2006Boot,
FUN=svymean,covmat=TRUE))
(ATT2006 = svycontrast(weightedMeans, contrasts=c(-1,1)))
#svyby funstion to breakdown the estimates by using
#covmat- surevy covariate matrix
#svycontrast- to get the difference between means
pnorm(ATT2006[1]/c(ATT2006)[2])
pnorm(0.23395/0.0845)
pnorm(0.23395/0.0845, lower.tail=F)
#this is the p-value
#obtain the group variances
(weightedVars=svyby(formula=~F2ERN5P2,by=~BYS33K,design=surveyDesign2006Boot,
FUN=svyvar,covmat=TRUE))
#use the same strategy to get group variances
#estimate the ATT for 2006 with regression analysis for complex survey data
outcomeModel2006 = svyglm(F2ERN5P2~BYS33K,surveyDesign2006)
summary(outcomeModel2006)
#glm model
#the two estimators are the same, only SE are different
#re-estimate the ATT with regression, but this time obtain standard errors with bootstrapping
outcomeModel2006Boot = svyglm(F2ERN5P2~BYS33K,surveyDesign2006Boot)
summary(outcomeModel2006Boot)
#=====================================================================
#Estimate the ATT with multiple imputed datasets
#here we repeat the same but with multiple imputed dataset
#first, add the final weight to imputed datasets
allImputations <- update(allImputations, finalWeightATT = F2BYWT*weightATT)
#normalize the final weight
allImputations <- update(allImputations, finalWeightATT = finalWeightATT/mean(finalWeightATT))
#create survey design object for multiple imputed dastests
surveyDesign2006MI <- svydesign(ids=~psu, strata=~STRAT_ID, weights=~finalWeightATT,
data = allImputations, nest=T)
#estimate the ATT with regression in each imputed dataset
outcomeModel2006MI = with(surveyDesign2006MI, svyglm(F2ERN5P2~BYS33K))
#combine estimates from multiplle imputed datasets
resultsModel2006MI = MIcombine(outcomeModel2006MI)
summary(resultsModel2006MI)
#MIcombine- will combine the estiamtes obtained by imputed datssets
#=====================================================================
#Doubly-robust (DR) estimation with regression estimation of
#propensity score weighted means using the propensity score as a covariate
#this is without covariates, even though would be good to include covatiates
#estimate a regression of the outcome with the propensity score as a covariate
outcomeModelDR = svyglm(F2ERN5P2~BYS33K+pScores+I(pScores^2)+I(pScores^3),surveyDesign2006Boot)
#Estimate ATT with a doubly-robust method
weightedMeansDR = predict(outcomeModelDR,
newdata=data.frame(BYS33K=0:1, pScores=0.125), vcov=TRUE, type="response")
(ATT2006DR = svycontrast(weightedMeansDR, c(-1,1)))
#predict- predicts the two means given the model
#svycontrast- to get the treatment effect