In [2]:
using CSV, DataFrames, GLM, Statistics
include("data_utils.jl")

recode_raw_survey_data! (generic function with 1 method)

#### Data Description
Q1 is income, Q2 is age, Q3 is education, and Q4 is car ownership. 

### 1. Generate Baseline OLS Coefficients

In [56]:
# Load the baseline survey responses and fit the baseline OLS model.
datapath = "data/raw/baseline.csv"
# Read from `datapath` (instead of repeating the literal) so the path is defined in one place.
# NOTE(review): `DataFrame!` only exists in older DataFrames.jl releases; on current versions
# the equivalent is `DataFrame(CSV.File(datapath); copycols=false)` — confirm the pinned version.
data = DataFrame!(CSV.File(datapath))
# Mutating helper (note the `!`); presumably defined in data_utils.jl — its recoding rules
# are not visible in this notebook.
recode_raw_survey_data!(data)
# Regress Q1 on Q2, Q3 and Q4. Kept as the last expression so the notebook displays the fit.
ols = lm(@formula(Q1 ~ Q2 + Q3 + Q4), data)

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,LinearAlgebra.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Q1 ~ 1 + Q2 + Q3 + Q4

Coefficients:
──────────────────────────────────────────────────────────────────────────────
              Estimate  Std. Error   t value  Pr(>|t|)   Lower 95%   Upper 95%
──────────────────────────────────────────────────────────────────────────────
(Intercept)  -75.2504    18.3489    -4.10109    <1e-4   -111.361    -39.1396
Q2             0.30543    0.187875   1.62571    0.1051    -0.06431    0.675171
Q3             6.69969    1.07616    6.22556    <1e-8      4.5818     8.81758
Q4            16.0259     6.0055     2.66853    0.0080     4.20696   27.8448
──────────────────────────────────────────────────────────────────────────────

### 2. Set Reward Function

In [42]:
# Derive the reward-function quantities from the current OLS fit and print diagnostics.
dscale = 3.0 / 3.0
beta = coef(ols)
println(beta)
# Perturbation: zero for the intercept, one third of each of the three slope coefficients.
perturb = vcat(0, beta[2:4]) ./ 3
println(mean(data.Q1))
# Regressor means, with a leading 1 standing in for the intercept column.
xmeans = [1, map(mean, (data.Q2, data.Q3, data.Q4))...]
println("means: ", xmeans)
println("perturbs: ", perturb)
# Each row is (label, coefficient vector, regressor vector); the printed value is the
# scaled linear prediction sum(coefs .* x) * dscale.
let raised = beta .+ perturb, lowered = beta .- perturb,
    x_hi = [1, 85, 20, 1], x_lo = [1, 21, 10, 0]

    for (label, coefs, x) in (
        ("mean prediction: ", beta, xmeans),
        ("mean prediction at max perturb: ", raised, xmeans),
        ("mean prediction at min perturb: ", lowered, xmeans),
        ("maximum possible prediction: ", raised, x_hi),
        ("minimum possible prediction: ", lowered, x_lo),
        ("minimum no perturb: ", beta, x_lo),
        ("maximum no perturb: ", beta, x_hi),
    )
        println(label, sum(coefs .* x) * dscale)
    end
end



[-75.25042579332933, 0.30543025479655256, 6.699688591607267, 16.025854916560096]
53.61666666666667
means: [1.0, 37.76, 15.52, 0.8333333333333334]
perturbs: [0.0, 0.10181008493218419, 2.233229530535756, 5.341951638853366]
mean prediction: 53.61666666666669
mean prediction at max perturb: 96.57236415333206
mean prediction at min perturb: 10.66096918000135
maximum possible prediction: 159.39117208188722
minimum possible prediction: -26.30981161546248
minimum no perturb: -1.8395045265290548
maximum no perturb: 100.73077261308308


## Naive Step 1

In [46]:
# Regressor means for the currently loaded data, with a leading 1 for the intercept.
xmeans = [1, map(mean, (data.Q2, data.Q3, data.Q4))...]
# Print the mean of the response (Q1) followed by the regressor means.
println(mean(data.Q1))
println(xmeans)

52.691029900332225
[1.0, 38.17607973421927, 15.634551495016611, 0.8205980066445183]


In [58]:
# Load the naive step-1 responses and refit the same OLS specification.
datapath = "data/raw/n_step1.csv"
data = datapath |> CSV.File |> DataFrame!
# Mutates `data` in place (note the `!`); the recoding rules live outside this notebook.
recode_raw_survey_data!(data)
# Last expression of the cell, so the fitted model is displayed.
ols = lm(@formula(Q1 ~ Q2 + Q3 + Q4), data)

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,LinearAlgebra.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Q1 ~ 1 + Q2 + Q3 + Q4

Coefficients:
────────────────────────────────────────────────────────────────────────────────
               Estimate  Std. Error   t value  Pr(>|t|)    Lower 95%   Upper 95%
────────────────────────────────────────────────────────────────────────────────
(Intercept)  -51.3109     15.1518    -3.38646    0.0008  -81.1294     -21.4924
Q2             0.400729    0.160917   2.49028    0.0133    0.0840471    0.717411
Q3             4.60589     0.889252   5.17951    <1e-6     2.85586      6.35593
Q4            20.3421      5.36285    3.79314    0.0002    9.78805     30.8961
────────────────────────────────────────────────────────────────────────────────

In [30]:
# Display the Q2 column of the currently loaded data (Q2 is age, per the data description).
data.Q2

303-element SentinelArrays.SentinelArray{Int64,1,Int64,Missing,Array{Int64,1}}:
 29
 29
 69
 29
 49
 39
 39
 49
 29
 21
 39
 21
 29
  ⋮
 39
 39
 29
 69
 21
 29
 39
 21
 29
 49
 49
 29

In [59]:
# Summary statistics for the currently loaded data.
# Regressor means, with a leading 1 standing in for the intercept column.
xmeans = vcat(1, mean(data.Q2), mean(data.Q3), mean(data.Q4))
println(mean(data.Q1))

# Standard deviations of the response (Q1) and the three regressors, in column order.
println("stds: ", [std(column) for column in (data.Q1, data.Q2, data.Q3, data.Q4)])
println("means: ", xmeans)


52.691029900332225
stds: [38.136608212264626, 12.79839414068374, 2.291870171171589, 0.38432751813526606]
means: [1.0, 38.17607973421927, 15.634551495016611, 0.8205980066445183]


In [54]:
# Load the e_step1 responses and refit the same OLS specification.
datapath = "data/raw/e_step1.csv"
# `CSV.File` (the original `tCSV.File` referenced an undefined name and would throw).
data = DataFrame!(CSV.File(datapath))
# Mutates `data` in place (note the `!`); the recoding rules live outside this notebook.
recode_raw_survey_data!(data)
# Regress Q1 on Q2, Q3 and Q4. Kept as the last expression so the notebook displays the fit.
ols = lm(@formula(Q1 ~ Q2 + Q3 + Q4), data)

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,LinearAlgebra.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Q1 ~ 1 + Q2 + Q3 + Q4

Coefficients:
───────────────────────────────────────────────────────────────────────────────
               Estimate  Std. Error   t value  Pr(>|t|)   Lower 95%   Upper 95%
───────────────────────────────────────────────────────────────────────────────
(Intercept)  -52.7511     16.1289    -3.2706     0.0012  -84.4926    -21.0097
Q2             0.198445    0.16701    1.18823    0.2357   -0.130227    0.527117
Q3             5.07875     0.923483   5.49956    <1e-7     3.26135     6.89615
Q4            22.7685      5.8356     3.90166    0.0001   11.2842     34.2529
───────────────────────────────────────────────────────────────────────────────

### 3. Update Coefficients Based on Experimental Gradient Estimate

In [34]:
# Display the mean of Q3 for whatever `data` currently holds.
# NOTE(review): the recorded output (15.52) equals the Q3 mean printed for the baseline data,
# so this cell appears to have been run while the baseline file was loaded — re-run to refresh.
mean(data.Q3)

15.52

In [35]:
# Display the coefficient vector of the current `ols` fit (intercept first, then Q2, Q3, Q4).
# NOTE(review): the recorded output equals the baseline coefficients printed earlier, so this
# cell appears to have been run against the baseline fit — re-run to refresh.
coef(ols)

4-element Array{Float64,1}:
 -75.25042579332933
   0.30543025479655256
   6.699688591607267
  16.025854916560096

In [36]:
# Display the current value of `xmeans`.
# NOTE(review): the recorded output is a 1×4 matrix, whereas every visible assignment of
# `xmeans` in this notebook builds a 4-element vector — confirm which shape later cells expect.
xmeans

1×4 Array{Float64,2}:
 1.0  37.76  15.52  0.833333

In [None]:
beta.*