# Table of Contents
 <p>

In [1]:
using LaTeXStrings
using GaussianProcesses
using Distributions
using Base.LinAlg
using Distances
import PyPlot; plt=PyPlot
using DataFrames
using GeoJSON
using Optim
using GLM
import MultivariateStats
mvst=MultivariateStats
plt.rc("figure", dpi=300.0)
# plt.rc("figure", figsize=(6,4))
# plt.rc("figure", autolayout=true)
plt.rc("savefig", dpi=300.0)
plt.rc("text", usetex=true)
plt.rc("font", family="serif")
plt.rc("font", serif="Palatino")
;

In [2]:
Tucson_sales=readtable("Tucson_data/processed/Tucson_sales.csv", header=true)
# categorical variables
pool!(Tucson_sales, [
    :PropertyType, 
    :SchDistr, 
    :Financing, 
    :ValidationDescription, 
    :BuyerSellerRelated, 
    :Solar,
    :PersonalProperty, 
    :PartialInterest,
    :HEAT,
    :COOL,
    :QUALITY,
    :CLASS,
    :WALLS,
    :ROOF,
    :GARAGE,
    ])
believable = zeros(Bool, size(Tucson_sales,1))
for i in 1:size(Tucson_sales,1)
    if isna(Tucson_sales[i,:BuyerSellerRelated])
        continue
    end
    if Tucson_sales[i,:BuyerSellerRelated]!="No"
        continue
    end
    if isna(Tucson_sales[i,:PartialInterest])
        continue
    end
    if Tucson_sales[i,:PartialInterest]!="No"
        continue
    end
    if isna(Tucson_sales[i,:PropertyType])
        continue
    end
    if !(Tucson_sales[i,:PropertyType] in ("2-4 Plex", "Condo/Townhouse", "Single Family"))
        continue
    end
    if isna(Tucson_sales[i,:ValidationDescription])
        continue
    end
    if !(Tucson_sales[i,:ValidationDescription] in ("Good Sale", "Buyer/Seller has an Out-Of-State Address"))
        continue
    end
    if isna(Tucson_sales[i,:SQFT])
        continue
    end
    believable[i] = true
end

In [3]:
Y_dict=Dict{String, Vector{Float64}}()
X_dict=Dict{String, Array{Float64,2}}()
schdistrs = unique(Tucson_sales[:,:SchDistr])
for name in schdistrs
    Y_dict[name] = log(Tucson_sales[believable & (Tucson_sales[:,:SchDistr] .== name),:SalePrice])
    X_dict[name] =     Tucson_sales[believable & (Tucson_sales[:,:SchDistr] .== name),[:X_PRJ, :Y_PRJ]]'
end
sentinels_json=GeoJSON.parsefile("Tucson_data/processed/SchoolDistrict_borders/SchoolDistrict_sentinels.json")
sentinels=Dict{Tuple{String,String},GeoJSON.MultiPoint}()
for f in sentinels_json.features
    key = (f.properties["SchoolDistrict1"], f.properties["SchoolDistrict2"])
    sentinels[key] = f.geometry
end
sum(believable)

 in depwarn(::String, ::Symbol) at ./deprecated.jl:64
 in transpose(::DataFrames.DataFrame) at ./deprecated.jl:771
 in ctranspose(::DataFrames.DataFrame) at ./operators.jl:310
 in macro expansion; at ./In[3]:6 [inlined]
 in anonymous at ./<missing>:?
 in include_string(::String, ::String) at ./loading.jl:380
 in execute_request(::ZMQ.Socket, ::IJulia.Msg) at /Users/imolk/.julia/v0.5/IJulia/src/execute_request.jl:170
 in eventloop(::ZMQ.Socket) at /Users/imolk/.julia/v0.5/IJulia/src/IJulia.jl:143
 in (::IJulia.##25#31)() at ./task.jl:360
while loading In[3], in expression starting on line 4


22203

In [4]:
Tucson_sales[:logSalePrice] = log(Tucson_sales[:SalePrice]);

In [5]:
typealias MultiGP Vector{GP}

Array{GaussianProcesses.GP,1}

In [6]:
hyp = [-1.80728,11.7879,8.04658,-1.70133,1.66984]

5-element Array{Float64,1}:
 -1.80728
 11.7879 
  8.04658
 -1.70133
  1.66984

In [7]:
GP_dict=Dict{String, GP}()
lσβ=-hyp[5]
kern = SEIso(hyp[3], hyp[4])
for name in schdistrs
    X = X_dict[name]
    Y = Y_dict[name]
    if length(Y)==0
        println("no data for ", name)
        continue
    end
    fit = GP(X', Y, MeanConst(hyp[2]), kern, hyp[1])
    GP_dict[name] = fit
end
schdistrs_have = collect(keys(GP_dict))
multiGP = GaussianProcesses.GP[GP_dict[name] for name in schdistrs_have]
;

In [8]:
nobsv_ = sum([gp.nobsv for gp in multiGP])
for_mm = copy(Tucson_sales[believable,:])
for_mm[:SQFT] =  for_mm[:SQFT] .- mean(for_mm[:SQFT])
for_mm[:SQFT] = for_mm[:SQFT] ./ std(for_mm[:SQFT])
for_mm[:ROOMS] = for_mm[:ROOMS] .- mean(for_mm[:ROOMS])
for_mm[:ROOMS] = for_mm[:ROOMS] ./ std(for_mm[:ROOMS])
for_mm[:STORIES] = for_mm[:STORIES] .- mean(for_mm[:STORIES])
for_mm[:STORIES] = for_mm[:STORIES] ./ std(for_mm[:STORIES])
reordering = collect(1:nobsv_)
istart = 1
for name in schdistrs_have
    indices = find(for_mm[:SchDistr] .== name)
    nobsv_schdistr = length(indices)
    iend = istart+nobsv_schdistr-1
    reordering[istart:iend,:] = indices
    istart = iend+1
end
for_mm = for_mm[reordering,:]
;

In [9]:
contrasts_dict = Dict()
for (term,col) in eachcol(for_mm)
    DataFrames.is_categorical(col) || continue
    contrasts_dict[term] = DataFrames.FullDummyCoding()
end
mf=DataFrames.ModelFrame(logSalePrice ~ SQFT + QUALITY + STORIES + CLASS + ROOF + GARAGE + SchDistr, 
    for_mm, 
    contrasts=contrasts_dict)
mm=ModelMatrix(mf)
p = size(mm,2)
D_=mm.m
size(D_)

(22203,32)

In [10]:
type MultiGPCovarsPlugin{MT<:GaussianProcesses.Mean, KT<:GaussianProcesses.Kernel}
    D::Array{Float64,2}
    y::Vector{Float64}
    mgp::MultiGP
    p::Int
    dim::Int
    nobsv::Int
    logNoise::Float64
    m::MT
    k::KT
    σβ::Float64
    # Auxiliary data
    cK::PDMats.PDMat        # (k + obsNoise²)
    alpha::Vector{Float64}  # (k + obsNoise²)⁻¹y
    mLL::Float64            # Marginal log-likelihood
    dmLL::Vector{Float64}   # Gradient marginal log-likelihood
    βhat::Vector{Float64}   # fitted β coefficients
    resid::Vector{Float64}  # residuals
    function MultiGPCovarsPlugin(D::Array{Float64,2}, 
        y::Vector{Float64},
        mgp::MultiGP, 
        p::Int,
        dim::Int,
        nobsv::Int,
        logNoise::Float64,
        σβ::Float64,
        m::MT,
        k::KT,
        )
        mgpcv = new(D, y, mgp, p, dim, nobsv, logNoise, σβ, m, k)
        update_mll!(mgpcv)
        return mgpcv
    end
end

In [11]:
function MultiGPCovarsPlugin(D::Array{Float64,2}, mgp::MultiGP, σβ::Float64)
    nobsv = sum([gp.nobsv for gp in mgp])
    size(D,1) == nobsv || throw(ArgumentError("incompatible dimensions of covariates matrix and gaussian processes"))
    first_gp = mgp[1]
    dim = first_gp.dim
    logNoise = first_gp.logNoise
    k = first_gp.k
    m = first_gp.m
    # harmonize parameters
    for gp in mgp
        gp.k = k
        gp.m = m
        gp.logNoise = logNoise
    end
    p = size(D,2)
    y = vcat([gp.y for gp in recent_multiGP]...)
    mgpcv = MultiGPCovarsPlugin{typeof(m),typeof(k)}(D, y, mgp, p, dim, nobsv, logNoise, σβ, m, k)
    return mgpcv
end

MultiGPCovarsPlugin{MT<:GaussianProcesses.Mean,KT<:GaussianProcesses.Kernel}

In [12]:
function get_cK!(mgpcv::MultiGPCovarsPlugin, cK::Matrix{Float64})
    cK = zeros(Float64, mgpcv.nobsv, mgpcv.nobsv)
    propagate_params!(mgpcv)
    istart=0
    for gp in mgpcv.mgp
        addcov!(view(cK, istart+1:istart+gp.nobsv, istart+1:istart+gp.nobsv), mgpcv.k, gp.X, gp.data)
        istart += gp.nobsv
    end
    for i in 1:mgpcv.nobsv
        cK[i,i] += max(exp(2*mgpcv.logNoise),1e-8)
    end
    return PDMats.PDMat(cK)
end

function get_cK(mgpcv::MultiGPCovarsPlugin)
    cK = zeros(Float64, mgpcv.nobsv, mgpcv.nobsv)
    return get_cK!(mgpcv, cK)
end

get_cK (generic function with 1 method)

In [13]:
function propagate_params!(mgpcv::MultiGPCovarsPlugin)
    for gp in mgpcv.mgp
        # harmonize parameters
        gp.k = mgpcv.k
        gp.m = mgpcv.m
        gp.logNoise = mgpcv.logNoise
    end
end

function get_βhat(cK::PDMats.AbstractPDMat, D::AbstractMatrix, σβ2::Float64, m::Vector{Float64})
    tmp = PDMats.Xt_invA_X(cK, D)
    for i in 1:p
        tmp[i,i] += σβ2
    end
    βhat = (tmp \ D') * (cK \ (mgpcv.y.-m))
#     βhat_dict=Dict(zip(coefnames(mf), βhat))
end

function update_mll!(mgpcv::MultiGPCovarsPlugin, cK::Matrix{Float64})
    propagate_params!(mgpcv)
    m = Array(Float64, mgpcv.nobsv)
    istart=0
    for gp in mgpcv.mgp
        m[istart+1:istart+gp.nobsv] = mean(mgpcv.m,gp.X)
        istart += gp.nobsv
    end
    cK = get_cK!(mgpcv, cK)
    mgpcv.cK = cK
    mgpcv.alpha = mgpcv.cK \ (mgpcv.y .- m)
    βhat = get_βhat(cK, mgpcv.D, mgpcv.σβ^2, m)
    fitted_vals = mgpcv.D * βhat
    mgpcv.mLL = -dot((mgpcv.y-m),mgpcv.alpha)/2.0 - logdet(mgpcv.cK)/2.0 - mgpcv.nobsv*log(2π)/2.0
end

function update_mll!(mgpcv::MultiGPCovars)
    cK_buffer = Array(Float64, mgpcv.nobsv, mgpcv.nobsv)
    update_mll!(mgpcv, cK_buffer)
end
function update_mll_and_dmll!(mgpcv::MultiGPCovars,
    cK::Matrix{Float64},
    Kgrads::Array{Float64,3}
    ; 
    noise::Bool=true, # include gradient component for the logNoise term
    mean::Bool=true, # include gradient components for the mean parameters
    kern::Bool=true, # include gradient components for the spatial kernel parameters
    beta::Bool=true, # include gradient components for the linear regression prior terms
    )
    update_mll!(mgpcv, cK)
    n_mean_params = GaussianProcesses.num_params(mgpcv.m)
    n_kern_params = GaussianProcesses.num_params(mgpcv.k)
    n_beta_params = GaussianProcesses.num_params(mgpcv.βkern)
    dmLL = Array(Float64, noise + mean*n_mean_params + kern*n_kern_params + beta*n_beta_params)
    logNoise = mgpcv.logNoise
    α = mgpcv.alpha
    ααinvcKI = α*α' - mgpcv.cK \ eye(mgpcv.nobsv)
    i=1
    if noise
        dmLL[i] = exp(2*logNoise)*trace(ααinvcKI)
        i+=1
    end
    if mean
        Mgrads = vcat([GaussianProcesses.grad_stack(gp.m, gp.X) for gp in recent_multiGP]...)
        for j in 1:n_mean_params
            dmLL[i] = dot(Mgrads[:,j],α)
            i+=1
        end
    end
    if kern
        Kgrad[:,:] = 0.0
        for j in i:i+n_kern_params-1
            dmLL[j] = 0.0
        end
        for iparam in 1:n_kern_params
            istart=0
            for gp in mgpcv.mgp
                Kview = view(Kgrad, istart+1:istart+gp.nobsv, istart+1:istart+gp.nobsv)
                ααview = view(ααinvcKI, istart+1:istart+gp.nobsv, istart+1:istart+gp.nobsv)
                grad_slice!(Kview, mgpcv.k, gp.X, gp.data, iparam)
                Kview[:,:] .*= ααview
                dmLL[i] += sum(Kview)/2
                istart += gp.nobsv
            end
            i+=1
        end
    end
    if beta
        for iparam in 1:num_params(mgpcv.βkern)
            grad_slice!(Kgrad, mgpcv.βkern, mgpcv.D', KernelData(mgpcv.βkern,mgpcv.D'), iparam)
            dmLL[i] = dot(ααinvcKI,Kgrad)/2.0
            i+=1
        end
    end
    mgpcv.dmLL = dmLL
end
function get_params(mgpcv::MultiGPCovars; noise::Bool=true, mean::Bool=true, kern::Bool=true, beta::Bool=true)
    params = Float64[]
    if noise; push!(params, mgpcv.logNoise); end
    if mean;  append!(params, GaussianProcesses.get_params(mgpcv.m)); end
    if kern; append!(params,  GaussianProcesses.get_params(mgpcv.k)); end
    if beta; append!(params,  GaussianProcesses.get_params(mgpcv.βkern)); end
    return params
end
function set_params!(mgpcv::MultiGPCovars, hyp::Vector{Float64}; 
                    noise::Bool=true, mean::Bool=true, kern::Bool=true, beta::Bool=true)
    i=1
    if noise
        mgpcv.logNoise = hyp[i]
        i+=1
    end
    if mean
        GaussianProcesses.set_params!(mgpcv.m, hyp[i:i+GaussianProcesses.num_params(mgpcv.m)-1])
        i+=GaussianProcesses.num_params(mgpcv.m)
    end
    if kern
        GaussianProcesses.set_params!(mgpcv.k, hyp[i:i+GaussianProcesses.num_params(mgpcv.k)-1])
        i+=GaussianProcesses.num_params(mgpcv.k)
    end
    if beta
        GaussianProcesses.set_params!(mgpcv.βkern, hyp[i:i+GaussianProcesses.num_params(mgpcv.βkern)-1])
        i+=GaussianProcesses.num_params(mgpcv.βkern)
    end
    propagate_params!(mgpcv)
end
@doc """
# Description
A function for optimising the GP hyperparameters based on type II maximum likelihood estimation. This function performs gradient based optimisation using the Optim pacakge to which the user is referred to for further details.

# Arguments:
* `gp::GP`: Predefined Gaussian process type
* `noise::Bool`: Noise hyperparameters should be optmized
* `mean::Bool`: Mean function hyperparameters should be optmized
* `kern::Bool`: Kernel function hyperparameters should be optmized
* `kwargs`: Keyword arguments for the optimize function from the Optim package
""" ->
function GaussianProcesses.optimize!(mgpcv::MultiGPCovars; noise::Bool=true, mean::Bool=true, kern::Bool=true, beta::Bool=true, 
                    method=ConjugateGradient(), kwargs...)
    cK_buffer = Array(Float64, mgpcv.nobsv, mgpcv.nobsv)
    Kgrads_buffer = Array(Float64, mgpcv.nobsv, mgpcv.nobsv, 3)
    function mll(hyp::Vector{Float64})
        try
            set_params!(mgpcv, hyp; noise=noise, mean=mean, kern=kern, beta=beta)
            update_mll!(mgpcv, cK_buffer)
            return -mgpcv.mLL
        catch err
             if !all(isfinite(hyp))
                println(err)
                return Inf
            elseif isa(err, ArgumentError)
                println(err)
                return Inf
            elseif isa(err, Base.LinAlg.PosDefException)
                println(err)
                return Inf
            else
                throw(err)
            end
        end        
    end

    function mll_and_dmll!(hyp::Vector{Float64}, grad::Vector{Float64})
        try
            set_params!(mgpcv, hyp; noise=noise, mean=mean, kern=kern, beta=beta)
            update_mll_and_dmll!(mgpcv, cK_buffer, Kgrads_buffer; noise=noise, mean=mean, kern=kern, beta=beta)
            grad[:] = -mgpcv.dmLL
            return -mgpcv.mLL
        catch err
             if !all(isfinite(hyp))
                println(err)
                return Inf
            elseif isa(err, ArgumentError)
                println(err)
                return Inf
            elseif isa(err, Base.LinAlg.PosDefException)
                println(err)
                return Inf
            else
                throw(err)
            end
        end 
    end
    function dmll!(hyp::Vector{Float64}, grad::Vector{Float64})
        mll_and_dmll!(hyp::Vector{Float64}, grad::Vector{Float64})
    end

    func = DifferentiableFunction(mll, dmll!, mll_and_dmll!)
    init = get_params(mgpcv;  noise=noise, mean=mean, kern=kern, beta=beta)  # Initial hyperparameter values
    results=optimize(func,init; method=method, kwargs...)                     # Run optimizer
    return results
end
mgpcv=MultiGPCovars(D_, recent_multiGP::MultiGP, βkern_);

LoadError: LoadError: UndefVarError: MultiGPCovars not defined
while loading In[13], in expression starting on line 35