In [1]:
using SSM
using Random
using Distributions
using Plots
using LinearAlgebra
using CSV
using DataFrames

In [2]:
robert = CSV.read("/Users/ryansenne/Downloads/robert_deltaflash_choice.csv", DataFrame)
# extract variables
choice = Array{Float64}(robert.choice)
delta_flashes = reshape(Array{Float64}(robert.delta_flashes), length(choice), 1)

18667×1 Matrix{Float64}:
 -10.0
  20.0
  -6.0
  -3.0
   4.0
   4.0
  -2.0
   5.0
   5.0
 -10.0
   ⋮
  -4.0
   4.0
  -2.0
   3.0
   6.0
   2.0
  -6.0
  -5.0
  -6.0

In [3]:
reg_model = SSM.BernoulliRegression(;λ=0.0)
reg_model.β = [0.0, 0.0]

2-element Vector{Float64}:
 0.0
 0.0

In [4]:
w = rand(length(choice))
obj = β -> -SSM.loglikelihood(SSM.BernoulliRegression(β, true, 0.0), delta_flashes, choice, w)

#11 (generic function with 1 method)

In [4]:
fit!(reg_model, delta_flashes, choice)

2-element Vector{Float64}:
 0.397990388649346
 0.2653700973088059

In [6]:
using StatsFuns
using Optim 

# function loglikelihood(β, X, y, w)
#     p = SSM.logistic.(X * β)
#     # clamp probabilities to avoid log(0) and log(1)
#     p = clamp.(p, 1e-16, 1-1e-16)
#     return -sum(w .* (y .* log.(p) + (1 .- y) .* log.(1 .- p)))
# end

# function grad!(g, β, X, y, w)
#     p = SSM.logistic.(X * β)
#     p = clamp.(p, 1e-16, 1-1e-16)
#     g .= -X' * (w .* (y .- p))
# end

w = ones(length(choice))

obj = β -> -SSM.loglikelihood(SSM.BernoulliRegression(β, true, 0.0), delta_flashes, choice, w)

g! = (g, β) -> SSM.gradient!(g, BernoulliRegression(β, true, 0.0), hcat(ones(length(delta_flashes)), delta_flashes), choice, w)

# res = optimize(β -> loglikelihood(β, hcat(ones(length(delta_flashes)), delta_flashes), choice, w), g!, zeros(2), LBFGS())
res = optimize(obj, g!, zeros(2), LBFGS())

 * Status: success

 * Candidate solution
    Final objective value:     8.983387e+03

 * Found with
    Algorithm:     L-BFGS

 * Convergence measures
    |x - x'|               = 6.79e-08 ≰ 0.0e+00
    |x - x'|/|x'|          = 1.71e-07 ≰ 0.0e+00
    |f(x) - f(x')|         = 6.18e-11 ≰ 0.0e+00
    |f(x) - f(x')|/|f(x')| = 6.88e-15 ≰ 0.0e+00
    |g(x)|                 = 2.40e-09 ≤ 1.0e-08

 * Work counters
    Seconds run:   0  (vs limit Inf)
    Iterations:    8
    f(x) calls:    46
    ∇f(x) calls:   46


In [32]:
using Optim
using StatsFuns

"""
    BernoulliRegression(β::Vector{<:Real}, include_intercept::Bool, λ::Float64=0.0)

Args:
- `β::Vector{<:Real}`: Coefficients of the regression model
- `include_intercept::Bool`: Whether to include an intercept term in the model
- `λ::Float64`: Regularization parameter for the model

Constructors:
- `BernoulliRegression(; include_intercept::Bool = true, λ::Float64=0.0)`
- `BernoulliRegression(β::Vector{<:Real}, include_intercept::Bool, λ::Float64=0.0)`

Example:
```julia
model = BernoulliRegression()
model = BernoulliRegression(include_intercept=false, λ=0.1)
model = BernoulliRegression([0.1, 0.2], true, 0.1)
```
"""
mutable struct BernoulliRegression
    β::Vector{<:Real}
    include_intercept::Bool
    λ::Float64
    # Empty constructor
    function BernoulliRegression(; include_intercept::Bool = true, λ::Float64=0.0) 
        @assert λ >= 0.0 "Regularization parameter must be non-negative."
        new(Vector{Float64}(), include_intercept, λ)
    end
    # Parametric Constructor
    function BernoulliRegression(β::Vector{<:Real}, include_intercept::Bool, λ::Float64=0.0)
        @assert λ >= 0.0 "Regularization parameter must be non-negative."
        new(β, include_intercept, λ)
    end
end

"""
    loglikelihood(model::BernoulliRegression, X::Matrix{<:Real}, y::Union{Vector{<:Real}, BitVector}, w::Vector{<:Real}=ones(length(y))

Calculate the log-likelihood of a Bernoulli regression model.

Args:
- `model::BernoulliRegression`: Bernoulli regression model
- `X::Matrix{<:Real}`: Design matrix
- `y::Union{Vector{<:Real}, BitVector}`: Response vector
- `w::Vector{<:Real}`: Weights for the observations

Example:
```julia
model = BernoulliRegression()
X = rand(100, 2)
y = rand(Bool, 100)
loglikelihood(model, X, y)
```
"""
function loglikelihood(model::BernoulliRegression, X::Matrix{<:Real}, y::Vector{<:Real}, w::Vector{<:Real}=ones(length(y)))
    # confirm that the model has been fit
    @assert !isempty(model.β) "Model parameters not initialized, please call fit! first."
    # add intercept if specified and not already included
    if model.include_intercept && size(X, 2) == length(model.β) - 1 
        X = hcat(ones(size(X, 1)), X)
    end
    # calculate log likelihood
    p = logistic.(X * model.β)
    # Clamp probabilities to avoid log(0) and log(1)
    p = clamp.(p, 1e-16, 1-1e-16)
    return sum(w .* (y .* log.(p) + (1 .- y) .* log.(1 .- p)))
end

"""
    loglikelihood(model::BernoulliRegression, X::Vector{<:Real}, y::Union{Float64, Bool, Int64}, w::Float64=1.0)

Calculate the log-likelihood of a single observation of a Bernoulli regression model.

Args:
- `model::BernoulliRegression`: Bernoulli regression model
- `X::Vector{<:Real}`: Design vector
- `y::Union{Float64, Bool, Int64}`: Response value
- `w::Float64`: Weight for the observation

Example:
```julia
model = BernoulliRegression()
X = rand(2)
y = rand(Bool)
loglikelihood(model, X, y)
```
"""
function loglikelihood(model::BernoulliRegression, X::Vector{<:Real}, y::Union{Float64, Bool, Int64}, w::Float64=1.0)
    # confirm that the model has been fit
    @assert !isempty(model.β) "Model parameters not initialized, please call fit! first."
    # add intercept if specified
    if model.include_intercept && length(X) == length(model.β) - 1
        X = vcat(1.0, X)
    end
    # calculate log likelihood
    p = logistic.(X' * model.β) # use stats fun for this
    # Clamp probabilities to avoid log(0) and log(1)
    p = clamp(p, 1e-16, 1-1e-16)
    return sum(w .* (y .* log.(p) + (1 .- y) .* log.(1 .- p)))
end

"""
    gradient!(grad::Vector{<:Real}, model::BernoulliRegression, X::Matrix{<:Real}, y::Union{Vector{<:Real}, BitVector}, w::Vector{<:Real}=ones(length(y))

Calculate the gradient of the negative log-likelihood function for a Bernoulli regression model. 

Args:
- `grad::Vector{<:Real}`: Gradient of the negative log-likelihood function
- `model::BernoulliRegression`: Bernoulli regression model
- `X::Matrix{<:Real}`: Design matrix
- `y::Union{Vector{<:Real}, BitVector}`: Response vector
- `w::Vector{<:Real}`: Weights for the observations
"""
function gradient!(g::Vector{<:Real}, model::BernoulliRegression, X::Matrix{<:Real}, y::Vector{<:Real}, w::Vector{<:Real}=ones(length(y)))
    # Calculate probabilities
    p = logistic.(X * model.β)
    # Clamp probabilities to avoid log(0) and log(1)
    p = clamp.(p, 1e-16, 1-1e-16)
    # Calculate gradient
    g .= -X' * Diagonal(w) * (y .- p) #+ (2 * model.λ * model.β)
end

"""
    fit!(model::BernoulliRegression, X::Matrix{<:Real}, y::Union{Vector{<:Real}, BitVector}, w::Vector{<:Real}=ones(length(y))

Fit a Bernoulli regression model using maximum likelihood estimation.

Args:
- `model::BernoulliRegression`: Bernoulli regression model
- `X::Matrix{<:Real}`: Design matrix
- `y::Union{Vector{<:Real}, BitVector}`: Response vector
- `w::Vector{<:Real}`: Weights for the observations

Example:
```julia
model = BernoulliRegression()
X = rand(100, 2)
y = rand(Bool, 100)
fit!(model, X, y)

model = BernoulliRegression()
X = rand(100, 2)
y = rand(Bool, 100)
w = rand(100)
fit!(model, X, y, w)
```
"""
function fit!(model::BernoulliRegression, X::Matrix{<:Real}, y::Vector{<:Real}, w::Vector{<:Real}=ones(length(y)))
    if model.include_intercept
        X = hcat(ones(size(X, 1)), X)
    end
    p = size(X, 2)
    model.β = zeros(p)
    y = convert(Vector{Float64}, y)
    # print likelihood and grad
    println("Initial log-likelihood: ", loglikelihood(model, X, y, w))
    println("Initial gradient: ", gradient!(zeros(p), model, X, y, w))
    objective = β -> -loglikelihood(BernoulliRegression(β, model.include_intercept, model.λ), X, y, w)
    objective_grad! = (β, g) -> gradient!(g, BernoulliRegression(β, model.include_intercept, model.λ), X, y, w)
    result = optimize(objective, objective_grad!, zeros(p), LBFGS(), Optim.Options(show_trace=true))
    println("Optimization result: ", result)
    model.β = result.minimizer
end

fit!

In [33]:
reg_model = BernoulliRegression(;λ=0.0)
fit!(reg_model, delta_flashes, choice)

Initial log-likelihood: -12938.978419512487
Initial gradient: [-1011.5, -36503.5]
Iter     Function value   Gradient norm 
     0              NaN              NaN
 * time: 0.0
Optimization result:  * Status: failure

 * Candidate solution
    Final objective value:     NaN

 * Found with
    Algorithm:     L-BFGS

 * Convergence measures
    |x - x'|               = 0.00e+00 ≤ 0.0e+00
    |x - x'|/|x'|          = NaN ≰ 0.0e+00
    |f(x) - f(x')|         = NaN ≰ 0.0e+00
    |f(x) - f(x')|/|f(x')| = NaN ≰ 0.0e+00
    |g(x)|                 = NaN ≰ 1.0e-08

 * Work counters
    Seconds run:   0  (vs limit Inf)
    Iterations:    0
    f(x) calls:    1
    ∇f(x) calls:   1



2-element Vector{Float64}:
 NaN
 NaN