https://github.com/JuliaStats/StatsModels.jl/issues/220

## Reproduce the slow-down (1)

In [1]:
using DataFrames
using GLM
using StatsBase: sample

# Number of candidate predictor columns x1..xn in the synthetic data set.
n = 20
# Number of predictors drawn per regression by the `sample` calls in later cells.
r = 10
x_symbols = [Symbol("x$i") for i in 1:n]
# 100 rows of random numbers: one response column :y followed by the n predictors.
# The timing printed for this first call is almost entirely compilation.
@time df = DataFrame(rand(100, n+1), [:y; x_symbols]);

  0.474206 seconds (1.89 M allocations: 108.855 MiB, 18.94% gc time, 99.95% compilation time)


In [2]:
# Draw r distinct predictor names at random (replace=false forbids repeats).
x_vars = sample(x_symbols, r; replace=false)
# Build the formula y ~ x_a + x_b + ... programmatically from the sampled names.
@time F = term(:y) ~ sum(term(x) for x in x_vars)
# Time every stage separately to see which one dominates the total cost.
# NOTE(review): these stages presumably mirror what `lm(F, df)` does internally —
# confirm against the StatsModels source.
@time cols = Tables.columntable(df)
@time mf = ModelFrame(F, cols, model=LinearModel)
@time mm = ModelMatrix(mf)
@time y = response(mf)
@time linmodel = fit(LinearModel, mm.m, y)
@time regmodel = StatsModels.TableRegressionModel(linmodel, mf, mm)

  0.150663 seconds (279.83 k allocations: 18.107 MiB, 41.57% gc time, 100.87% compilation time)
  0.234271 seconds (653.37 k allocations: 41.294 MiB, 3.43% gc time, 43.78% compilation time)
  1.638357 seconds (5.92 M allocations: 371.880 MiB, 5.04% gc time)
  0.326880 seconds (1.19 M allocations: 75.524 MiB, 6.10% gc time)
  0.007051 seconds (3.78 k allocations: 283.598 KiB, 99.77% compilation time)
  1.166609 seconds (5.13 M allocations: 290.797 MiB, 7.04% gc time, 0.11% compilation time)
  0.003807 seconds (3.76 k allocations: 276.488 KiB, 94.54% compilation time)


StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

y ~ 1 + x18 + x9 + x1 + x2 + x19 + x6 + x17 + x20 + x12 + x15

Coefficients:
──────────────────────────────────────────────────────────────────────────────
                    Coef.  Std. Error      t  Pr(>|t|)   Lower 95%   Upper 95%
──────────────────────────────────────────────────────────────────────────────
(Intercept)   0.533908      0.165316    3.23    0.0017   0.205429    0.862386
x18          -0.238304      0.100526   -2.37    0.0199  -0.438046   -0.0385612
x9            0.0118725     0.112359    0.11    0.9161  -0.211383    0.235128
x1           -0.0567232     0.0989838  -0.57    0.5681  -0.253402    0.139956
x2            0.141666      0.100305    1.41    0.1613  -0.0576383   0.340971
x19          -0.00386677    0.0942322  -0.04    0.9674  -0.191104    0.183371
x6            0.110545      0.0979872  

In [3]:
# Rerun the identical pipeline with a fresh random selection of predictors, so the
# timings can be compared with the first run of the same statements.
x_vars = sample(x_symbols, r; replace=false)
@time F = term(:y) ~ sum(term(x) for x in x_vars)
@time cols = Tables.columntable(df)
@time mf = ModelFrame(F, cols, model=LinearModel)
@time mm = ModelMatrix(mf)
@time y = response(mf)
@time linmodel = fit(LinearModel, mm.m, y)
@time regmodel = StatsModels.TableRegressionModel(linmodel, mf, mm)

  0.088355 seconds (79.69 k allocations: 5.129 MiB, 10.98% gc time, 99.57% compilation time)
  0.000023 seconds (29 allocations: 1.766 KiB)
  0.055767 seconds (136.59 k allocations: 8.852 MiB, 7.07% compilation time)
  0.060071 seconds (80.33 k allocations: 5.021 MiB, 99.64% compilation time)
  0.006385 seconds (3.77 k allocations: 282.848 KiB, 99.69% compilation time)
  0.000032 seconds (21 allocations: 23.188 KiB)
  0.003459 seconds (2.75 k allocations: 194.801 KiB, 95.89% compilation time)


StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

y ~ 1 + x10 + x9 + x17 + x1 + x15 + x19 + x2 + x5 + x18 + x8

Coefficients:
──────────────────────────────────────────────────────────────────────────────
                   Coef.  Std. Error      t  Pr(>|t|)   Lower 95%    Upper 95%
──────────────────────────────────────────────────────────────────────────────
(Intercept)   0.542319     0.147277    3.68    0.0004   0.249683    0.834954
x10          -0.110315     0.100161   -1.10    0.2737  -0.309334    0.0887033
x9           -0.0257753    0.112318   -0.23    0.8190  -0.248949    0.197398
x17          -0.0978551    0.0919387  -1.06    0.2900  -0.280535    0.0848252
x1           -0.00360651   0.0978651  -0.04    0.9707  -0.198062    0.190849
x15          -0.0294383    0.0966061  -0.30    0.7613  -0.221393    0.162516
x19           0.0122843    0.0912678   0.13  

## Reproduce the slow-down (2)

In [4]:
using DataFrames
using GLM
using StatsBase: sample

# Same synthetic data setup as before: n candidate predictors, r used per model.
n = 20
r = 10
x_symbols = [Symbol("x$i") for i in 1:n]
df = DataFrame(rand(100, n+1), [:y; x_symbols]);

# Untyped container, so the fitted models are collected in a Vector{Any}.
result = []
for _ in 1:10
    # Each iteration samples a different predictor subset and fits it through the
    # formula interface, timing only the `lm` call itself.
    x_vars = sample(x_symbols, r; replace=false)
    F = term(:y) ~ sum(term(x) for x in x_vars)
    @time regmodel = lm(F, df)
    push!(result, regmodel)
end
# Display the collected models as the cell result.
result

  1.811475 seconds (6.59 M allocations: 447.620 MiB, 11.11% gc time)
  0.190934 seconds (223.53 k allocations: 14.381 MiB, 33.41% gc time, 2.02% compilation time)
  0.124020 seconds (223.53 k allocations: 14.366 MiB, 3.03% compilation time)
  0.124958 seconds (223.53 k allocations: 14.366 MiB, 2.99% compilation time)
  0.132233 seconds (223.54 k allocations: 14.377 MiB, 5.50% gc time, 2.83% compilation time)
  0.122327 seconds (223.54 k allocations: 14.368 MiB, 3.09% compilation time)
  0.125943 seconds (223.54 k allocations: 14.372 MiB, 3.15% compilation time)
  0.134166 seconds (223.53 k allocations: 14.370 MiB, 5.17% gc time, 3.01% compilation time)
  0.124253 seconds (223.54 k allocations: 14.367 MiB, 3.12% compilation time)
  0.125161 seconds (223.53 k allocations: 14.368 MiB, 3.32% compilation time)


10-element Vector{Any}:
 StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

y ~ 1 + x2 + x20 + x8 + x3 + x13 + x1 + x16 + x14 + x19 + x9

Coefficients:
───────────────────────────────────────────────────────────────────────────
                  Coef.  Std. Error      t  Pr(>|t|)   Lower 95%  Upper 95%
───────────────────────────────────────────────────────────────────────────
(Intercept)   0.373004     0.172179   2.17    0.0330   0.0308881  0.715119
x2            0.0667202    0.103282   0.65    0.5199  -0.138499   0.271939
x20           0.0819035    0.110057   0.74    0.4587  -0.136778   0.300585
x8           -0.0694852    0.104158  -0.67    0.5064  -0.276444   0.137474
x3           -0.14002      0.108876  -1.29    0.2018  -0.356354   0.0763143
x13          -0.0404483    0.109096  -0.37    0.7117  -0.25722    0.176324
x1            0.152583     0.109007   1.40

## Solution

In [5]:
using DataFrames
using GLM
using StatsBase: sample

"""
    MyLinearModel(linmodel, y_var, x_vars)

Pair a fitted model object with the names it was fitted from.

# Fields
- `linmodel`: the fitted model.
- `y_var`: name of the response variable.
- `x_vars`: names of the predictor variables.
"""
struct MyLinearModel{M, R, P}
    linmodel::M
    y_var::R
    x_vars::P
end

"""
    my_lm(y_var, x_vars, df)

Fit the linear model `y_var ~ 1 + x_vars...` on the columns of `df` by building the
design matrix directly and calling `lm` on a plain matrix and response vector.

# Arguments
- `y_var`: column selector for the response in `df`.
- `x_vars`: column selector for the predictors in `df`; may select no columns, in
  which case an intercept-only model is fitted.
- `df`: the table holding the data.

# Returns
A `MyLinearModel` wrapping the fitted model together with `y_var` and `x_vars`.
"""
function my_lm(y_var, x_vars, df)
    y = df[!, y_var]
    predictors = df[!, x_vars]
    # An empty selection produces a column-less table whose matrix cannot be
    # concatenated with the intercept column, so handle the intercept-only model
    # explicitly instead of failing inside `hcat`.
    X = if ncol(predictors) == 0
        ones(nrow(df), 1)
    else
        hcat(ones(nrow(df)), Matrix(predictors))
    end
    linmodel = lm(X, y)
    return MyLinearModel(linmodel, y_var, x_vars)
end

"""
    Base.show(io::IO, mylm::MyLinearModel)

Print `mylm` to `io` as a `Formula: y ~ 1 + ...` line, a blank line, and then the
coefficient table of the wrapped model with its rows labelled by the intercept and
the predictor names.
"""
function Base.show(io::IO, mylm::MyLinearModel)
    table = coeftable(mylm.linmodel)
    # Overwrite the table's row labels with "(Intercept)" followed by the
    # predictor names, in the order the columns were placed in the design matrix.
    table.rownms .= map(string, (Symbol("(Intercept)"), mylm.x_vars...))
    print(io, "Formula: ", mylm.y_var, " ~ 1")
    foreach(name -> print(io, " + ", name), mylm.x_vars)
    print(io, "\n\n")
    show(io, table)
    print(io, "\n")
end

# Same synthetic data setup as in the earlier cells.
n = 20
r = 10
x_symbols = [Symbol("x$i") for i in 1:n]
df = DataFrame(rand(100, n+1), [:y; x_symbols]);

# Untyped container, so the results are collected in a Vector{Any}.
myresult = []
for _ in 1:10
    # Fit a different random predictor subset on every iteration through the
    # matrix-based `my_lm`, timing only that call.
    x_vars = sample(x_symbols, r; replace=false)
    @time mylinmodel = my_lm(:y, x_vars, df)
    push!(myresult, mylinmodel)
end
# Display the collected models as the cell result.
myresult

  0.257736 seconds (604.40 k allocations: 37.423 MiB, 3.69% gc time)
  0.000040 seconds (56 allocations: 43.516 KiB)
  0.000026 seconds (56 allocations: 43.516 KiB)
  0.000024 seconds (56 allocations: 43.516 KiB)
  0.000024 seconds (56 allocations: 43.516 KiB)
  0.000024 seconds (56 allocations: 43.516 KiB)
  0.000088 seconds (56 allocations: 43.516 KiB)
  0.000069 seconds (56 allocations: 43.516 KiB)
  0.000062 seconds (56 allocations: 43.516 KiB)
  0.000058 seconds (56 allocations: 43.516 KiB)


10-element Vector{Any}:
 Formula: y ~ 1 + x4 + x10 + x19 + x18 + x20 + x5 + x2 + x6 + x1 + x8

──────────────────────────────────────────────────────────────────────────────
                    Coef.  Std. Error      t  Pr(>|t|)   Lower 95%   Upper 95%
──────────────────────────────────────────────────────────────────────────────
(Intercept)   0.714215      0.158673    4.50    <1e-04   0.398935   1.02949
x4           -0.160971      0.111156   -1.45    0.1511  -0.381836   0.0598948
x10          -0.0989967     0.0971448  -1.02    0.3109  -0.292021   0.0940279
x19           0.0319596     0.105443    0.30    0.7625  -0.177554   0.241473
x18           0.0993943     0.100193    0.99    0.3239  -0.0996867  0.298475
x20           0.000405853   0.103199    0.00    0.9969  -0.204649   0.205461
x5            0.00198443    0.100478    0.02    0.9843  -0.197664   0.201633
x2           -0.0430512     0.100212   -0.43    0.6685  -0.242169   0.156067
x6           -0.0363727     0.102886   -0.35    0.7

## Analysis of the slow-down

In [6]:
# Pick one predictor subset and print it so the following cells can be compared
# against exactly this selection.
x_vars = sample(x_symbols, r; replace=false)
@show x_vars
# Time formula construction and the formula-based fit separately.
@time F = term(:y) ~ sum(term(x) for x in x_vars)
@time lm(F, df)

x_vars = [:x11, :x13, :x10, :x20, :x16, :x8, :x17, :x19, :x2, :x1]
  0.060092 seconds (79.68 k allocations: 5.127 MiB, 99.49% compilation time)
  0.133748 seconds (223.52 k allocations: 14.367 MiB, 6.17% gc time, 2.87% compilation time)


StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

y ~ 1 + x11 + x13 + x10 + x20 + x16 + x8 + x17 + x19 + x2 + x1

Coefficients:
────────────────────────────────────────────────────────────────────────────
                   Coef.  Std. Error      t  Pr(>|t|)  Lower 95%   Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept)   0.761804     0.150843    5.05    <1e-05   0.462082   1.06153
x11          -0.0956561    0.104691   -0.91    0.3633  -0.303675   0.112363
x13           0.0373872    0.1135      0.33    0.7426  -0.188134   0.262909
x10          -0.112881     0.0945583  -1.19    0.2357  -0.300766   0.0750047
x20          -0.0350142    0.106714   -0.33    0.7436  -0.247053   0.177025
x16           0.0629904    0.106135    0.59    0.5544  -0.147898   0.273879
x8           -0.203277     0.097153   -2.09    0.0393  -0

In [7]:
# Fit the same predictors as the previous cell through the matrix-based path, for a
# direct timing comparison on identical data.
@time my_lm(:y, x_vars, df)

  0.000063 seconds (56 allocations: 43.516 KiB)


Formula: y ~ 1 + x11 + x13 + x10 + x20 + x16 + x8 + x17 + x19 + x2 + x1

────────────────────────────────────────────────────────────────────────────
                   Coef.  Std. Error      t  Pr(>|t|)  Lower 95%   Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept)   0.761804     0.150843    5.05    <1e-05   0.462082   1.06153
x11          -0.0956561    0.104691   -0.91    0.3633  -0.303675   0.112363
x13           0.0373872    0.1135      0.33    0.7426  -0.188134   0.262909
x10          -0.112881     0.0945583  -1.19    0.2357  -0.300766   0.0750047
x20          -0.0350142    0.106714   -0.33    0.7436  -0.247053   0.177025
x16           0.0629904    0.106135    0.59    0.5544  -0.147898   0.273879
x8           -0.203277     0.097153   -2.09    0.0393  -0.396318  -0.0102366
x17          -0.0977642    0.102489   -0.95    0.3427  -0.301408   0.10588
x19           0.0284196    0.10075     0.28    0.7785  -0.171769   0.228608
x2          

In [8]:
# Repeat the matrix-based fit with a newly sampled predictor subset to see whether
# changing the selection affects its run time.
x_vars = sample(x_symbols, r; replace=false)
@time my_lm(:y, x_vars, df)

  0.000062 seconds (56 allocations: 43.516 KiB)


Formula: y ~ 1 + x9 + x3 + x12 + x14 + x20 + x2 + x5 + x17 + x4 + x18

────────────────────────────────────────────────────────────────────────────
                   Coef.  Std. Error      t  Pr(>|t|)   Lower 95%  Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept)   0.595399     0.168472    3.53    0.0007   0.260649   0.930149
x9           -0.116444     0.0982973  -1.18    0.2393  -0.311759   0.0788707
x3           -0.00751677   0.106634   -0.07    0.9440  -0.219397   0.204363
x12           0.102296     0.102117    1.00    0.3192  -0.100608   0.3052
x14           0.0271792    0.103701    0.26    0.7939  -0.178873   0.233231
x20           0.0264579    0.100455    0.26    0.7929  -0.173144   0.226059
x2           -0.0325485    0.101418   -0.32    0.7490  -0.234064   0.168967
x5           -0.0372667    0.0977348  -0.38    0.7039  -0.231464   0.15693
x17          -0.122076     0.103019   -1.18    0.2392  -0.326772   0.0826202
x4           -0

In [9]:
# Run the staged pipeline once more with a fresh selection; the intermediate values
# (cols, mf, mm, y, linmodel, regmodel) are inspected in the cells that follow.
x_vars = sample(x_symbols, r; replace=false)
@time F = term(:y) ~ sum(term(x) for x in x_vars)
@time cols = Tables.columntable(df)
@time mf = ModelFrame(F, cols, model=LinearModel)
@time mm = ModelMatrix(mf)
@time y = response(mf)
@time linmodel = fit(LinearModel, mm.m, y)
# Trailing semicolon suppresses the display of the fitted model.
@time regmodel = StatsModels.TableRegressionModel(linmodel, mf, mm);

  0.075462 seconds (79.67 k allocations: 5.126 MiB, 99.15% compilation time)
  0.000024 seconds (29 allocations: 1.766 KiB)
  0.058544 seconds (136.64 k allocations: 8.861 MiB, 6.95% compilation time)
  0.060075 seconds (80.32 k allocations: 5.020 MiB, 99.66% compilation time)
  0.007243 seconds (3.77 k allocations: 282.191 KiB, 99.61% compilation time)
  0.000034 seconds (21 allocations: 23.188 KiB)
  0.003435 seconds (2.75 k allocations: 196.098 KiB, 95.75% compilation time)


In [10]:
# Inspect the column table's type; it lists every column name of df as a type parameter.
typeof(cols)

NamedTuple{(:y, :x1, :x2, :x3, :x4, :x5, :x6, :x7, :x8, :x9, :x10, :x11, :x12, :x13, :x14, :x15, :x16, :x17, :x18, :x19, :x20), NTuple{21, Vector{Float64}}}

In [11]:
# Inspect the ModelFrame's type; it carries the names of the columns used by the
# formula, so it differs from one selection of x_vars to the next.
typeof(mf)

ModelFrame{NamedTuple{(:y, :x20, :x16, :x14, :x18, :x7, :x17, :x5, :x13, :x12, :x4), NTuple{11, Vector{Float64}}}, LinearModel}

In [12]:
# List the field names and field types of the ModelMatrix.
typeof(mm) |> x -> (fieldnames(x), fieldtypes(x))

((:m, :assign), (Matrix{Float64}, Vector{Int64}))

In [13]:
# Inspect the type of the response extracted from the ModelFrame.
typeof(y)

Vector{Float64} (alias for Array{Float64, 1})

In [14]:
# Inspect the type of the model fitted from the plain matrix and response vector.
typeof(linmodel)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}

In [15]:
# Inspect the type of the table-regression wrapper; unlike the ModelFrame's type, the
# printed type contains no column names.
typeof(regmodel)

StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}}}}, Matrix{Float64}}

In [16]:
# Check that the wrapper's model compares equal to the model fitted from the matrix.
regmodel.model == linmodel

true

In [17]:
# Check that the wrapper's ModelFrame compares equal to the one built above.
regmodel.mf == mf

true

In [18]:
# Check that the wrapper's ModelMatrix compares equal to the one built above.
regmodel.mm == mm

true

In [19]:
# Locate the method that the formula-based `lm` call dispatches to.
@which lm(F, df)

In [20]:
# Locate the method used when fitting from a formula and a table.
@which fit(LinearModel, F, df)

In [21]:
# Locate the method used when fitting from a plain matrix and response vector.
@which fit(LinearModel, mm.m, y)