Skip to content

Commit

Permalink
implement @collapse, resolve #48
Browse files Browse the repository at this point in the history
  • Loading branch information
gergelyattilakiss committed Jun 23, 2024
1 parent ca4f414 commit f82de01
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 57 deletions.
28 changes: 21 additions & 7 deletions src/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,27 @@ end

"""
    rewrite(::Val{:collapse}, command::Command)

Build the escaped expression that implements the `collapse` command.

The generated code copies `command.df`, takes a row view of the copy using the
mask produced by `build_bitmask(command)`, and calls `combine` with one formula
per entry of `command.arguments` (built by `build_assignment_formula`).

When `get_by(command)` returns something other than `nothing`, the view is first
passed to `groupby` with those columns and `combine` is applied to the grouped
result; otherwise `combine` is applied to the view directly.
"""
function rewrite(::Val{:collapse}, command::Command)
    dfname = command.df
    bitmask = build_bitmask(command)
    by_cols = get_by(command)
    df2 = gensym()
    sdf = gensym()
    gsdf = gensym()
    formulas = build_assignment_formula.(command.arguments)

    # `by_cols` is already known while the expression is being built, so the
    # grouped/ungrouped decision is made here instead of emitting a runtime
    # `isnothing` branch into the generated code.
    grouped = !isnothing(by_cols)
    combine_expression = Expr(:call, :combine, grouped ? gsdf : sdf, formulas...)
    # Interpolating `nothing` yields a harmless no-op statement in the block.
    grouping = grouped ? :(local $gsdf = groupby($sdf, $by_cols)) : nothing

    return quote
        local $df2 = copy($dfname)
        local $sdf = view($df2, $bitmask, :)
        $grouping
        # Evaluated exactly once, and only after the (optional) grouping above
        # has bound the variable it refers to.
        $combine_expression
    end |> esc
end

Expand Down Expand Up @@ -84,18 +95,22 @@ function rewrite(::Val{:egen}, command::Command)
dfname = command.df
target_column = get_LHS(command.arguments[1])
by_cols = get_by(command)
@info "By columns are $by_cols"
bitmask = build_bitmask(command)
# check that target_column does not exist in dfname
df2 = gensym()
sdf = gensym()
RHS = replace_variable_references(sdf, command.arguments[1].args[2]) |> vectorize_function_calls
gsdf = gensym()
RHS = replace_variable_references(gsdf, command.arguments[1].args[2]) |> vectorize_function_calls
quote
if !($target_column in names($dfname))
local $df2 = copy($dfname)
$df2[!, $target_column] .= missing
local $sdf = view($df2, $bitmask, :)
$sdf[!, $target_column] .= $RHS
local $gsdf = groupby($sdf, $by_cols)
for (i,g) in $gsdf
g[!, $target_column] .= $RHS[i]
end
$df2 = combine($gsdf, names($gsdf))
$df2
else
ArgumentError("Column \"$($target_column)\" already exists in $(names($dfname))") |> throw
Expand All @@ -105,7 +120,6 @@ end

function get_by(command::Command)
options = command.options
@info "options are $options"
for opt in options
if opt isa Expr && opt.head == :call && opt.args[1] == :by
return opt.args[2:end]
Expand Down
117 changes: 67 additions & 50 deletions test/commands.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,73 @@ end
@test all(df2.y .== df.x)
@test all(df2.z .== minimum(df.x))
end
# Grouped `@collapse`: each call is expected to yield one `y` value per group,
# compared against hand-computed results.
@testset "Known values by group(s)" begin
# Six rows: `group` splits them into red (x = 1:3) and blue (x = 4:6);
# (`group`, `s`) yields four distinct pairs.
df = DataFrame(x = 1:6, z = 7:12, s = ["a", "b", "a", "c", "d", "d"], group = ["red", "red", "red", "blue", "blue", "blue"])
# Single grouping column.
df2 = @collapse df y = sum(x), by(group)
@test df2.y == [6, 15]
df2 = @collapse df y = minimum(x), by(group)
@test df2.y == [1, 4]
# Two grouping columns: (red, a) = {1, 3}, (red, b) = {2}, (blue, c) = {4}, (blue, d) = {5, 6}.
# The expected vectors list groups in order of first appearance in `df`.
df2 = @collapse df y = sum(x), by(group, s)
@test df2.y == [4, 2, 4, 11]
df2 = @collapse df y = minimum(x), by(group, s)
@test df2.y == [1, 2, 4, 5]
end
end

# Tests for `@egen`: the assertions below expect the result to keep every input
# row and carry one additional column.
@testset "Egen" begin
# Shared fixture: red holds x = 1:3, blue holds x = 4:6; `s` repeats within groups.
df = DataFrame(x = 1:6, s = ["a", "b", "a", "c", "d", "d"], group = ["red", "red", "red", "blue", "blue", "blue"])

# The new column appears and the existing columns are returned unchanged.
@testset "Column added" begin
df2 = @egen df y = mean(x)
@test "y" in names(df2)
@test "x" in names(df2) && "group" in names(df2)
@test df.x == df2.x
@test df.group == df2.group
end
# A reduction called without a dot is applied to the whole column and its
# single result is expected in every row.
@testset "Known values for not vectorized functions" begin
df2 = @egen df y = sum(x)
@test all(df2.y .== sum(df.x))
df2 = @egen df y = minimum(x)
@test all(df2.y .== minimum(df.x))
df2 = @egen df y = maximum(x)
@test all(df2.y .== maximum(df.x))
end
# A dotted reduction is applied element by element, so each row keeps its own value.
@testset "Known values for vectorized functions" begin
df2 = @egen df y = sum.(x)
@test all(df2.y .== df.x)
df2 = @egen df y = minimum.(x)
@test all(df2.y .== df.x)
df2 = @egen df y = maximum.(x)
@test all(df2.y .== df.x)
end
# `missing`, `nothing`, and the type names `String` / `Missing` must be used
# as written rather than being treated as references to columns of `df`.
@testset "Do not replace special variable names" begin
df2 = @egen df y = missing
@test all(ismissing.(df2.y))
df2 = @egen df y = nothing
@test all(isnothing.(df2.y))
df2 = @egen df y = s isa String
@test all(df2.y)
df2 = @egen df y = s isa Missing
@test !any(df2.y)
# NOTE(review): the result of this `@if` call is never asserted; as written
# it only checks that the call does not throw.
df2 = @egen df y = "string" @if s isa String
end
# Grouped `@egen`: every row receives the aggregate of its own group, in the
# original row order.
@testset "Known values by group(s)" begin
df2 = @egen df y = sum(x), by(group)
@test all(df2.y .== [6, 6, 6, 15, 15, 15])
df2 = @egen df y = minimum(x), by(group)
@test all(df2.y .== [1, 1, 1, 4, 4, 4])
df2 = @egen df y = maximum(x), by(group)
@test all(df2.y .== [3, 3, 3, 6, 6, 6])
# Two grouping columns: (red, a) = {1, 3}, (red, b) = {2}, (blue, c) = {4}, (blue, d) = {5, 6}.
df2 = @egen df y = sum(x), by(group, s)
@test all(df2.y .== [4, 2, 4, 4, 11, 11])
df2 = @egen df y = minimum(x), by(group, s)
@test all(df2.y .== [1, 2, 1, 4, 5, 5])
df2 = @egen df y = maximum(x), by(group, s)
@test all(df2.y .== [3, 2, 3, 4, 6, 6])
end
# Assigning to a column that already exists must raise `ArgumentError`.
@testset "Error handling" begin
@test_throws ArgumentError @egen df x = 1
end
end

@testset "Keep if" begin
Expand Down Expand Up @@ -149,56 +216,6 @@ end
end
end

# Tests for `@egen` on a four-row fixture, using the `@by group` option form.
# NOTE(review): this looks like the pre-commit copy of the "Egen" testset (the
# enclosing hunk shrinks from 56 to 6 lines) — confirm it is no longer present
# in test/commands.jl, since a same-named testset appears earlier on this page.
@testset "Egen" begin
# Shared fixture: red holds x = 1:2, blue holds x = 3:4.
df = DataFrame(x = 1:4, s = ["a", "b", "c", "d"], group = ["red", "red", "blue", "blue"])

# The new column appears and the existing columns are returned unchanged.
@testset "Column added" begin
df2 = @egen df y = mean(x)
@test "y" in names(df2)
@test "x" in names(df2) && "group" in names(df2)
@test df.x == df2.x
@test df.group == df2.group
end
# A reduction called without a dot is applied to the whole column and its
# single result is expected in every row.
@testset "Known values for not vectorized functions" begin
df2 = @egen df y = sum(x)
@test all(df2.y .== sum(df.x))
df2 = @egen df y = minimum(x)
@test all(df2.y .== minimum(df.x))
df2 = @egen df y = maximum(x)
@test all(df2.y .== maximum(df.x))
end
# A dotted reduction is applied element by element, so each row keeps its own value.
@testset "Known values for vectorized functions" begin
df2 = @egen df y = sum.(x)
@test all(df2.y .== df.x)
df2 = @egen df y = minimum.(x)
@test all(df2.y .== df.x)
df2 = @egen df y = maximum.(x)
@test all(df2.y .== df.x)
end
# `missing`, `nothing`, and the type names `String` / `Missing` must be used
# as written rather than being treated as references to columns of `df`.
@testset "Do not replace special variable names" begin
df2 = @egen df y = missing
@test all(ismissing.(df2.y))
df2 = @egen df y = nothing
@test all(isnothing.(df2.y))
df2 = @egen df y = s isa String
@test all(df2.y)
df2 = @egen df y = s isa Missing
@test !any(df2.y)
# NOTE(review): the result of this `@if` call is never asserted; as written
# it only checks that the call does not throw.
df2 = @egen df y = "string" @if s isa String
end
# Grouped `@egen` via `@by`: every row receives the aggregate of its own group.
@testset "Known values by group" begin
df2 = @egen df y = sum(x) @by group
@test all(df2.y .== [3, 3, 7, 7])
df2 = @egen df y = minimum(x) @by group
@test all(df2.y .== [1, 1, 3, 3])
df2 = @egen df y = maximum(x) @by group
@test all(df2.y .== [2, 2, 4, 4])
end
# Assigning to a column that already exists must raise `ArgumentError`.
@testset "Error handling" begin
@test_throws ArgumentError @egen df x = 1
end
end

@testset "Egen with if" begin
df = DataFrame(x = 1:4, group=["red", "red", "blue", "blue"])
dfxz = DataFrame(x = 1:4, z = 1:4, group=["red", "red", "blue", "blue"])
Expand Down

0 comments on commit f82de01

Please sign in to comment.