In [2]:
using Plots
using DataFrames
using CSV
using Statistics
using Printf
import MultivariateStats
import GLM

In [31]:
# Read the metrics table written by the analyzer.
data = DataFrame(CSV.File("./tree-sitter-analyzer/outputs/all.csv"))

# Rows without a group get a placeholder label, so the string predicates
# below operate on strings only.
data[ismissing.(data.group), :group] .= "whatever"

# Derived columns: a size measure plus boolean flags computed from `group` / `lang`.
data.filesize = data.statements_total .+ data.expressions_total
data.isjupyter = (data.group .== "python-jupyter")
data.isweb = endswith.(data.group, "-web")
data.islib = endswith.(data.group, "-lib")
data.isdatascience = (data.group .== "python-datascience") .|| (data.group .== "python-jupyter")
data.ispython = (data.lang .== "python")

# Averages may be missing or non-finite; replace both with 0 (missing first,
# because `isfinite` is only applied once no `missing` is left).
for col in (:identifier_len_avg, :function_size_avg)
	data[!, col] = ifelse.(ismissing.(data[!, col]), 0, data[!, col])
	data[!, col] = ifelse.(.!isfinite.(data[!, col]), 0, data[!, col])
end

# Keep only rows with a positive, finite size, then split by row type with a
# minimum statement count for each kind.
data = data[(data.filesize .> 0) .&& isfinite.(data.filesize), :]
files = data[(data.type .== "file") .&& (data.statements_total .> 3), :]
projects = data[(data.type .== "dir-total") .&& (data.statements_total .> 10), :]
()

()

In [11]:
"""
    print_basic_stats(formula, df, stat_fn)

Print one line per predictor column of `formula` evaluated on `df`: the
predictor's name, then `stat_fn` applied to that column restricted to the rows
whose response equals 0, then to the rows whose response equals 1.

# Arguments
- `formula`: a formula (e.g. built with `GLM.@formula`); its schema is derived from `df`.
- `df`: the table the formula is evaluated on.
- `stat_fn`: a function reducing a vector of predictor values to one number
  (it is formatted with `%8.5f`), e.g. `mean`, `median` or `std`.

Rows whose response is neither 0 nor 1 are ignored. Returns `nothing`.
"""
function print_basic_stats(formula, df, stat_fn)
	applied = GLM.apply_schema(formula, GLM.schema(formula, df))
	resp, pred = GLM.modelcols(applied, df)
	_, pred_names = GLM.coefnames(applied)
	# `coefnames` can hand back a single String instead of a vector when the
	# formula has only one predictor column; `vcat` normalises both cases so
	# we never iterate over the characters of a name.
	pred_names = vcat(pred_names)
	# The row masks do not depend on the predictor, so build them once.
	rows0 = resp .== 0
	rows1 = resp .== 1
	for (i, pred_name) in enumerate(pred_names)
		@printf("%60s %8.5f %8.5f\n", pred_name, stat_fn(pred[rows0, i]), stat_fn(pred[rows1, i]))
	end
	return nothing
end

print_basic_stats (generic function with 1 method)

In [14]:
# Membership test selecting the groups compared in this cell; the commented
# variants are alternative selections.
compared_groups(g) = g in ("python-web", "python-jupyter")
# compared_groups(g) = g in ["python-datascience", "python-jupyter"]
# compared_groups(g) = startswith(g, "python-")
py = projects[compared_groups.(projects.group), :]
py_files = files[compared_groups.(files.group), :]

println("fraction of jupyter files $(mean(py_files.isjupyter)) = $(sum(py_files.isjupyter)) / $(size(py_files, 1))")
println("fraction of jupyter projects $(mean(py.isjupyter)) = $(sum(py.isjupyter)) / $(size(py, 1))")
println("mean file size = $(mean(py_files.filesize))")

# Logistic-regression formula: most predictors are counts divided by
# `filesize`; the trailing `0` drops the intercept.
formula = GLM.@formula(isjupyter ~
	filesize +
	expressions_total / statements_total + # number of expressions per statement
	binary_operators_total / filesize +
	chained_calls_total / filesize +
	class_defs_total / filesize +
	conditions_total / filesize +
	decorators_total / filesize +
	field_accesses_total / filesize +
	field_assignments_total / filesize +
	function_defs_total / filesize +
	function_size_avg +
	identifier_len_avg +
	indexing_total / filesize +
	invocations_total / filesize +
	lambda_functions_total / filesize +
	literals_total / filesize +
	loops_total / filesize +
	nested_functions_total / max(1, function_defs_total) +
	slicing_total / filesize +
	try_catches_total / filesize +
	variable_assignments_total / filesize +
	0
)
# Per-predictor statistic for non-jupyter vs. jupyter files; swap the
# reducer to look at a different statistic.
# print_basic_stats(formula, py_files, median)
print_basic_stats(formula, py_files, mean)
# print_basic_stats(formula, py_files, std)
# The fitted model is the last expression, so it is the cell's displayed value.
GLM.glm(formula, py_files, GLM.Bernoulli(), GLM.LogitLink())


fraction of jupyter files 0.18195024514254585 = 1002 / 5507
fraction of jupyter projects 0.3488372093023256 = 15 / 43
mean file size = 276.9228254948248
                                                    filesize 232.93541 474.69062
                        expressions_total / statements_total  2.86266  2.65766
                           binary_operators_total / filesize  0.01379  0.04857
                              chained_calls_total / filesize  0.02129  0.01062
                                 class_defs_total / filesize  0.01864  0.00170
                                 conditions_total / filesize  0.04106  0.01022
                                 decorators_total / filesize  0.01571  0.00084
                             field_accesses_total / filesize  0.17140  0.09472
                          field_assignments_total / filesize  0.01303  0.00802
                              function_defs_total / filesize  0.05395  0.01355
                                           function_siz

StatsModels.TableRegressionModel{GLM.GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Distributions.Bernoulli{Float64}, GLM.LogitLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}

isjupyter ~ 0 + filesize + :(expressions_total / statements_total) + :(binary_operators_total / filesize) + :(chained_calls_total / filesize) + :(class_defs_total / filesize) + :(conditions_total / filesize) + :(decorators_total / filesize) + :(field_accesses_total / filesize) + :(field_assignments_total / filesize) + :(function_defs_total / filesize) + function_size_avg + identifier_len_avg + :(indexing_total / filesize) + :(invocations_total / filesize) + :(lambda_functions_total / filesize) + :(literals_total / filesize) + :(loops_total / filesize) + :(nested_functions_total / max(1, function_defs_total)) + :(slicing_total / filesize) + :(try_catches_total / filesize) + :(variable_assignments_total / filesize)

Coefficients:
────

In [None]:
# NOTE(review): this cell rebinds the globals `compared_groups` and `formula`
# that the jupyter-vs-web cell also defines, so the result of re-running either
# cell depends on execution order.
compared_groups(g) = g in ("python-jupyter", "cs-lib")
pyvscsharp = files[compared_groups.(files.group), :]

# Same kind of intercept-free logistic formula as the jupyter-vs-web cell, with
# response `ispython`; the commented-out terms are predictors disabled for this
# comparison.
formula = GLM.@formula(ispython ~
	# filesize +
	expressions_total / statements_total + # number of expressions per statement
	binary_operators_total / filesize +
	chained_calls_total / filesize +
	# (class_defs_total / filesize) +
	conditions_total / filesize +
	# (decorators_total / filesize) +
	field_accesses_total / filesize +
	field_assignments_total / filesize +
	# (function_defs_total / filesize) +
	function_size_avg +
	identifier_len_avg +
	indexing_total / filesize +
	invocations_total / filesize +
	lambda_functions_total / filesize +
	literals_total / filesize +
	loops_total / filesize +
	# (nested_functions_total / max(1, function_defs_total)) +
	# (slicing_total / filesize) +
	# (try_catches_total / filesize) +
	variable_assignments_total / filesize +
	0
)
model = GLM.glm(formula, pyvscsharp, GLM.Bernoulli(), GLM.LogitLink())
println(model)

# Apply the file-level model to the per-project rows: build the design matrix
# from `projects` and predict on it.
println(typeof(Float64.(GLM.modelmatrix(formula, projects))))
# NOTE(review): the model's response is `ispython`, yet the prediction is stored
# as `isdatascience_pred` / column `isds` — confirm the intended naming.
isdatascience_pred = GLM.predict(model, Float64.(GLM.modelmatrix(formula, projects)))

println(DataFrame(group = projects.group, dir = projects.dir, isds = isdatascience_pred))