# Implementação do algoritmo Apriori

In [56]:
using DataFrames
using Combinatorics

In [57]:
function apriori(transactions::Vector{Vector{Int64}}, articles::Vector{Int64}, threshold::Float64=0.3, max::Int64=4)
    combination_max_size = max
    combs_of_size = [Set(collect(combinations(articles,n))) for n in collect(1:combination_max_size)]
    transactions_count = length(transactions)
    result = []
    dict_comb = Dict{Array{Int64,1}, Float64}()
    for n in collect(1:combination_max_size)
        for comb in combs_of_size[n]
            found_percentage = sum([issubset(comb,t) for t in transactions]) / transactions_count
            if found_percentage < threshold
                for j in collect(n+1:combination_max_size)
                    if length(combs_of_size[j]) > 0
                        filter!(c -> !issubset(comb, c), combs_of_size[j])
                    end
                end
            else
                dict_comb[comb] = found_percentage
                push!(result, comb)
            end
        end
    end
    return dict_comb, result
end

apriori (generic function with 3 methods)

In [58]:
function association_rules(frequent_items_dict::Dict{Vector{Int64}, Float64}, metric::String="confidence", min_threshold::Float64=0.8, min_lift::Float64=0.0)
    metric_dict = Dict{String,Function}(
            "antecedent support" => (_, sA, __) -> sA,
            "consequent support" => (_, __, sC) -> sC,
            "support" => (sAC, _, __) -> sAC,
            "confidence" => (sAC, sA, _) -> sAC ./ sA,
            "lift" => (sAC, sA, sC) -> metric_dict["confidence"](sAC, sA, sC)./sC
    )
    columns_ordered = [
        "antecedent support",
        "consequent support",
        "support",
        "confidence",
        "lift"
		]
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []
    for (k, sAC) in frequent_items_dict
        for idx in collect((length(k)-1):-1:1)
            for c in collect(combinations(k, idx))
                antecedent = collect(c)
                consequent = setdiff(k, antecedent)
      sA = frequent_items_dict[antecedent]
      sC = frequent_items_dict[consequent]
                score = metric_dict[metric](sAC, sA, sC)
                Lift = metric_dict["lift"](sAC, sA, sC)
                if score >= min_threshold && Lift >= min_lift
                        push!(rule_antecedents, antecedent)
                        push!(rule_consequents, consequent)
                        push!(rule_supports, [sAC, sA, sC])
                end
            end
        end
    end
    if isempty(rule_supports)
        return "Error"
    else
        rule_supports = hcat(rule_supports...)
        df_res = DataFrame(antecedents=rule_antecedents, consequents=rule_consequents)
  		sAC = rule_supports[1, :]
  		sA = rule_supports[2, :]
  		sC = rule_supports[3, :]
  		for m in columns_ordered
    		df_res[!, m] = metric_dict[m](sAC, sA, sC)
  		end
        return df_res
    end
end

association_rules (generic function with 4 methods)

# lendo e Transformando dados

In [31]:
using CSV
using DataFrames
using MLJ
using CategoricalArrays

In [32]:
df_init_data = DataFrame(CSV.File("Base.csv")); #importando dados
df_init_data = copy(df_init_data[:, ["NUMNOTA", "SUBCATEGORIA", "Categoria Média"]]); # Selecionando categorias importantes
df_init_data = unique(df_init_data); # Removendo ctegorias duplicadas
first(df_init_data, 5)

Row,NUMNOTA,SUBCATEGORIA,Categoria Média
Unnamed: 0_level_1,Int64,String,String15
1,139612,ALGA E NORI,RESTAURANTE
2,139612,SACHE GUARDANAPO,RESTAURANTE
3,142641,FARINHA PARA EMPANAR,RESTAURANTE
4,142641,OLEO DE ALGODAO,RESTAURANTE
5,900874,OLEOS ESPECIAIS,RESTAURANTE


In [33]:
# Displaying column types
eltype.(eachcol(df_init_data))

3-element Vector{DataType}:
 Int64
 String
 String15

In [34]:
df_focus_data = df_init_data[df_init_data."Categoria Média" .== "RESTAURANTE", [:NUMNOTA,:SUBCATEGORIA]];

In [35]:
df_focus_data.SUBCATEGORIA = CategoricalArray(df_focus_data.SUBCATEGORIA, ordered=false); # Mudando para dados categóricos

In [36]:
# Displaying column types
eltype.(eachcol(df_focus_data))

2-element Vector{DataType}:
 Int64
 CategoricalValue{String, UInt32}

In [37]:
# Agrupoando por transação
gd = groupby(df_focus_data, :NUMNOTA);

In [38]:
# Salvando transações no formato correto
trans::Vector{Vector{Int64}} = []
for k in keys(gd)
    push!(trans,Vector{Int64}(levelcode.(gd[k].SUBCATEGORIA))) # Adicionando vetor com codigo das categoras a lista trans
end

In [39]:
trans

28593-element Vector{Vector{Int64}}:
 [15, 278]
 [118, 193]
 [197, 207]
 [117, 195, 248, 288, 294, 322]
 [26, 56, 132, 165, 239, 324]
 [294, 307]
 [118, 199]
 [6, 53, 62, 93, 97, 117, 118, 121, 122, 140, 152, 154, 173, 199, 225, 288, 291]
 [38, 116, 117, 144, 145, 195, 214, 254, 324]
 [53, 108, 145, 158, 270]
 [109, 114, 180, 185, 247, 287, 294, 301, 323]
 [8, 11, 38, 81, 85, 93, 117, 278]
 [53, 64, 165, 168, 237, 254, 312]
 ⋮
 [8, 71, 117, 119, 154, 185]
 [6, 10, 13, 30, 37, 38, 53, 101, 174, 182, 192, 245, 288, 318, 323]
 [266]
 [131, 165, 239, 294, 313, 323]
 [64, 104, 242, 282]
 [30, 53, 106, 122, 224, 225, 254]
 [189, 247, 252, 268]
 [10, 13, 24, 53, 97, 99, 127, 145, 239, 247, 250, 288, 291, 323]
 [278]
 [6, 122]
 [38, 46, 56, 85, 193, 254, 317]
 [20, 115, 131, 132, 179]

In [40]:
maximum(unique(levelcode.(df_focus_data.SUBCATEGORIA))) ## Numero de itens

327

In [41]:
minimum(unique(levelcode.(df_focus_data.SUBCATEGORIA))) ## Numero de itens

1

In [42]:
# Salvando transações no formato correto
trans_str::Vector{Vector{String}} = []
for k in keys(gd)
    push!(trans_str,Vector{String}(gd[k].SUBCATEGORIA)) # Adicionando vetor com codigo das categoras a lista trans
end
trans_str

28593-element Vector{Vector{String}}:
 ["ALGA E NORI", "SACHE GUARDANAPO"]
 ["FARINHA PARA EMPANAR", "OLEO DE ALGODAO"]
 ["OLEOS ESPECIAIS", "OUTRAS MASSAS SECAS"]
 ["FARINHA DE TRIGO", "OLEO DE SOJA", "PAPEL HIGIENICO", "SACO DE LIXO", "SAL", "VASSOURAS"]
 ["ARROZ BRANCO", "CALDO DE GALINHA", "FUBA E POLENTA", "MAIONESE", "OUTROS TEMPEROS E ESPECIARIAS", "VINHO"]
 ["SAL", "TALHERES"]
 ["FARINHA PARA EMPANAR", "OUTRAS BEBIDAS"]
 ["ACUCAR REFINADO", "CAFE EM PO", "CASTANHA DE CAJU", "CREME CULINARIO", "DERIVADOS DE COCO", "FARINHA DE TRIGO", "FARINHA PARA EMPANAR", "FEIJAO CARIOCA", "FEIJAO PRETO", "GLUTAMATO MONOSSODICO", "LEITE CONDENSADO", "LEITE INTEGRAL", "MEXEDOR", "OUTRAS BEBIDAS", "OUTROS FEIJOES", "SACO DE LIXO", "SACO PLASTICO (BOBINA OU ZIP)"]
 ["AZEITONAS", "FARINHA DE ROSCA", "FARINHA DE TRIGO", "GUARDANAPO", "HAMBURGUEIRAS E ESTOJOS", "OLEO DE SOJA", "OUTROS CALDOS SOPAS E CREMES", "PEPINO EM CONSERVA", "VINHO"]
 ["CAFE EM PO", "ESPETO", "HAMBURGUEIRAS E ESTOJOS", "LIMPA A

In [43]:
dict_codes = Dict(levelcode.(df_focus_data.SUBCATEGORIA) .=> Vector{String}(df_focus_data.SUBCATEGORIA))

Dict{Int64, String} with 327 entries:
  316 => "TOMATE PELADO"
  214 => "OUTROS CALDOS SOPAS E CREMES"
  77  => "CHOCOLATE EM BARRA MEIO AMARGO"
  318 => "TOUCA DESCARTAVEL"
  121 => "FEIJAO CARIOCA"
  154 => "LEITE INTEGRAL"
  143 => "GRAO DE BICO"
  170 => "MASCARA DESCARTAVEL"
  242 => "OVO DE CODORNA"
  166 => "MANTEIGA POTE E TABLETE"
  83  => "COBERTURAS E CALDAS"
  286 => "SACHES MOSTARDA"
  308 => "TALHERES DE MESA"
  137 => "GENGIBRE"
  97  => "DERIVADOS DE COCO"
  30  => "ARROZ PARBOILIZADO"
  307 => "TALHERES"
  115 => "FARINHA DE MILHO"
  101 => "DETERGENTE (LAVA LOUCAS)"
  8   => "AGUA COM GAS"
  183 => "MOLHO DE TOMATE"
  206 => "OUTRAS MASSAS FRESCAS"
  320 => "TRIGO PARA QUIBE"
  194 => "OLEO DE GERGELIM"
  71  => "CHAS"
  ⋮   => ⋮

# Rodando o Algoritmo

In [59]:
@time begin
itens = collect(1:327); #articles are represented as unique ids from 1 to 327
threshold = 0.001;
transactions = trans;
dict_comb, frequentes = apriori(transactions, itens, threshold, 2);
df_res = association_rules(dict_comb, "confidence", 0.2);
end

 72.943079 seconds (1.44 M allocations: 1.192 GiB, 0.16% gc time)


Row,antecedents,consequents,antecedent support,consequent support,support,confidence,lift
Unnamed: 0_level_1,Any,Any,Float64,Float64,Float64,Float64,Float64
1,[154],[239],0.100829,0.174273,0.0242717,0.240721,1.38129
2,[25],[117],0.00580562,0.17235,0.00129402,0.222892,1.29325
3,[50],[22],0.0287133,0.0850208,0.00654006,0.227771,2.679
4,[62],[6],0.01322,0.126709,0.00297276,0.224868,1.77467
5,[198],[294],0.0109118,0.172455,0.00402196,0.36859,2.13731
6,[79],[109],0.0152485,0.102088,0.00409191,0.268349,2.6286
7,[230],[117],0.0413388,0.17235,0.0094079,0.22758,1.32046
8,[289],[6],0.00419683,0.126709,0.00101423,0.241667,1.90725
9,[183],[106],0.0546637,0.13958,0.0131501,0.240563,1.72348
10,[96],[114],0.0759976,0.116147,0.0155982,0.205246,1.76712


In [60]:
df_res.antecedents = first.(df_res.antecedents); # Transformando em Int
df_res.consequents = first.(df_res.consequents); # Transformando em Int

In [61]:
df_res[!, :antecedents_str] = [dict_codes[x] for x ∈ df_res.antecedents]; # Texto dos itens
df_res[!, :consequents_str] = [dict_codes[x] for x ∈ df_res.consequents]; # Texto dos itens

In [62]:
df_res = sort!(df_res, order(:lift, rev=true)) # ordenar regras

Row,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,antecedents_str,consequents_str
Unnamed: 0_level_1,Int64,Int64,Float64,Float64,Float64,Float64,Float64,String,String
1,49,271,0.00167873,0.00342741,0.00108418,0.645833,188.432,CABOS PARA RODOS E VASSOURAS,RODOS
2,271,49,0.00342741,0.00167873,0.00108418,0.316327,188.432,RODOS,CABOS PARA RODOS E VASSOURAS
3,80,81,0.0129402,0.00877837,0.00458154,0.354054,40.3325,COBERTURA CHOCOLATE AO LEITE,COBERTURA CHOCOLATE BRANCO
4,81,80,0.00877837,0.0129402,0.00458154,0.521912,40.3325,COBERTURA CHOCOLATE BRANCO,COBERTURA CHOCOLATE AO LEITE
5,277,279,0.00891827,0.0110866,0.00395202,0.443137,39.9704,SACHE AZEITE,SACHE VINAGRE
6,279,277,0.0110866,0.00891827,0.00395202,0.356467,39.9704,SACHE VINAGRE,SACHE AZEITE
7,103,146,0.00468646,0.0067499,0.00115413,0.246269,36.4848,EMBALAGENS ORIENTAIS,HASHI
8,75,76,0.00234323,0.0163327,0.00115413,0.492537,30.1566,CHOCOLATE EM BARRA BLEND,CHOCOLATE EM BARRA BRANCO
9,15,28,0.0111566,0.0148288,0.00493128,0.442006,29.8073,ALGA E NORI,ARROZ JAPONES
10,28,15,0.0148288,0.0111566,0.00493128,0.332547,29.8073,ARROZ JAPONES,ALGA E NORI


In [63]:
CSV.write("out.csv", df_res)

"out.csv"

# Gerando Recomendações

In [64]:
function arl_recommender(rules_df, product_id, rec_count=1)
    sorted_rules = rules_df
    recommendation_list = []
    for i in 1:size(sorted_rules, 1)
        product = sorted_rules[!, "antecedents_str"][i]
        if product in product_id
            push!(recommendation_list, sorted_rules[!, "consequents_str"][i])
        end
    end
    if rec_count > size(product_id, 1)
        rec_count = size(product_id, 1)
    end
    recommendation_list = union(recommendation_list)
    recommendation_list = setdiff(recommendation_list,product_id)
    return recommendation_list[1:rec_count]
end

arl_recommender (generic function with 2 methods)

In [78]:
t1 = ["RODOS", "SUCO NECTAR", "MILHO VERDE", "LEITE INTEGRAL", "CAFE EM PO", "SOBREMESAS EM PO", "ACUCAR REFINADO", "VINAGRE", "OUTROS TEMPEROS E ESPECIARIAS", "FARINHA DE TRIGO"]

10-element Vector{String}:
 "RODOS"
 "SUCO NECTAR"
 "MILHO VERDE"
 "LEITE INTEGRAL"
 "CAFE EM PO"
 "SOBREMESAS EM PO"
 "ACUCAR REFINADO"
 "VINAGRE"
 "OUTROS TEMPEROS E ESPECIARIAS"
 "FARINHA DE TRIGO"

In [79]:
r1 = arl_recommender(df_res, t2, 10)

10-element Vector{Any}:
 "CABOS PARA RODOS E VASSOURAS"
 "VASSOURAS"
 "OUTROS CEREAIS E SEMENTES"
 "OLEO DE SOJA"
 "AGUA SANITARIA"
 "BATATA PALHA"
 "ARROZ PARBOILIZADO"
 "AMIDO DE MILHO"
 "DERIVADOS DE COCO"
 "ARROZ BRANCO"

In [80]:
t2 = ["AZEITONAS", "BISCOITOS E BOLINHOS", "CALDO DE GALINHA", "COGUMELOS", "OLEO DE ALGODAO", "PEPINO EM CONSERVA", "TOMATE SECO"]

7-element Vector{String}:
 "AZEITONAS"
 "BISCOITOS E BOLINHOS"
 "CALDO DE GALINHA"
 "COGUMELOS"
 "OLEO DE ALGODAO"
 "PEPINO EM CONSERVA"
 "TOMATE SECO"

In [81]:
r2 = arl_recommender(df_res, t3, 5)

5-element Vector{Any}:
 "CALDO DE CARNE"
 "OUTROS CALDOS SOPAS E CREMES"
 "PALMITO"
 "SOBREMESAS EM PO"
 "CONSERVAS DOCES"