In [1]:
using DataFrames
using CSVFiles
using StatsBase
using LinearAlgebra

In [11]:
function min_max_scale(x)
    dt = fit(UnitRangeTransform, x, dims=1)
    StatsBase.transform(dt, x)
end

data = load("processed_data_5f34929a.csv")|>DataFrame
rename!(data, :sales=>:Sales,
    Symbol("Customer Id")=>:ID,
    :GR=>:Growth,
)
data = data[!,["Province","City","ID",
               "Customer Name","Potential","Sales",
               "ALL_CUST","MA_CUST",
               "VR_CUST","VR_CALL_CNT","VEEVA_CNT",
               "SPK_CNT","EMEET_CNT","ADD_WECHAT_CNT",
               "sales_half_yr_pre","sales_half_yr","hospital_segment","Group"]]
data[!,:Growth] = data.sales_half_yr./data.sales_half_yr_pre.-1
sort!(data, [:Province,:Group,:hospital_segment,:sales_half_yr]);

In [12]:
n = nrow(data)
n_partition = 3
m = nrow(data) % 3
partition = repeat(collect(1:3), n ÷ 3)
append!(partition, collect(1:3)[begin:m]);

In [13]:
function objective(partition, data, n=3)
    group_sales = [data.sales_half_yr[partition .== i]|>sum for i ∈ 1:n] |> std
    group_potential = [data.Potential[partition .== i]|>sum for i ∈ 1:n] |> std
    group_cust_counts = [data.ALL_CUST[partition .== i]|>sum for i ∈ 1:n] |> std
    present = [data.sales_half_yr[partition .== i]|>sum for i ∈ 1:n_partition]
    ex = [data.sales_half_yr_pre[partition .== i]|>sum for i ∈ 1:n_partition]
    group_growth = present ./ ex .-1 |> std
    [group_sales,group_potential,group_cust_counts,group_growth]
end

ex_ϵ = ϵ = objective(partition, data, n_partition)
objective(partition,data)

4-element Vector{Float64}:
  1.3182544855688596e6
  2.034205558246396e7
 39.068316233660916
  0.059341589260075045

In [14]:
function find_partition(data,ϵ=ϵ,ex_ϵ=ex_ϵ)
    total_no_of_improvment = 0
    for t ∈ 1:10
        no_of_improvment = 0
        for loc1 ∈ 1:n
            for loc2 ∈ loc1:n

                a1 = partition[loc1]; a2 = partition[loc2]
                    partition[loc1] = a2; partition[loc2] = a1

                    ϵ′ = objective(partition, data, n_partition)
                    if ϵ′[1] <= ϵ[1] && ϵ′[2] <= ϵ[2] && ϵ′[3] <= ϵ[3] && ϵ′[4] <= ϵ[4]
                        ϵ = ϵ′
                        no_of_improvment = no_of_improvment + 1
                    else
                        partition[loc1] = a1; partition[loc2] = a2
                end
            end
        end
        total_no_of_improvment = total_no_of_improvment + no_of_improvment
        println("Number of improvments: $(total_no_of_improvment)")
        println("Best objective value: $(ϵ)")
        println("Ex-objective value: $(ex_ϵ)")
        norm_difference = norm(ex_ϵ.- ϵ)
        println("Norm difference: $(norm_difference)")
        (no_of_improvment == 0 || norm_difference <= 1e-5) && break
        ex_ϵ=ϵ
    end
    println("Total Number of improvments: $(total_no_of_improvment)")
    println("################################################")
    return partition
end

partition = find_partition(data);

Number of improvments: 488870
Best objective value: [626761.8296526085, 8.976355037349638, 0.5773502691896258, 0.02218729394107298]
Ex-objective value: [1.3182544855688596e6, 2.034205558246396e7, 39.068316233660916, 0.059341589260075045]
Norm difference: 2.035379626064325e7
Number of improvments: 977651
Best objective value: [626761.8296526064, 8.976260216626997, 0.5773502691896258, 0.022187293941072814]
Ex-objective value: [626761.8296526085, 8.976355037349638, 0.5773502691896258, 0.02218729394107298]
Norm difference: 9.482072266393033e-5
Number of improvments: 1466444
Best objective value: [626761.8296526064, 8.976110483412034, 0.5773502691896258, 0.022187293941072814]
Ex-objective value: [626761.8296526064, 8.976260216626997, 0.5773502691896258, 0.022187293941072814]
Norm difference: 0.00014973321496292158
Number of improvments: 1955240
Best objective value: [626761.8296526053, 8.976110472499654, 0.5773502691896258, 0.022187293941072776]
Ex-objective value: [626761.8296526064, 8.976

In [15]:
insertcols!(data,1,(:Partition=>partition));

In [16]:
res = combine(groupby(data,:Partition),AsTable([:sales_half_yr, :sales_half_yr_pre]) => (x -> sum(x.sales_half_yr)/sum(x.sales_half_yr_pre)-1)=>Symbol("Growth"),names(data)[6:end-3].=>sum)

Unnamed: 0_level_0,Partition,Growth,Potential_sum,Sales_sum,ALL_CUST_sum,MA_CUST_sum,VR_CUST_sum,VR_CALL_CNT_sum,VEEVA_CNT_sum,SPK_CNT_sum,EMEET_CNT_sum,ADD_WECHAT_CNT_sum,sales_half_yr_pre_sum,sales_half_yr_sum
Unnamed: 0_level_1,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Float64,Float64
1,1,0.0749462,518498000.0,59170700.0,1134.0,955.0,796.0,268,2219,368,150,43,17737400.0,19066800.0
2,2,0.0418637,518498000.0,59833800.0,1135.0,974.0,815.0,147,2040,383,186,46,17253300.0,17975600.0
3,3,0.0840172,518498000.0,62015700.0,1135.0,960.0,808.0,317,1841,387,144,47,17578500.0,19055400.0


In [17]:
unstack(sort(combine(groupby(data,[:Partition,:hospital_segment]),nrow),:Partition),:Partition,:hospital_segment,:nrow)

Unnamed: 0_level_0,Partition,Next,Others,Top,CHC,Top Plus
Unnamed: 0_level_1,Int64,Int64?,Int64?,Int64?,Int64?,Int64?
1,1,158,360,36,13,4
2,2,164,353,40,12,1
3,3,160,360,34,14,2


In [18]:
unstack(sort(combine(groupby(data,[:Partition,:Group]),nrow),:Partition),:Partition,:Group,:nrow)

Unnamed: 0_level_0,Partition,LH,HL,LL,HH
Unnamed: 0_level_1,Int64,Int64?,Int64?,Int64?,Int64?
1,1,278,163,42,88
2,2,271,166,43,90
3,3,270,167,46,87


In [19]:
unstack(sort(combine(groupby(data,[:Partition,:Province]),nrow),:Partition),:Partition,:Province,:nrow)

Unnamed: 0_level_0,Partition,云南省,内蒙古自治区,吉林省,四川省,宁夏回族自治区,安徽省,山西省,广西壮族自治区,新疆维吾尔自治区,江苏省,江西省,河北省,河南省,湖北省,湖南省,甘肃省,西藏自治区,贵州省,辽宁省,陕西省,青海省,黑龙江省
Unnamed: 0_level_1,Int64,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?
1,1,24,35,10,45,15,29,57,5,15,21,1,22,38,45,24,35,5,19,37,54,16,19
2,2,22,36,10,43,15,34,55,8,15,22,2,22,34,48,24,34,4,16,33,55,16,22
3,3,13,34,13,45,16,30,59,7,16,20,1,26,37,47,22,35,4,17,34,55,16,23


In [None]:
sort!(data,:Province);
data[data.Partition.==1,:]

In [None]:
ENV["ROWS"]="500"

In [11]:
provincial = sort(combine(groupby(data,[:Partition,:Province]),nrow),[:Province,:Partition])

Unnamed: 0_level_0,Partition,Province,nrow
Unnamed: 0_level_1,Int64,String,Int64
1,1,云南省,20
2,2,云南省,22
3,3,云南省,17
4,1,内蒙古自治区,28
5,2,内蒙古自治区,44
6,3,内蒙古自治区,33
7,1,吉林省,9
8,2,吉林省,14
9,3,吉林省,10
10,1,四川省,38


In [14]:
combine(groupby(provincial,:Province),:nrow=>std)[!,2]|>std

2.326715050083571

In [16]:
import ExcelFiles
ExcelFiles.save("Partitioned.xlsx",data)

In [None]:
combine(groupby(data[data.ALL_CUST .≠ 0,:],:Partition),:Sales=>sum,:Potential=>sum,:ALL_CUST=>sum,:MA_CUST=>sum,:VR_CUST=>sum)