In [1]:
using DataFrames
using CSVFiles
using StatsBase
using LinearAlgebra

In [24]:
function min_max_scale(x)
    dt = fit(UnitRangeTransform, x, dims=1)
    StatsBase.transform(dt, x)
end

data = load("processed_data_5f34929a.csv")|>DataFrame
rename!(data, :sales=>:Sales,
    Symbol("Customer Id")=>:ID,
    :GR=>:Growth,
)
data = data[!,["Province","City","ID",
               "Customer Name","Potential","Sales",
               "ALL_CUST","MA_CUST",
               "VR_CUST","VR_CALL_CNT","VEEVA_CNT",
               "SPK_CNT","EMEET_CNT","ADD_WECHAT_CNT",
               "sales_half_yr_pre","sales_half_yr","hospital_segment","Group"]]

data[!,:Growth] = data.sales_half_yr./data.sales_half_yr_pre.-1
data[!,:Growth] = round.(data.Growth,digits=1)
# replace!(data[!,:Growth],NaN=>0,Inf=>1)
sort!(data, [:Province,:hospital_segment,:Group,:Growth]);

In [25]:
data[data.hospital_segment.=="Top Plus",:]

Unnamed: 0_level_0,Province,City,ID,Customer Name,Potential,Sales,ALL_CUST,MA_CUST,VR_CUST,VR_CALL_CNT,VEEVA_CNT,SPK_CNT,EMEET_CNT,ADD_WECHAT_CNT,sales_half_yr_pre,sales_half_yr,hospital_segment,Group,Growth
Unnamed: 0_level_1,String,String,Int64,String,Float64,Float64,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Float64,Float64,String,String,Float64
1,四川省,绵阳市,900002716,绵阳市中心医院,6428060.0,137475.0,22.0,6.0,4.0,0,72,1,0,0,26839.4,29541.4,Top Plus,LH,0.1
2,四川省,绵阳市,900011698,四川绵阳四0四医院,2191600.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0.0,0.0,Top Plus,LH,
3,宁夏回族自治区,中卫市,900035514,宁夏回族自治区人民医院宁南医院,15462600.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,Top Plus,LL,
4,山西省,运城市,900013462,运城市中心医院(东院),806510.0,39322.9,6.0,4.0,4.0,0,0,0,0,0,15540.1,11508.9,Top Plus,LH,-0.3
5,新疆维吾尔自治区,新疆自治区直辖县级行政区划,900002378,石河子大学医学院第一附属医院,1826130.0,297537.0,16.0,16.0,16.0,24,21,2,2,0,35239.6,76817.6,Top Plus,HL,1.2
6,河南省,开封市,900005310,尉氏县中心医院,503349.0,460824.0,8.0,7.0,7.0,0,2,0,0,0,240092.0,30083.7,Top Plus,HH,-0.9
7,甘肃省,酒泉市,900002365,酒泉市人民医院,2024820.0,138760.0,6.0,6.0,4.0,13,85,0,0,0,25229.2,100917.0,Top Plus,HL,3.0


In [26]:
n = nrow(data)
n_partition = 3
m = nrow(data) % 3
partition = repeat(collect(1:3), n ÷ 3)
append!(partition, collect(1:3)[begin:m]);
partition[data.hospital_segment.=="Top Plus"] = [1,2,3,1,2,3,1];

In [27]:
function objective(partition, data, n=3)
    group_sales = [data.sales_half_yr[partition .== i]|>sum for i ∈ 1:n_partition] |> std
    group_potential = [data.Potential[partition .== i]|>sum for i ∈ 1:n_partition] |> std
    group_cust_counts = [data.ALL_CUST[partition .== i]|>sum for i ∈ 1:n_partition] |> std
    present = [data.sales_half_yr[partition .== i]|>sum for i ∈ 1:n_partition]
    ex = [data.sales_half_yr_pre[partition .== i]|>sum for i ∈ 1:n_partition]
    group_growth = present ./ ex .-1 |> std
    top_plus_std = [(partition.==i) .& (data.hospital_segment.=="Top Plus") |> sum for i ∈ 1:n_partition] |> std
    [group_sales,group_potential,group_cust_counts,group_growth,top_plus_std]
end

ex_ϵ = ϵ = objective(partition, data, n_partition)
objective(partition,data)

5-element Vector{Float64}:
 410205.30987009057
      1.6504437210041815e7
     87.21429546429492
      0.05159533228859545
      0.5773502691896258

In [28]:
function find_partition(data,ϵ=ϵ,ex_ϵ=ex_ϵ)
    total_no_of_improvment = 0
    for t ∈ 1:10
        no_of_improvment = 0
        for loc1 ∈ 1:n
            for loc2 ∈ loc1:n

                a1 = partition[loc1]; a2 = partition[loc2]
                    partition[loc1] = a2; partition[loc2] = a1

                    ϵ′ = objective(partition, data, n_partition)
                    if ϵ′[1] <= ϵ[1] && ϵ′[2] <= ϵ[2] && ϵ′[3] <= ϵ[3] && ϵ′[4] <= ϵ[4]
                        ϵ = ϵ′
                        no_of_improvment = no_of_improvment + 1
                    else
                        partition[loc1] = a1; partition[loc2] = a2
                end
            end
        end
        total_no_of_improvment = total_no_of_improvment + no_of_improvment
        println("Number of improvments: $(total_no_of_improvment)")
        println("Best objective value: $(ϵ)")
        println("Ex-objective value: $(ex_ϵ)")
        norm_difference = norm(ex_ϵ.- ϵ)
        println("Norm difference: $(norm_difference)")
        (no_of_improvment == 0 || norm_difference <= 1e-2) && break
        ex_ϵ=ϵ
    end
    println("Total Number of improvments: $(total_no_of_improvment)")
    println("################################################")
    return partition
end

partition = find_partition(data);

Number of improvments: 488825
Best objective value: [4083.6775872814897, 34.884955380578766, 26.083200212652844, 0.00013944890073084866, 0.5773502691896258]
Ex-objective value: [410205.30987009057, 1.6504437210041815e7, 87.21429546429492, 0.05159533228859545, 0.5773502691896258]
Norm difference: 1.6509398259544883e7
Number of improvments: 977623
Best objective value: [4067.010795851914, 16.941765301469882, 26.083200212652844, 9.103042133227178e-5, 0.5773502691896258]
Ex-objective value: [4083.6775872814897, 34.884955380578766, 26.083200212652844, 0.00013944890073084866, 0.5773502691896258]
Norm difference: 24.489589763292443
Number of improvments: 1466432
Best objective value: [1622.4208036304442, 16.941436616358917, 26.083200212652844, 1.1326650777078842e-5, 0.5773502691896258]
Ex-objective value: [4067.010795851914, 16.941765301469882, 26.083200212652844, 9.103042133227178e-5, 0.5773502691896258]
Norm difference: 2444.5899922214926
Number of improvments: 1955222
Best objective value:

In [29]:
insertcols!(data,1,(:Partition=>partition));

In [42]:
res = combine(groupby(data,:Partition),AsTable([:sales_half_yr, :sales_half_yr_pre]) => (x -> sum(x.sales_half_yr)/sum(x.sales_half_yr_pre)-1)=>Symbol("Growth"),names(data)[6:end-3].=>sum)

Unnamed: 0_level_0,Partition,Growth,Potential_sum,Sales_sum,ALL_CUST_sum,MA_CUST_sum,VR_CUST_sum,VR_CALL_CNT_sum,VEEVA_CNT_sum,SPK_CNT_sum,EMEET_CNT_sum,ADD_WECHAT_CNT_sum,sales_half_yr_pre_sum,sales_half_yr_sum
Unnamed: 0_level_1,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Float64,Float64
1,1,0.0671318,518498000.0,60709200.0,1105.0,915.0,749.0,321,2354,353,160,39,17521700.0,18697900.0
2,2,0.0671095,518498000.0,59715400.0,1145.0,990.0,832.0,261,2087,421,166,57,17522800.0,18698800.0
3,3,0.0671239,518498000.0,60595600.0,1154.0,984.0,838.0,150,1659,364,154,40,17524700.0,18701100.0


In [43]:
unstack(sort(combine(groupby(data,[:Partition,:hospital_segment]),nrow),:Partition),:Partition,:hospital_segment,:nrow)

Unnamed: 0_level_0,Partition,Next,Others,Top,CHC,Top Plus
Unnamed: 0_level_1,Int64,Int64?,Int64?,Int64?,Int64?,Int64?
1,1,162,354,39,12,3
2,2,161,360,36,12,2
3,3,159,359,35,15,2


In [44]:
unstack(sort(combine(groupby(data,[:Partition,:Group]),nrow),:Partition),:Partition,:Group,:nrow)

Unnamed: 0_level_0,Partition,LH,HL,LL,HH
Unnamed: 0_level_1,Int64,Int64?,Int64?,Int64?,Int64?
1,1,275,161,45,89
2,2,269,169,45,88
3,3,275,166,41,88


In [33]:
unstack(sort(combine(groupby(data,[:Partition,:Province]),nrow),:Partition),:Partition,:Province,:nrow)

Unnamed: 0_level_0,Partition,云南省,内蒙古自治区,吉林省,四川省,宁夏回族自治区,安徽省,山西省,广西壮族自治区,新疆维吾尔自治区,江苏省,江西省,河北省,河南省,湖北省,湖南省,甘肃省,西藏自治区,贵州省,辽宁省,陕西省,青海省,黑龙江省
Unnamed: 0_level_1,Int64,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?,Int64?
1,1,22,32,13,43,15,33,57,6,15,19,1,23,37,45,26,35,5,18,35,52,16,22
2,2,17,35,11,48,15,30,57,7,16,22,2,24,36,45,22,36,4,18,31,58,17,20
3,3,20,38,9,42,16,30,57,7,15,22,1,23,36,50,22,33,4,16,38,54,15,22


In [35]:
combine(groupby(data,:Partition),nrow)

Unnamed: 0_level_0,Partition,nrow
Unnamed: 0_level_1,Int64,Int64
1,1,570
2,2,571
3,3,570


In [22]:
sort!(data,:Province);
data[data.Partition.==1,:]

Unnamed: 0_level_0,Partition,Province,City,ID,Customer Name,Potential,Sales,ALL_CUST,MA_CUST,VR_CUST,VR_CALL_CNT,VEEVA_CNT,SPK_CNT,EMEET_CNT,ADD_WECHAT_CNT,sales_half_yr_pre,sales_half_yr,hospital_segment,Group,Growth
Unnamed: 0_level_1,Int64,String,String,Int64,String,Float64,Float64,Float64,Float64,Float64,Int64,Int64,Int64,Int64,Int64,Float64,Float64,String,String,Float64
1,1,云南省,普洱市,900019156,普洱市中医医院,1.03322e6,5924.3,0.0,0.0,0.0,0,0,0,0,0,335.338,0.0,Next,LH,-1.0
2,1,云南省,普洱市,900009820,景谷县中医院,727058.0,1.00991e5,0.0,0.0,0.0,0,0,0,0,0,38677.6,17190.0,Next,LH,-0.6
3,1,云南省,玉溪市,900011471,峨山县中医医院,691566.0,1.47102e5,0.0,0.0,0.0,0,0,0,0,0,51418.5,36887.2,Next,LH,-0.3
4,1,云南省,西双版纳傣族自治州,900006207,景洪市人民医院,1.47954e6,195788.0,1.0,1.0,1.0,0,3,1,0,0,81905.7,73774.3,Next,LH,-0.1
5,1,云南省,楚雄彝族自治州,900008654,姚安县中医医院,833482.0,24066.1,0.0,0.0,0.0,0,0,0,0,0,8595.02,9024.77,Next,LH,0.1
6,1,云南省,楚雄彝族自治州,900008655,姚安县人民医院,1.35001e6,24066.1,0.0,0.0,0.0,0,0,0,0,0,9024.77,10743.8,Next,LH,0.2
7,1,云南省,玉溪市,900003242,峨山县人民医院,1.50856e6,60807.9,0.0,0.0,0.0,0,0,0,0,0,13413.5,27721.3,Next,LH,1.1
8,1,云南省,昭通市,900013761,昭通市第一人民医院,5.76055e6,32716.6,2.0,1.0,1.0,0,1,0,0,0,1530.18,29509.7,Next,LH,18.3
9,1,云南省,文山壮族苗族自治州,900002323,文山州人民医院,2.56341e6,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,Next,LH,
10,1,云南省,玉溪市,900002335,江川县人民医院,2.67685e6,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,Next,LH,


In [None]:
ENV["ROWS"]="500"

In [37]:
provincial = sort(combine(groupby(data,[:Partition,:Province]),nrow),[:Province,:Partition])

Unnamed: 0_level_0,Partition,Province,nrow
Unnamed: 0_level_1,Int64,String,Int64
1,1,云南省,22
2,2,云南省,17
3,3,云南省,20
4,1,内蒙古自治区,32
5,2,内蒙古自治区,35
6,3,内蒙古自治区,38
7,1,吉林省,13
8,2,吉林省,11
9,3,吉林省,9
10,1,四川省,43


In [40]:
combine(groupby(provincial,:Province),:nrow=>std)[!,2]|>std

1.0728200944131059

In [41]:
import ExcelFiles
ExcelFiles.save("Partitioned.xlsx",data)

In [None]:
combine(groupby(data[data.ALL_CUST .≠ 0,:],:Partition),:Sales=>sum,:Potential=>sum,:ALL_CUST=>sum,:MA_CUST=>sum,:VR_CUST=>sum)