# Wholesale customer segmentation with percentage preprocessing

### Importing libraries 

In [1]:
import org.apache.spark.mllib.clustering._
import org.apache.spark.mllib.linalg._
import org.apache.spark.rdd._
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.sql.functions._
import sqlContext.implicits._

### Read data from source

In [2]:
// Load the raw CSV file as an RDD of text lines
var tmp = sc.textFile("data/customer.csv")
// The first line is the header row: keep it aside and drop it from the data
var header = tmp.first()
var data = tmp.filter(line => line != header)
// Preview the first five records, then count all of them
for (line <- data.take(5)) println(line)
data.count()

2,3,12669,9656,7561,214,2674,1338
2,3,7057,9810,9568,1762,3293,1776
2,3,6353,8808,7684,2405,3516,7844
1,3,13265,1196,4221,6404,507,1788
2,3,22615,5410,7198,3915,1777,5185


440

### Data preprocessing

In [3]:
// Break every CSV line into its comma-separated fields and read each field as an Int
var DataInt = data.map { line =>
    line.split(",").map(field => field.toInt)
}

In [4]:
/**
 * Converts one parsed CSV record into per-category percentages.
 *
 * The six amounts at indices 2 to 7 are summed, and each amount is re-expressed as a
 * percentage of that sum. Indices 0 and 1 are passed through unchanged.
 *
 * @param row integer fields of one record; at least 8 fields are required
 * @return (row(0), row(1), the six percentages in column order, sum of the six amounts);
 *         when the sum is 0 all six percentages are 0.0
 * @throws IllegalArgumentException if `row` has fewer than 8 fields
 */
def elab(row: Array[Int]) : (Int, Int, Double, Double, Double, Double, Double, Double, Int) = {
    require(row.length >= 8, s"expected at least 8 fields per row, got ${row.length}")
    // The six category amounts of this record
    val income = row.slice(2, 8)
    // Total of the six amounts for this single record
    val tot = income.sum
    // Guard the division: 0.0 / 0 evaluates to NaN, and NaN features would corrupt
    // the vectors that are later fed to the clustering step
    val perc =
        if (tot == 0) Array.fill(income.length)(0.0)
        else income.map(_.toDouble * 100 / tot)
    (row(0), row(1), perc(0), perc(1), perc(2), perc(3), perc(4), perc(5), tot)
}

In [5]:
// Run elab on every parsed row
var DataElab = DataInt.map(row => elab(row))

In [6]:
// Print the first ten processed rows, labelling every value with its CSV column name
var ArrHeader = header.split(",")
DataElab.take(10).map { case (channel, region, p0, p1, p2, p3, p4, p5, total) =>
    // The six percentages belong to header columns 2 to 7, in the same order
    val shares = Seq(p0, p1, p2, p3, p4, p5).zipWithIndex.map { case (pct, i) =>
        ArrHeader(i + 2) + ":" + "%3.1f".format(pct) + "% "
    }
    ArrHeader(0) + ":" + channel.toString + " " +
    ArrHeader(1) + ":" + region.toString + " " +
    shares.mkString + "Tot:" + total
}.foreach(println)

Channel:2 Region:3 Fresh:37.1% Milk:28.3% Grocery:22.2% Frozen:0.6% Detergents_Paper:7.8% Delicassen:3.9% Tot:34112
Channel:2 Region:3 Fresh:21.2% Milk:29.5% Grocery:28.8% Frozen:5.3% Detergents_Paper:9.9% Delicassen:5.3% Tot:33266
Channel:2 Region:3 Fresh:17.4% Milk:24.1% Grocery:21.0% Frozen:6.6% Detergents_Paper:9.6% Delicassen:21.4% Tot:36610
Channel:1 Region:3 Fresh:48.4% Milk:4.4% Grocery:15.4% Frozen:23.4% Detergents_Paper:1.9% Delicassen:6.5% Tot:27381
Channel:2 Region:3 Fresh:49.1% Milk:11.7% Grocery:15.6% Frozen:8.5% Detergents_Paper:3.9% Delicassen:11.2% Tot:46100
Channel:2 Region:3 Fresh:35.2% Milk:30.9% Grocery:19.2% Frozen:2.5% Detergents_Paper:6.7% Delicassen:5.4% Tot:26710
Channel:2 Region:3 Fresh:45.8% Milk:12.1% Grocery:26.4% Frozen:1.8% Detergents_Paper:11.9% Delicassen:2.1% Tot:26465
Channel:2 Region:3 Fresh:25.7% Milk:16.8% Grocery:31.9% Frozen:5.7% Detergents_Paper:11.3% Delicassen:8.7% Tot:29517
Channel:1 Region:3 Fresh:31.9% Milk:19.5% Grocery:33.1% Frozen:2.3% 

In [7]:
// Prepare input for Clustering
// Only the six percentage columns (tuple fields 3 to 8) are used as features.
// The RDD is cached because the iterative KMeans training, Statistics.colStats and
// model.predict each traverse it; without caching every pass would recompute the
// whole lineage (file read, parsing, elab).
var customer = DataElab.map(t=>Vectors.dense(Array(t._3,t._4,t._5,t._6,t._7,t._8))).cache()

In [8]:
// Show the first five feature vectors
for (features <- customer.take(5)) println(features)

[37.13942307692308,28.306754221388367,22.165220450281424,0.6273452157598499,7.838883677298312,3.922373358348968]
[21.213851981001625,29.48956892923706,28.762099440870557,5.296699332651957,9.898995971863163,5.338784344375639]
[17.35318219065829,24.05900027314941,20.98880087407812,6.5692433761267415,9.603933351543294,21.42583993444414]
[48.446002702604,4.367992403491472,15.415799276870823,23.388481063511193,1.8516489536539937,6.530075599868522]
[49.05639913232104,11.735357917570498,15.613882863340564,8.49240780911063,3.8546637744034706,11.247288503253795]


### Perform cluster analysis

In [9]:
// Clustering settings: number of clusters and maximum number of iterations
var numClusters = 5
var numIterations = 10
// Fit k-means through the builder API; every option not set here keeps its default
var model = new KMeans()
    .setK(numClusters)
    .setMaxIterations(numIterations)
    .run(customer)

In [10]:
// Column-wise statistics of the clustering input
var summary: MultivariateStatisticalSummary = Statistics.colStats(customer)
// Printed in order: mean of each column (dense vector), variance of each column,
// number of non-zero entries in each column
Seq(summary.mean, summary.variance, summary.numNonzeros).foreach(println)

[37.50162161244346,16.75122400946883,22.967569991222263,10.57693576446939,7.422499825251994,4.7801487971440695]
[615.9146444195787,127.82138859862337,214.40346147769546,127.84838545306285,65.16960087476069,19.057762567618333]
[440.0,440.0,440.0,440.0,440.0,440.0]


In [11]:
// Print the center of every cluster, one vector per line
for (center <- model.clusterCenters) println(center)

[8.375697828618963,22.026732447718974,44.50969851443971,3.48770819620745,18.10130885368478,3.4988541593301252]
[68.22327009170908,7.529268334515726,9.9820728720439,8.899635555971777,1.771718010102582,3.594035135656924]
[20.922422478450688,6.459196238665622,6.403224000895555,10.634725176312548,53.30795925221091,2.2724728534646816]
[18.627577708406868,30.883450942281087,26.529841964781834,7.975970618172758,8.4268569445827,7.556301821774747]
[41.502299868744515,12.422263716059133,17.645115568302394,19.30348411708532,4.098627393298478,5.028209336510159]


In [12]:
// Assign every feature vector to the index of its cluster
var cusclu: RDD[Int] = model.predict(customer)

In [13]:
// Show the cluster index assigned to the first five records
for (clusterId <- cusclu.take(5)) println(clusterId)

3
3
3
4
4


In [14]:
// Export the fitted model as a PMML document and print it
{
    val pmmlXml: String = model.toPMML()
    println("PMML Model:\n" + pmmlXml)
}

PMML Model:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PMML version="4.2" xmlns="http://www.dmg.org/PMML-4_2">
    <Header description="k-means clustering">
        <Application name="Apache Spark MLlib" version="1.6.0"/>
        <Timestamp>2016-06-16T09:07:35</Timestamp>
    </Header>
    <DataDictionary numberOfFields="6">
        <DataField name="field_0" optype="continuous" dataType="double"/>
        <DataField name="field_1" optype="continuous" dataType="double"/>
        <DataField name="field_2" optype="continuous" dataType="double"/>
        <DataField name="field_3" optype="continuous" dataType="double"/>
        <DataField name="field_4" optype="continuous" dataType="double"/>
        <DataField name="field_5" optype="continuous" dataType="double"/>
    </DataDictionary>
    <ClusteringModel modelName="k-means" functionName="clustering" modelClass="centerBased" numberOfClusters="5">
        <MiningSchema>
            <MiningField name="field_0" usageType="activ