# Wholesale customer segmentation with percentage preprocessing

### Importing libraries

In [1]:
import pyspark.mllib.clustering
from pyspark.mllib.clustering import KMeans
import pyspark.mllib.linalg
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import MultivariateStatisticalSummary
from pyspark.mllib.stat import Statistics
from pyspark import SparkConf, SparkContext

### Read data from source

In [2]:
# Init spark context
try:
    sc
except NameError:
    conf = SparkConf()
    sc = SparkContext(conf=conf)
else:
    print('sc already exits')

In [3]:
tmp =sc.textFile("data/customer.csv")
# Remove header
header = tmp.first()
data = tmp.filter(lambda x : x!=header)
for r in data.take(5):
    print(r)
data.count()

2,3,12669,9656,7561,214,2674,1338
2,3,7057,9810,9568,1762,3293,1776
2,3,6353,8808,7684,2405,3516,7844
1,3,13265,1196,4221,6404,507,1788
2,3,22615,5410,7198,3915,1777,5185


440

### Elaboration

In [4]:
# Define function to apply to each row
def elab(row):
    splitted = row.split(",")
    l = []
    
    # Converstion to integer
    for v in splitted:
        l.append(int(v))
    
    # Get total income for each channel in every region
    tot = 0
    for v in l[2:]:
        tot += v
    
    # Calculate percentage income for each category
    perc = []
    for v in l[2:]:
        perc.append(float(v) * 100 / tot)
    
    return (l[0],l[1],perc[0],perc[1],perc[2],perc[3],perc[4],perc[5],tot)

In [5]:
# Apply function
DataElab = data.map(elab)

In [6]:
# Print Output
ArrHeader = header.split(",")
for r in DataElab.take(10):
    s = (
        ArrHeader[0] + ":" + str(r[0]) + " " +
        ArrHeader[1] + ":" + str(r[1]) + " " +
        ArrHeader[2] + ":" +  "%.1f" % r[2] + "% " +
        ArrHeader[3] + ":" +  "%.1f" % r[3] + "% " +
        ArrHeader[4] + ":" +  "%.1f" % r[4] + "% " +
        ArrHeader[5] + ":" +  "%.1f" % r[5] + "% " +
        ArrHeader[6] + ":" +  "%.1f" % r[6] + "% " +
        ArrHeader[7] + ":" +  "%.1f" % r[7] + "% " +
        "Tot:" + str(r[8])
    )
    print(s)

Channel:2 Region:3 Fresh:37.1% Milk:28.3% Grocery:22.2% Frozen:0.6% Detergents_Paper:7.8% Delicassen:3.9% Tot:34112
Channel:2 Region:3 Fresh:21.2% Milk:29.5% Grocery:28.8% Frozen:5.3% Detergents_Paper:9.9% Delicassen:5.3% Tot:33266
Channel:2 Region:3 Fresh:17.4% Milk:24.1% Grocery:21.0% Frozen:6.6% Detergents_Paper:9.6% Delicassen:21.4% Tot:36610
Channel:1 Region:3 Fresh:48.4% Milk:4.4% Grocery:15.4% Frozen:23.4% Detergents_Paper:1.9% Delicassen:6.5% Tot:27381
Channel:2 Region:3 Fresh:49.1% Milk:11.7% Grocery:15.6% Frozen:8.5% Detergents_Paper:3.9% Delicassen:11.2% Tot:46100
Channel:2 Region:3 Fresh:35.2% Milk:30.9% Grocery:19.2% Frozen:2.5% Detergents_Paper:6.7% Delicassen:5.4% Tot:26710
Channel:2 Region:3 Fresh:45.8% Milk:12.1% Grocery:26.4% Frozen:1.8% Detergents_Paper:11.9% Delicassen:2.1% Tot:26465
Channel:2 Region:3 Fresh:25.7% Milk:16.8% Grocery:31.9% Frozen:5.7% Detergents_Paper:11.3% Delicassen:8.7% Tot:29517
Channel:1 Region:3 Fresh:31.9% Milk:19.5% Grocery:33.1% Frozen:2.3% 

In [7]:
# Prepare input for Clustering
customer = DataElab.map(lambda t:Vectors.dense(t[2],t[3],t[4],t[5],t[6],t[7]))

In [8]:
customer.take(5)

[DenseVector([37.1394, 28.3068, 22.1652, 0.6273, 7.8389, 3.9224]),
 DenseVector([21.2139, 29.4896, 28.7621, 5.2967, 9.899, 5.3388]),
 DenseVector([17.3532, 24.059, 20.9888, 6.5692, 9.6039, 21.4258]),
 DenseVector([48.446, 4.368, 15.4158, 23.3885, 1.8516, 6.5301]),
 DenseVector([49.0564, 11.7354, 15.6139, 8.4924, 3.8547, 11.2473])]

### Perform cluster analysis

In [9]:
numClusters = 5
numIterations = 10
model = KMeans.train(customer,numClusters,numIterations)

In [10]:
summary = Statistics.colStats(customer)
print(summary.mean()) # a dense vector containing the mean value for each column
print(summary.variance()) # column-wise variance
print(summary.numNonzeros()) # number of nonzeros in each column

[ 37.50162161  16.75122401  22.96756999  10.57693576   7.42249983
   4.7801488 ]
[ 615.91464442  127.8213886   214.40346148  127.84838545   65.16960087
   19.05776257]
[ 440.  440.  440.  440.  440.  440.]


In [11]:
for cc in model.clusterCenters:
    print(cc)

[ 72.54522884   5.65449568   8.0523073    9.26086246   1.26714443
   3.21996128]
[  8.21002176  24.53175011  42.73085515   3.66602478  17.17414329
   3.68720491]
[ 51.334056    13.00443804  17.52094303   9.44898919   3.87477459
   4.81679915]
[ 23.39303122  27.49066126  24.58413389   8.13247223   8.28761663
   8.11208476]
[ 33.42281643   9.48230723  14.06525948  35.84860446   2.69771329
   4.48329911]


In [12]:
cusclu = model.predict(customer)

In [13]:
for clu in cusclu.take(5):
    print(clu)

3
3
3
2
2


In [14]:
print("PMML Model:\n" + model.toPMML())

AttributeError: 'KMeansModel' object has no attribute 'toPMML'