# R: Explore Sales Data from Enterprise Systems through Machine Learning

## R implementation for association rules mining (arules package)

Initializing libraries

In [None]:
# ---- Brunel: Python side ----
# Upgrade the Brunel pip package for the current user. try() keeps the cell
# going if the shell command fails.
try(
  system(
    "pip install brunel --upgrade --user",
    intern = TRUE,
    ignore.stderr = TRUE
  )
)

# ---- Brunel: R bindings ----
# devtools supplies install_github(), used for the GitHub-hosted packages.
install.packages("devtools")

# NOTE(review): the output recorded for this cell shows a 404 when downloading
# ref "master2.1" -- confirm that this ref still exists in the repository.
devtools::install_github(
  "Brunel-Visualization/Brunel",
  subdir = "R",
  ref = "master2.1",
  force = TRUE
)

# ---- Machine-learning packages ----
# Installed one at a time, in this order: arules, then C50.
invisible(lapply(c("arules", "C50"), install.packages))

# ---- Model-to-visualization helper package ----
devtools::install_github(
  "ibmdataworks/datafirst",
  subdir = "datascientist/machinelearning/sigmoml",
  force = TRUE
)

Installing package into ‘/gpfs/global_fs01/sym_shared/YPProdSpark/user/sce6-e6b1c3357c3b2f-e6c62d2acbd3/R/libs’
(as ‘lib’ is unspecified)
Downloading GitHub repo Brunel-Visualization/Brunel@master2.1
from URL https://api.github.com/repos/Brunel-Visualization/Brunel/zipball/master2.1
Installation failed: 404: Not Found
 (404)

Installing package into ‘/gpfs/global_fs01/sym_shared/YPProdSpark/user/sce6-e6b1c3357c3b2f-e6c62d2acbd3/R/libs’
(as ‘lib’ is unspecified)


In [None]:
library(arules)
library(plyr)
library(ibmdbR)

In [None]:
# NOTE(review): devtools is already installed by the first cell of this
# notebook; this repeated install looks redundant -- confirm before removing.
install.packages("devtools")

Loading the data from the object storage

Data wrangling with R: putting all the purchased items into a single transaction

# Insert a new cell with your dashDB connection here

In [6]:
# Build one transaction per customer order: the set of distinct product lines
# bought in that order. Reads the data frame `data.df.2` (presumably created by
# the connection cell above) and uses its CUST_ORDER_NUMBER and PRODUCT_LINE
# columns. Produces `transactions`, an unnamed list of character vectors.

# Keep only the two relevant fields and drop repeated (order, product line) pairs
ordersDF <- unique(
  data.df.2[, c("CUST_ORDER_NUMBER", "PRODUCT_LINE"), drop = FALSE]
)
# A row with a missing order number or product line cannot be part of a transaction
ordersDF <- ordersDF[complete.cases(ordersDF), , drop = FALSE]
# Sort so the items of each order come out in a stable order
ordersDF <- ordersDF[order(ordersDF$CUST_ORDER_NUMBER, ordersDF$PRODUCT_LINE), , drop = FALSE]
# Group the product lines by order. Splitting the column directly, instead of
# pasting the items with "," and splitting the string again, keeps product-line
# names that contain a comma intact.
ordersDF <- split(
  as.character(ordersDF$PRODUCT_LINE),
  ordersDF$CUST_ORDER_NUMBER,
  drop = TRUE
)

transactions <- unname(ordersDF)
# Keep only the orders with more than one distinct product line
transactions <- transactions[lengths(transactions) > 1]
print('Sample of transactions ready for Apriori algorithm')
head(transactions)


[1] "Sample of transactions ready for Apriori algorithm"


Transforming data according to arules requirements and applying Apriori algorithm

In [7]:
# Coerce the list of item vectors built above into an object of class
# "transactions", the input type passed to apriori() in the next cell.
transactions <- as(transactions, "transactions")

In [8]:
library(arules)

# Mine association rules from the transactions: minimum support 0.05, minimum
# confidence 0.2, and at least two items per rule (minlen = 2).
# The parameter is spelled out as `confidence` rather than relying on partial
# matching of `conf`.
rulesMod <- apriori(
  transactions,
  parameter = list(support = 0.05, confidence = 0.2, minlen = 2)
)
# Order the rules by lift, highest first. TRUE is used instead of T because T
# is an ordinary variable that can be reassigned.
rulesMod <- sort(rulesMod, decreasing = TRUE, by = "lift")

inspect(rulesMod)

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.2    0.1    1 none FALSE            TRUE       5    0.05      2
 maxlen target   ext
     10  rules FALSE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 322 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[5 item(s), 6443 transaction(s)] done [0.00s].
sorting and recoding items ... [5 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [12 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
     lhs                           rhs                        support   
[1]  {Golf Equipment}           => {Personal Accessories}     0.09514201
[2]  {Outdoor Protection}       => {Camping Equipment}        0.20037250
[3]  {Camping Equipment}        => {Outdoor Protection}       0.200372

## Visualizing association rules

Brunel-based visualization

In [9]:
# Transform the mined rules into a data frame for plotting. The result keeps
# the columns produced by the coercion and adds two character columns, LHS and
# RHS, holding the two sides of each rule.
rulesSet <- as(rulesMod, "data.frame")

# Strip the curly braces from the rule text, e.g. "{A} => {B}" becomes "A => B"
rulesSet$rules <- gsub("\\{|\\}", "", rulesSet$rules)

# Split each rule at the arrow and trim the surrounding blanks. Without the
# trim the two sides come out as "A " and " B", so the same item would carry
# different labels on the left-hand and right-hand side.
ruleSides <- strsplit(as.character(rulesSet$rules), "=>", fixed = TRUE)
rulesSet$LHS <- trimws(vapply(ruleSides, `[`, character(1), 1L))
rulesSet$RHS <- trimws(vapply(ruleSides, `[`, character(1), 2L))

In [10]:
library(brunel)

# Chord chart of the association rules in `rulesSet`, as described by the
# Brunel spec string: LHS on x, RHS on y, color by confidence, size by support,
# and a tooltip listing rule text, support, confidence and lift.
brunel (" data('rulesSet') chord x(LHS) y(RHS) color(confidence) size(support) tooltip(rules, support, confidence, lift)",
        width=600, height=400, online_js=TRUE)

## Decision tree-based classification with SparkR and C5.0 library

Transforming the data for use in C5.0

In [11]:
# Prepare the data for training and classification.
# Copies `data.df.2` and converts its categorical fields to factors; the
# result stays a data frame.
orderDetails <- data.df.2
categoricalFields <- c(
  "COUNTRY", "GENDER", "MARITAL_STATUS", "PROFESSION", "PRODUCT_LINE"
)
for (fieldName in categoricalFields) {
  orderDetails[[fieldName]] <- as.factor(orderDetails[[fieldName]])
}

set.seed(1234)
# Training data set.
# NOTE(review): this is the complete data set (no sampling or hold-out split
# happens in this cell), so the seed above has no visible effect here.
trainingData <- orderDetails


Training the classification model: building the model using the C50 library

In [12]:
library(C50)

# Fit a C5.0 model for PRODUCT_LINE from the customer's age, gender,
# profession and marital status. rules = FALSE is passed explicitly, as in
# the original call.
classifierModel <- C5.0(
  PRODUCT_LINE ~ AGE + GENDER + PROFESSION + MARITAL_STATUS,
  data = orderDetails,
  rules = FALSE
)

# Uncomment for the detailed textual presentation of the model:
#summary(classifierModel)

### Visualizing the model

Transforming the model for visualization:

In [13]:
library(sigmoml)
library(partykit)
library(plyr)

# Convert the fitted C5.0 model into `rulesDataFrame` for the Brunel charts.
# NOTE(review): getRules() is defined in the sigmoml package, not in this
# notebook; presumably it returns a data frame with Level1..Level9, Prediction,
# Samples and Name columns, since the Brunel specs reference those -- confirm.
rulesDataFrame <- sigmoml::getRules(classifierModel)

Loading required package: grid

Attaching package: ‘grid’

The following object is masked from ‘package:SparkR’:

    explode



Graph representation

In [15]:
library(brunel)

# Treemap of `rulesDataFrame`, as described by the Brunel spec string: nested
# by Level1..Level9, colored by Prediction, sized by Samples, labeled with
# Name, with a filter on Prediction and a tooltip showing all fields.
brunel (" data('rulesDataFrame') treemap x(Level1, Level2, Level3, Level4, Level5, Level6, Level7, Level8, Level9) color(Prediction) size(Samples) label(Name) filter(Prediction) tooltip(#all)",
        width=1000, height=500, online_js=TRUE)

In [16]:
library(brunel)
# Tree diagram of `rulesDataFrame`, as described by the Brunel spec string:
# hierarchy from Level1..Level9, colored and labeled by Prediction, tooltip
# showing all fields, legends turned off.
# NOTE(review): this spec uses size(samples) in lower case while the treemap
# cell uses size(Samples) -- confirm which spelling matches the column name.
brunel (" data('rulesDataFrame') tree(prune:20) y(Level1, Level2, Level3, Level4, Level5, Level6, Level7, Level8, Level9) color(Prediction) label(Prediction) size(samples) tooltip(#all) legends(none)",
        width=1000, height=800, online_js=TRUE)

Native R visualization

In [None]:
library(repr)

# Set the notebook plot size to 136 x 8 (width x height).
# NOTE(review): a width of 136 is unusually large next to the height of 8 --
# confirm that it is intended and not a typo.
options(repr.plot.width = 136, repr.plot.height = 8)

# Draw the fitted decision tree with the model's plot method.
# The title previously read "Decison Tree"; the spelling is corrected here.
plot(
  classifierModel,
  type = "simple",
  main = "Decision Tree",
  drop_terminal = TRUE
)