# Data Mining
## Exercise 3 -  A-priori algorithm and association rules 

**Student: David Sánchez Marín**


###  Mining Frequent Itemsets - A-priori algorithm for k up to 2

In [1]:
import pyspark
import os
import math
import random
import sys

In [2]:
# Preliminary start-up code:
spark_home = os.environ.get('SPARK_HOME', None)
print ( spark_home )
sc = pyspark.SparkContext('local[*]')
print ( sc )

None
<SparkContext master=local[*] appName=pyspark-shell>


In [3]:
# From a transaction in a single string with items separated by white spaces
# generate a list with the items of the transaction
def parseTransaction( trans ):
    return trans.strip().split('\t')

# Compute the rdd with frequent singletonsets (L_1)
def computeL1 ( rddtrans, numtrans, theta ):
  rddtemp = rddtrans.flatMap( lambda trans : [ (it,1) for it in trans ] ).reduceByKey( lambda a,b : a+b  )
  rddtemp2 = rddtemp.filter( lambda x: x[0] != '')

  rddL1 = rddtemp2.filter( lambda x : (float(x[1])/numtrans) >= theta )
  return rddL1

# This is a function to map transactions to transactions with only items in L1,
# something that can increase the efficiency when computing L2
def filterOutL1( transseq, L1 ):
    for trans in transseq:
       yield [ it for it in trans if (it in L1) ]


In [4]:
def generateC2( seqoftransactions,  L1  ):
    
    for trans1 in seqoftransactions:     
        itemsetlist = []
        if len(trans1) > 1:
            trans2 = trans1.copy()
            for it1 in trans1:
                trans2.remove(it1)
                for it2 in trans2:
                    pair = (it1, it2)
                    itemsetlist.append(pair)
        yield itemsetlist


In [5]:
# And finally, the function for computing L_2 from L_1 and the rdd with
# the set of transactions
# This function should use your previous generateC2 function to compute L2 from C2
#
def computeL2( rddtrans, numtrans, L1, theta, verbose ):
    #   Map the transactions in rddtrans to the set of candidate frquenquent pairs (C2) 
    #   using the previous function generateC2      
        
    candidatePairsrdd = rddtrans.mapPartitions( lambda trans: generateC2(trans, L1onlyitems) )
    if verbose:
        print ("\n  candidatePairsrdd: ", candidatePairsrdd.take(10))

    candidatePairsfilteredrdd = candidatePairsrdd.filter( lambda x: len(x) > 0 ).map( lambda x: x[0])
    if verbose:
        print ("\n  candidatePairsfilteredrdd: ", candidatePairsfilteredrdd.take(10))

    #  Count number of occurences in the transactions for  each pair in C2. 
    #  Use flatpMap and reduceByKey to get the final rdd
    #rddL2temp = candidatePairsfilteredrdd.flatMap( lambda trans : [ (it,1) for it in trans ] ).reduceByKey( lambda a,b : a+b  ) 
    rddL2temp = candidatePairsfilteredrdd.map( lambda it : (it, 1) ).reduceByKey( lambda a,b : a+b  ) 
    if verbose:
        print ("\n  rddL2temp: ", rddL2temp.take(10))
    
    # Finally, filter out from the previous rdd those pairs with frequency below theta
    rddL2 = rddL2temp.filter( lambda x : (float(x[1])/numtrans) >= theta ) 
    
    return rddL2

In [6]:
# Load information in Dataset

rddlines = sc.textFile( "../data/1979.csv").map(parseTransaction)
numtrans = rddlines.count()
print ("Number of registers: ", numtrans)
print(rddlines.take(5))

Number of registers:  430941
[['0', '19790101', '197901', '1979', '1979.0027', '', '', '', '', '', '', '', '', '', '', 'AFR', 'AFRICA', 'AFR', '', '', '', '', '', '', '', '1', '040', '040', '04', '1', '1', '9', '1', '9', '5.52631578947368', '', '', '', '', '0', '0', '0', '', '', '', '', '0', '0', '0', '', '', '', '', '0', '0', '', '20130203'], ['1', '19790101', '197901', '1979', '1979.0027', '', '', '', '', '', '', '', '', '', '', 'AGR', 'FARMER', '', '', '', '', '', 'AGR', '', '', '1', '030', '030', '03', '1', '4', '10', '1', '10', '10.9792284866469', '', '', '', '', '0', '0', '0', '1', 'Nigeria', 'NI', 'NI', '10', '8', '0', '1', 'Nigeria', 'NI', 'NI', '10', '8', '0', '20130203'], ['2', '19790101', '197901', '1979', '1979.0027', '', '', '', '', '', '', '', '', '', '', 'AGR', 'FARMER', '', '', '', '', '', 'AGR', '', '', '1', '100', '100', '10', '3', '-5', '10', '1', '10', '10.9792284866469', '', '', '', '', '0', '0', '0', '1', 'Nigeria', 'NI', 'NI', '10', '8', '0', '1', 'Nigeria', 'NI'

In [7]:
# Select fields
rddtrans = rddlines.map(lambda x: [x[15],x[27],x[44]])
rddtrans.take(10)

[['AFR', '040', ''],
 ['AGR', '030', 'NI'],
 ['AGR', '100', 'NI'],
 ['CHN', '043', 'CH'],
 ['COP', '190', 'US'],
 ['CVL', '046', 'IS'],
 ['EGYEDU', '040', ''],
 ['GOV', '020', 'TU'],
 ['GOV', '040', 'CH'],
 ['GOV', '040', 'TW']]

In [8]:
# Generate L1 and L2 with differents theta values

thetaList = [0.0100, 0.0075, 0.0050]

for theta in thetaList:
    print ("*******************************************************************************************************")
    print ("                                THETA = ", theta)
    print ("*******************************************************************************************************\n")
    # First, let's compute the frequent singleton sets
    #       
    rddL1 = computeL1( rddtrans, float(numtrans), theta  )
    L1 = rddL1.collect()
    print ("L1 singleton with frequency information: \n", L1 )

    # Compute a version without the frequency information:
    #
    L1onlyitems = rddL1.map( lambda it : it[0] ).collect()
    print ( "\nL1 only items: \n", L1onlyitems )

    if theta == 0.010:
        # Show details of calculation on first loop
        verbose = 1
    else:
        verbose = 0
    
    # Next, compute frequent pairs (L2) from frequent items (L1)
    # We  need to work with the collected back version (to the driver) of the L1 set
    # (This will be Ok as far as L1 is small enough to fit in the memory
    # of every single machine ). We use mapPartitions to distribute only one function call per partition,
    # avoiding the overhead of executing many functions with the parameter L1
    #
    L1filteredtransrdd = rddtrans.mapPartitions( lambda transseq : filterOutL1( transseq, L1onlyitems )  )

    # Let's check if the filtered transactions are correct:
    #
    print ("\nTransactions with only L1 items : \n", L1filteredtransrdd.take(10) )


    # Next, compute the frequent pairs (L_2)
    #
    rddL2 = computeL2( L1filteredtransrdd, float(numtrans), L1onlyitems, theta, verbose )
    print ( "\nL2 Pairs with frequency information : ", rddL2.collect() )

    L2onlyitems = rddL2.map( lambda it : it[0] ).collect()
    print ( "\nL2 only items: ", L2onlyitems )
    

*******************************************************************************************************
                                THETA =  0.01
*******************************************************************************************************

L1 singleton with frequency information: 
 [('CH', 25151), ('190', 17254), ('051', 21278), ('LEG', 4850), ('112', 7218), ('USA', 19484), ('036', 17954), ('111', 8545), ('110', 4808), ('040', 30785), ('US', 32488), ('CVL', 4575), ('042', 50824), ('ZI', 5869), ('057', 10552), ('RS', 20365), ('SF', 5247), ('VNM', 12557), ('120', 8467), ('FR', 6313), ('013', 6065), ('GBR', 6890), ('043', 48844), ('046', 25938), ('IS', 14897), ('GOV', 29969), ('020', 17660), ('IR', 16888), ('CB', 4996), ('MIL', 7756), ('VM', 17120), ('173', 8070), ('010', 25975), ('030', 7116), ('CHN', 15506), ('PL', 5223), ('IRN', 8552), ('JA', 5558), ('KHM', 4727), ('EG', 9533), ('RUS', 11368), ('080', 4396), ('050', 5186), ('EGY', 6679), ('ISR', 8928), ('012', 5941), ('G

## Finding Association Rules

In [9]:
#
#  Given a:
#    - a frequent pair with its support information  like (('t', 'x'), 3)
#    - an item j from the pair like  't'
#    - the number of transactions
#    - a list with the L1 information computed with the function computeL1 and then collected back
#    to the driver
#
#   Compute the confidence and interest of the rule  freqitemset-{j} ->  {j}
#   Return it with the format:  ( 'freqitemset-{j} ->  j', confidence, interest )
def compConfidenceAndInterestForRule( freqpairWithFreq, j, numtrans, L1withfreqinfo ):
    #
    # INSERT YOUR SOLUTION HERE
    #
    print ("PARAMETERS:")
    print ("freqpairWithFreq: ", freqpairWithFreq )
    print ("j: ", j)
    print ("numtrans: ", numtrans)
    print ("L1withfreqinfo: ", L1withfreqinfo[:10])

    premise = freqpairWithFreq[0]
    L1filtered = [item for item in L1withfreqinfo if item[0] == j]
    
    supportL1 = L1filtered[0][1]
    supportL2 = freqpairWithFreq[1]

    print ("rddL1filtered: ", L1filtered)
    print ("supportL1: ", supportL1)
    print ("supportL2: ", supportL2)
    
    confidence = supportL2 / supportL1
    freqj = supportL1 / numtrans
    interest = confidence - freqj
    return ( str(premise)+" -> "+str(j), confidence, interest  )

In [10]:
#
#    Map each freqpair of a partition, to its SET of different association rules
#    together with their confidence and interest
#
#    This function should use the previous compConfidenceAndInterestForRule for each
#    association rule generated
#
def genAssocRulesForPartition( fpair, numtrans, L1withfreqinfo ):
    
        rulesforpair = []

        confAndInt = compConfidenceAndInterestForRule(fpair, fpair[0][0], numtrans, L1withfreqinfo)
        rulesforpair.append(confAndInt)

        confAndInt = compConfidenceAndInterestForRule(fpair, fpair[0][1], numtrans, L1withfreqinfo)
        rulesforpair.append(confAndInt)
        
        return  rulesforpair

In [17]:
#
#   Finally, map all the frequenitemsets of all the partitions of the input
#   rddfreqsets using the previous function, and finally collect back all
#   the association rules obtained, but show them on the screen ordered by
#   descending order, using their interest to orden them
#
def mapFreqItemSetsToAssocRules( rddfreqsets, numtrans, L1withfreqinfo ):
    # Compute an rdd with the assotiation rules for the freqsets, calling the previous function
    # for each partition of the rddfreqsets 
    
    print ("freqpairWithFreq: ", freqpairWithFreq)
    print ("numtrans: ", numtrans)
    
    print ("\nConfidence and Interest of association rules:")
    rddConfAndInt = rddfreqsets.map( lambda x: genAssocRulesForPartition(x, numtrans, L1withfreqinfo))
    print (rddConfAndInt.take(10))
    
    # Next, get a "flat" version of the previous set of rules (one rule per element of the RDD)
    #   INSERT CODE HERE
    print ("\nFlat map of association rules:")
    rddConfAndIntFlat = rddConfAndInt.flatMap(lambda x: x)
    print (rddConfAndIntFlat.take(10))    
    
    # Next, collect back the obtained association rules, but sort them in descending order by their interest
    # You can either sort in a new rdd and then collect back the sorted rdd, or sort them in the driver after
    # collecting back the set of association rules.
    #
    #   INSERT YOUR SOLUTION HERE
    print ("\nTop association rules")
    topInterest = rddConfAndIntFlat.takeOrdered(10, key = lambda x: -x[2])
    for it in topInterest:
        print (it)

    return topInterest
    

In [18]:
# Test of first pair

freqpairWithFreq = rddL2.take(1)[0]
j = freqpairWithFreq[0][0]

confAndInt = compConfidenceAndInterestForRule(freqpairWithFreq, j, numtrans, L1)
print ("confAndInt: ", confAndInt)

PARAMETERS:
freqpairWithFreq:  (('USA', '042'), 2703)
j:  USA
numtrans:  430941
L1withfreqinfo:  [('AFR', 3828), ('CH', 25151), ('190', 17254), ('TU', 3825), ('051', 21278), ('LEG', 4850), ('112', 7218), ('TZ', 2210), ('USA', 19484), ('036', 17954)]
rddL1filtered:  [('USA', 19484)]
supportL1:  19484
supportL2:  2703
confAndInt:  ("('USA', '042') -> USA", 0.13872921371381647, 0.09351652798653594)


In [19]:
# Calculation of Top Association rules with confidences and interest

rddTopInterest = mapFreqItemSetsToAssocRules( rddL2, numtrans, L1)


freqpairWithFreq:  (('USA', '042'), 2703)
numtrans:  430941

Confidence and Interest of association rules:
[[("('USA', '042') -> USA", 0.13872921371381647, 0.09351652798653594), ("('USA', '042') -> 042", 0.053183535337635764, -0.06475372289492973)], [("('GOV', '043') -> GOV", 0.17191097467382963, 0.10236781215274206), ("('GOV', '043') -> 043", 0.10547866677585784, -0.007863994980623915)], [("('GOV', '046') -> GOV", 0.09369682004738229, 0.024153657526294725), ("('GOV', '046') -> 046", 0.10825815405968078, 0.04806894022298388)], [("('CHN', '042') -> CHN", 0.17419063588288405, 0.1382089121666445), ("('CHN', '042') -> 042", 0.05314418385014954, -0.06479307438241594)], [("('CHN', '043') -> CHN", 0.14110666838643107, 0.1051249446701915), ("('CHN', '043') -> 043", 0.04479567602980919, -0.06854698572667256)], [("('GOV', '040') -> GOV", 0.11688745036537755, 0.047344287844289984), ("('GOV', '040') -> 040", 0.11378918304369011, 0.04235248985367106)], [("('GOV', '042') -> GOV", 0.08205145316827388