# Task 4: Sequential Pattern Mining

In [1]:
import numpy as np
import pandas as pd
import copy

## Loading the new dataset

In [2]:
# Read the tab-separated supermarket file; the first column of the file becomes the row index.
df = pd.read_csv('../dataset/new_customer_supermarket.csv', sep='\t', index_col=0)
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale
0,539993,2011-04-01 10:00:00,1.95,13313.0,22386,JUMBO BAG PINK POLKADOT,10,19.50
1,539993,2011-04-01 10:00:00,0.42,13313.0,21499,BLUE POLKADOT WRAP,25,10.50
2,539993,2011-04-01 10:00:00,0.42,13313.0,21498,RED RETROSPOT WRAP,25,10.50
3,539993,2011-04-01 10:00:00,2.10,13313.0,22379,RECYCLING BAG RETROSPOT,5,10.50
4,539993,2011-04-01 10:00:00,1.25,13313.0,20718,RED RETROSPOT SHOPPER BAG,10,12.50
...,...,...,...,...,...,...,...,...
363572,581587,2011-09-12 12:50:00,0.85,12680.0,22613,PACK OF SPACEBOY NAPKINS,12,10.20
363573,581587,2011-09-12 12:50:00,2.10,12680.0,22899,CHILDRENS APRON DOLLY GIRL,6,12.60
363574,581587,2011-09-12 12:50:00,4.15,12680.0,23254,CHILDRENS CUTLERY DOLLY GIRL,4,16.60
363575,581587,2011-09-12 12:50:00,4.15,12680.0,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,16.60


In [3]:
# Inspect the column types inferred by read_csv.
df.dtypes

BasketID        int64
BasketDate     object
Sale          float64
CustomerID    float64
ProdID         object
ProdDescr      object
Qta             int64
TotSale       float64
dtype: object

In [4]:
# Parse the basket timestamps and store the two identifier columns as generic objects.
# The unit is spelled out ('datetime64[ns]') because recent pandas releases reject the
# unit-less 'datetime64' alias when it is passed to astype().
df = df.astype({'BasketDate': 'datetime64[ns]',
                'BasketID': 'object',
                'CustomerID': 'object'})

## AprioriAll

In [5]:
# Toy sequence database. Each sequence maps a timestamp to the event
# (list of item strings) observed at that timestamp.
timestamped_dataset = [
    {1: ['a'],
     2: ['a', 'b', 'c'],
     3: ['a', 'c'],
     4: ['c']},
    {1: ['a'],
     3: ['c'],
     4: ['b', 'c']},
    {1: ['a', 'b'],
     2: ['d'],
     4: ['c'],
     5: ['b'],
     6: ['c']},
    {1: ['a'],
     2: ['c'],
     3: ['b'],
     4: ['c']}
]

# The mining functions in this notebook access a sequence by position
# (`main_sequence[i]`) and iterate it as a list of itemsets, so they cannot work on
# the timestamp-keyed dicts directly. `dataset` is therefore the same data as plain
# lists of events ordered by timestamp.
dataset = [[events[timestamp] for timestamp in sorted(events)]
           for events in timestamped_dataset]

In [5]:
"""
Recursive method that checks if `subsequence` is a subsequence of `main_sequence`
"""
def is_subsequence(main_sequence, subsequence):
    """Check whether `subsequence` is contained in `main_sequence`.

    Both arguments are sequences of itemsets. The itemsets of `subsequence` must be
    matched, in order, by distinct itemsets of `main_sequence`, where an itemset is
    matched by any itemset that is a superset of it. Neither argument is modified.

    Returns:
        True if every itemset of `subsequence` could be matched, False otherwise.
        An empty `subsequence` is always contained.
    """
    position = 0
    total = len(main_sequence)
    for wanted in subsequence:
        wanted_items = set(wanted)
        # Greedy scan: take the earliest itemset that covers the wanted items.
        while position < total and not set(main_sequence[position]).issuperset(wanted_items):
            position += 1
        if position == total:
            return False
        # The next itemset has to be matched strictly after this one.
        position += 1
    return True

In [7]:
# Example sequence of four itemsets used to try out is_subsequence.
sequence = [['a'], ['b', 'c'], ['d'], ['a', 'e']]

In [8]:
# ['a'], ['b', 'c'] and ['e'] are contained, in this order, in the 1st, 2nd and 4th itemsets of `sequence`.
is_subsequence(sequence, [['a'], ['b', 'c'], ['e']])

True

In [9]:
# No itemset of `sequence` contains both 'b' and 'd', so this is not a subsequence.
is_subsequence(sequence, [['a'], ['b', 'd']])

False

In [6]:
"""
Computes the length of the sequence (sum of the length of the contained itemsets)
"""
def sequence_length(sequence):
    """Return the number of items in `sequence`, summed over all of its itemsets."""
    return sum(map(len, sequence))

In [11]:
# 1 + 2 + 1 + 3 items
sequence_length([['a'], ['b', 'c'], ['a'], ['b', 'c', 'd']])

7

In [7]:
"""
Computes the support of a sequence in a dataset
"""
def count_support(dataset, cand_seq):
    """Count the sequences of `dataset` that contain `cand_seq` as a subsequence.

    Each data sequence contributes at most 1 to the count, however many times the
    candidate could be matched inside it.
    """
    support = 0
    for data_sequence in dataset:
        if is_subsequence(data_sequence, cand_seq):
            support += 1
    return support

In [13]:
# Support of the one-item sequence <{b}> in the example dataset.
count_support(dataset, [['b']])

4

In [14]:
# Support of <{a} {b, c}>: 'a' followed later by an event containing both 'b' and 'c'.
count_support(dataset, [['a'], ['b', 'c']])

2

In [8]:
"""
Generates one candidate of length k from two candidates of length (k-1)
"""
def gen_cands_for_pair(cand1, cand2):
    """Join two sequences of length (k-1) into one candidate of length k.

    The pair can be joined when `cand1` without its leftmost item equals `cand2`
    without its rightmost item. The candidate is then `cand1` extended by the
    rightmost item of `cand2`: as a new itemset if that item was alone in the last
    itemset of `cand2`, otherwise added to the last itemset of `cand1`.

    Args:
        cand1: sequence (list of itemsets) providing the prefix of the candidate
        cand2: sequence (list of itemsets) providing the last item of the candidate

    Returns:
        The joined candidate as a new list of lists, or [] if the pair cannot be
        joined. Neither input is modified and the result shares no lists with them.
    """
    # cand1 without its leftmost item
    head_dropped = copy.deepcopy(cand1)
    if len(head_dropped[0]) == 1:
        head_dropped.pop(0)
    else:
        head_dropped[0] = head_dropped[0][1:]

    # cand2 without its rightmost item
    tail_dropped = copy.deepcopy(cand2)
    if len(tail_dropped[-1]) == 1:
        tail_dropped.pop(-1)
    else:
        tail_dropped[-1] = tail_dropped[-1][:-1]

    # The two remainders must coincide, otherwise there is nothing to join.
    if head_dropped != tail_dropped:
        return []

    new_cand = copy.deepcopy(cand1)
    last_item = cand2[-1][-1]
    if len(cand2[-1]) == 1:
        # The last item of cand2 forms its own event: add it as a fresh itemset.
        new_cand.append([last_item])
    else:
        # append, not extend: extend() would iterate over the item string and add
        # every character of a multi-character item as a separate item.
        new_cand[-1].append(last_item)
    return new_cand

In [16]:
# candA without its first item equals candB without its last item, so the pair joins;
# candB ends in a two-item itemset, so 'e' is added to the last itemset of candA.
candA = [['a'], ['b', 'c'], ['d']]
candB = [['b', 'c'], ['d', 'e']]
gen_cands_for_pair(candA, candB)

[['a'], ['b', 'c'], ['d', 'e']]

In [17]:
# candC ends in a single-item itemset, so ['e'] is appended to candA as a new itemset.
candA = [['a'], ['b', 'c'], ['d']]
candC = [['b', 'c'], ['d'], ['e']]
gen_cands_for_pair(candA, candC)

[['a'], ['b', 'c'], ['d'], ['e']]

In [18]:
# Dropping the first item of candA and the last item of candD leaves different sequences,
# so the pair cannot be joined and an empty list is returned.
candA = [['a'], ['b', 'c'], ['d']]
candD = [['a'], ['b', 'c'], ['e']]
gen_cands_for_pair(candA, candD)

[]

In [9]:
"""
Generates the set of candidates of length k from the set of frequent sequences with length (k-1)
"""
def gen_cands(last_lvl_cands):
    """Generate the candidates of length k from the frequent sequences of length (k-1).

    Args:
        last_lvl_cands: list of sequences (lists of itemsets) that all have the same
            length (k-1); k is derived from the first entry.

    Returns:
        A list of candidate sequences of length k. For k == 2 these are all
        same-event pairs [[a, b]] with b > a followed by all two-event pairs
        [[a], [b]]; for larger k it is the sorted list of all successful pairwise
        joins. An empty input yields an empty list.
    """
    # Nothing to extend: avoid indexing into an empty list below.
    if not last_lvl_cands:
        return []

    k = sequence_length(last_lvl_cands[0]) + 1
    if k == 2:
        # Level 1 sequences hold one item each; collect those items.
        items = [item for seq in last_lvl_cands for itemset in seq for item in itemset]
        # Both items in the same event (each unordered pair once) ...
        cands = [[[a, b]] for a in items for b in items if b > a]
        # ... and one item after the other (every ordered pair, repeats included).
        cands.extend([[a], [b]] for a in items for b in items)
        return cands

    cands = []
    for left in last_lvl_cands:
        for right in last_lvl_cands:
            joined = gen_cands_for_pair(left, right)
            if joined:  # an empty list means the pair could not be joined
                cands.append(joined)
    cands.sort()
    return cands

Let's assume we know the frequent sequences of level 2:

In [20]:
# Frequent sequences of length 2; each entry is a sequence given as a list of itemsets.
last_lvl_freq_patterns = [
    [['a', 'b']], 
    [['b', 'c']], 
    [['a'], ['b']], 
    [['a'], ['c']], 
    [['b'], ['c']], 
    [['c'], ['b']], 
    [['c'], ['c']]
]

Then we can generate the candidates for level 3:

In [21]:
# Join every pair of level-2 patterns to obtain the level-3 candidates.
new_cands = gen_cands(last_lvl_freq_patterns)
new_cands

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['a', 'b', 'c']],
 [['b'], ['c'], ['b']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['b']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [10]:
"""
Computes all direct subsequence for a given sequence.
A direct subsequence is any sequence that originates from deleting exactly one item from any event in the original sequence.
"""
def gen_direct_subsequences(sequence):
    """Return every sequence obtained by deleting exactly one item from `sequence`.

    Deleting the only item of a one-item itemset removes that itemset altogether.
    The results are independent deep copies; `sequence` itself is left untouched.
    """
    subsequences = []
    for pos, itemset in enumerate(sequence):
        if len(itemset) == 1:
            # The event would become empty, so leave it out completely.
            subsequences.append(copy.deepcopy(sequence[:pos] + sequence[pos + 1:]))
            continue
        for drop in range(len(itemset)):
            shortened = copy.deepcopy(sequence)
            del shortened[pos][drop]
            subsequences.append(shortened)
    return subsequences

"""
Prunes the set of candidates generated for length k given all frequent sequence of level (k-1)
"""
def prune_cands(cands_last_lvl, cands_gen):
    """Keep the candidates whose direct subsequences are all in `cands_last_lvl`.

    Args:
        cands_last_lvl: the frequent sequences of length (k-1)
        cands_gen: the generated candidates of length k

    Returns:
        The candidates of `cands_gen`, in their original order, for which every
        sequence returned by gen_direct_subsequences is present in `cands_last_lvl`.
    """
    surviving = []
    for cand in cands_gen:
        for sub in gen_direct_subsequences(cand):
            if sub not in cands_last_lvl:
                break  # one missing subsequence is enough to discard the candidate
        else:
            surviving.append(cand)
    return surviving

We apply this to the example dataset:

In [24]:
# Drop every level-3 candidate that has a direct subsequence missing from the level-2 patterns.
cands_pruned = prune_cands(last_lvl_freq_patterns, new_cands)
cands_pruned

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [11]:
def check_cands(dataset, cands_pruned, min_sup):
    """Count the support of each candidate and keep the frequent ones.

    Args:
        dataset: list of sequences in which the candidates are counted
        cands_pruned: candidate sequences to check
        min_sup: minimum support count a candidate needs to be kept

    Returns:
        A list of (candidate, count) tuples, in candidate order, for the candidates
        whose count is at least `min_sup`.
    """
    frequent = []
    for cand in cands_pruned:
        support = count_support(dataset, cand)
        if support >= min_sup:
            frequent.append((cand, support))
    return frequent

In [42]:
# Keep the pruned candidates that are supported by at least 2 sequences of the example dataset.
result_lvl = check_cands(dataset, cands_pruned, 2)
result_lvl

[([['a'], ['b'], ['c']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2)]

In [12]:
"""
The AprioriAll algorithm with time constraints. Computes the frequent sequences in a sequence dataset.

Args:
    dataset: a list of sequences, for which the frequent (sub-)sequences are computed
    min_sup: the minimum support that makes a sequence frequent
    max_span: this constraint specifies the maximum allowed time difference in days between the latest 
              and the earliest occurrences of events in the entire sequence
    min_gap: this constraint specifies the minimum allowed time difference in days between two 
             consecutive elements of the pattern instance
    max_gap: this constraint specifies the maximum allowed time difference in days between two 
             consecutive elements of the pattern instance
    verbose: verbosity level; if 1 (or True), the results of each level are printed; if greater 
             than 1, the candidates generated and pruned at each level are printed as well

Returns:
    A list of tuples (s, c), where s is a frequent sequence and c is the count for that sequence
"""
def aprioriall(dataset, min_sup, max_span=np.inf, min_gap=1, max_gap=np.inf, verbose=False):
    """Mine the frequent sequences of a sequence dataset level by level.

    Level 1 holds the frequent single items. Every following level is obtained from
    the previous one by candidate generation (gen_cands), candidate pruning
    (prune_cands) and support counting (check_cands), until a level comes out empty.

    Args:
        dataset: list of sequences; each sequence is a list of itemsets (lists of items)
        min_sup: minimum support count that makes a sequence frequent
        max_span: time-constraint parameter; this implementation never reads it, so it
            has no effect on the result
        min_gap: time-constraint parameter; never read by this implementation
        max_gap: time-constraint parameter; never read by this implementation
        verbose: verbosity level; above 0 the result of each level is printed, above 1
            the candidates generated and pruned at each level are printed as well

    Returns:
        A list of tuples (s, c), where s is a frequent sequence and c is its support
        count, ordered by level.
    """
    items = sorted({item for sequence in dataset for itemset in sequence for item in itemset})

    # Level 1: count each single-item sequence once (every count is a full dataset scan).
    first_level = []
    for item in items:
        single = [[item]]
        support = count_support(dataset, single)
        if support >= min_sup:
            first_level.append((single, support))

    overall = [first_level]
    if verbose > 0:
        print('Result, lvl 1: ' + str(overall[0]))

    k = 1
    # Stop as soon as the most recent level holds no frequent sequence.
    while overall[-1]:
        # 1. candidate generation
        cands_last_lvl = [seq for seq, _ in overall[-1]]
        cands_gen = gen_cands(cands_last_lvl)
        # 2. candidate pruning (all direct subsequences must be frequent)
        cands_pruned = prune_cands(cands_last_lvl, cands_gen)
        # 3. candidate checking
        result_lvl = check_cands(dataset, cands_pruned, min_sup)
        if verbose > 0:
            print('Result, lvl ' + str(k+1) + ': ' + str(result_lvl))
            if verbose > 1:
                print('Candidates generated, lvl ' + str(k+1) + ': ' + str(cands_gen))
                print('Candidates pruned, lvl ' + str(k+1) + ': ' + str(cands_pruned))
        overall.append(result_lvl)
        k += 1

    # The last level is the empty one that ended the loop; flatten the others.
    return [entry for level in overall[:-1] for entry in level]

In [54]:
# verbose=2 prints, for every level, the result and the candidates generated and pruned.
aprioriall(dataset, min_sup=2, verbose=2)

Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]
Result, lvl 2: [([['a', 'b']], 2), ([['b', 'c']], 2), ([['a'], ['b']], 4), ([['a'], ['c']], 4), ([['b'], ['c']], 3), ([['c'], ['b']], 3), ([['c'], ['c']], 4)]
Candidates generated, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Candidates pruned, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Result, lvl 3: [([['a'], ['b'], ['c']], 3), ([['a'], ['b', 'c']], 2), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'], ['c']], 4), ([['a', 'b'], ['c']], 2), ([['b'], ['c'], ['c']], 2), ([['c'], ['b'], ['c']], 2)]
Candidates generated, lvl 3: [[['a'], ['b'], ['c']], [['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c'], ['c']], [['a', 'b'], ['c']], [['a'

[([['a']], 4),
 ([['b']], 4),
 ([['c']], 4),
 ([['a', 'b']], 2),
 ([['b', 'c']], 2),
 ([['a'], ['b']], 4),
 ([['a'], ['c']], 4),
 ([['b'], ['c']], 3),
 ([['c'], ['b']], 3),
 ([['c'], ['c']], 4),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

In [13]:
"""
Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence
"""
def filter_closed(result):
    """Reduce `result` in place to the closed frequent sequences.

    An entry is dropped when `result` contains a different sequence with the same
    count of which it is a subsequence. The order of the remaining entries is kept.

    Args:
        result: list of (sequence, count) tuples; modified in place

    Returns:
        None
    """
    def is_absorbed(sequence, count):
        # Cheap comparisons first; the subsequence test only runs for equal counts.
        return any(
            other != sequence and other_count == count and is_subsequence(other, sequence)
            for other, other_count in result
        )

    # The new list is built completely before `result` is overwritten, so every
    # entry is judged against the full, unfiltered result.
    result[:] = [entry for entry in result if not is_absorbed(*entry)]

In [51]:
# filter_closed edits `result` in place and returns nothing, so `result` is displayed separately.
result = aprioriall(dataset, min_sup=2, verbose=False)
filter_closed(result)
result

[([['a'], ['b']], 4),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

In [14]:
"""
Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence
"""
def filter_maximal(result):
    """Reduce `result` in place to the maximal frequent sequences.

    An entry is dropped when it is a subsequence of a different sequence in
    `result`, whatever the counts are. The order of the remaining entries is kept.

    Args:
        result: list of (sequence, count) tuples; modified in place

    Returns:
        None
    """
    def has_supersequence(sequence):
        # Compare for inequality first so a sequence is never tested against itself.
        return any(
            other != sequence and is_subsequence(other, sequence)
            for other, _ in result
        )

    # The new list is built completely before `result` is overwritten, so every
    # entry is judged against the full, unfiltered result.
    result[:] = [entry for entry in result if not has_supersequence(entry[0])]

In [53]:
# The result is mined again because filter_closed modified the previous `result` in place.
result = aprioriall(dataset, min_sup=2, verbose=False)
filter_maximal(result)
result

[([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

### Supermarket dataset

In [15]:
# One row per (customer, basket, date) holding the list of product descriptions in that basket.
baskets_sequence = (
    df.sort_values('BasketDate')
      .groupby(['CustomerID', 'BasketID', 'BasketDate'])['ProdDescr']
      .apply(list)
      .to_frame()
)
baskets_sequence

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ProdDescr
CustomerID,BasketID,BasketDate,Unnamed: 3_level_1
12347.0,542237,2011-01-26 14:30:00,"[PACK OF SPACEBOY CAKE CASES, TEA TIME OVEN GL..."
12347.0,549222,2011-07-04 10:43:00,"[SET OF TINS VINTAGE BATHROOM, GIFT BAG PSYCHE..."
12347.0,556201,2011-09-06 13:01:00,"[ALARM CLOCK BAKELIKE RED, FEATHER PENCOAL BLA..."
12347.0,562032,2011-02-08 08:48:00,"[SWEETHEART FAIRY CAKE CASES, RED REFECTORY CL..."
12347.0,573511,2011-10-31 12:25:00,"[ALARM CLOCK BAKELIKE GREEN, ALARM CLOCK BAKEL..."
...,...,...,...
18283.0,579673,2011-11-30 12:59:00,"[LUNCH BAG DOLLY GIRL DESIGN, LUNCH BAG SUKI D..."
18283.0,580872,2011-06-12 12:02:00,"[STRAWBERRY SHOPPER BAG, RED RETROSPOT SHOPPER..."
18287.0,554065,2011-05-22 10:39:00,"[SET SPRING FLOWER DECORATION, CERAMIC CHERRY ..."
18287.0,570715,2011-12-10 10:23:00,"[SET FROG PRINCE TLIGHT CANDLES, ASSORTED MONK..."


In [16]:
# One row per customer: the customer's baskets, sorted by basket date, collected into a list.
baskets_sequence = (
    baskets_sequence.sort_values('BasketDate')
                    .groupby('CustomerID')['ProdDescr']
                    .apply(list)
                    .to_frame()
)
baskets_sequence

Unnamed: 0_level_0,ProdDescr
CustomerID,Unnamed: 1_level_1
12347.0,"[[PACK OF SPACEBOY CAKE CASES, TEA TIME OVEN G..."
12348.0,"[[PACK OF SKULL TISSUES, MULTI HEARTS STICKERS..."
12349.0,"[[PANTRY CHOPPING BOARD, SMALL WHITE RETROSPOT..."
12350.0,"[[BLUE POLKADOT PASSPORT COVER, RED HARMONICA ..."
12352.0,"[[DELUXE SEWING KIT, PINK HEART SHAPE EGG FRYI..."
...,...
18280.0,"[[SET OF ROUND TINS CAMEMBERT, KINGS CHOICE TE..."
18281.0,"[[ROBOT BIRTHDAY CARD, CARD CIRCUS PARADE, PEN..."
18282.0,"[[REGENCY SUGAR BOWL GREEN, CARD CHRISTMAS VIL..."
18283.0,"[[BLUE PIECE POLKADOT CUTLERY SET, TEA TIME OV..."


In [17]:
# Convert the column to a plain Python list (one list of baskets per customer)
# and show only the first customer's sequence.
baskets_sequence = baskets_sequence['ProdDescr'].tolist()
baskets_sequence[:1]

[[['PACK OF SPACEBOY CAKE CASES',
   'TEA TIME OVEN GLOVE',
   'RED RETROSPOT OVEN GLOVE',
   'RED RETROSPOT OVEN GLOVE DOUBLE',
   'SET RED RETROSPOT TEA TOWELS',
   'REGENCY CAKESTAND TIER',
   'TOOTHPASTE TUBE PEN',
   'MINI LADLE LOVE HEART RED',
   'CHOCOLATE CALCULATOR',
   'SET OF TINS VINTAGE BATHROOM',
   'RED TOADSTOOL LED NIGHT LIGHT',
   'DOG PICTURE PLAYING CARDS',
   'BOX OF ASSORTED COLOUR TEASPOONS',
   'TEATIME FAIRY CAKE CASES',
   'PACK OF MUSHROOM CAKE CASES',
   'SMALL HEART MEASURING SPOONS',
   'SWEETHEART FAIRY CAKE CASES',
   'BLUE NEW BAROQUE CANDLESTICK CANDLE',
   'BLACK CANDELABRA TLIGHT HOLDER',
   'WOODLAND CHARLOTTE BAG',
   'AIRLINE BAG VINTAGE JET SET BROWN',
   'AIRLINE BAG VINTAGE JET SET WHITE',
   'PINK NEW BAROQUECANDLESTICK CANDLE',
   'ALARM CLOCK BAKELIKE CHOCOLATE',
   'ALARM CLOCK BAKELIKE GREEN',
   'ALARM CLOCK BAKELIKE RED',
   'ALARM CLOCK BAKELIKE PINK',
   'ALARM CLOCK BAKELIKE ORANGE',
   'SANDWICH BATH SPONGE'],
  ['SWEETHEART FAIRY C

In [20]:
# Mine the customer sequences with a minimum support count of 150,
# then reduce `result` in place to the closed patterns.
result = aprioriall(baskets_sequence, min_sup=150, verbose=False)
filter_closed(result)
result

[([['ABC TREASURE BOOK BOX']], 157),
 ([['ALARM CLOCK BAKELIKE CHOCOLATE']], 163),
 ([['ALARM CLOCK BAKELIKE GREEN']], 305),
 ([['ALARM CLOCK BAKELIKE IVORY']], 232),
 ([['ALARM CLOCK BAKELIKE ORANGE']], 166),
 ([['ALARM CLOCK BAKELIKE PINK']], 277),
 ([['ALARM CLOCK BAKELIKE RED']], 372),
 ([['ANTIQUE SILVER TEA GLASS ENGRAVED']], 162),
 ([['ANTIQUE SILVER TLIGHT GLASS']], 346),
 ([['AREA PATROLLED METAL SIGN']], 172),
 ([['ASSORTED COLOUR BIRD ORNAMENT']], 651),
 ([['ASSORTED COLOUR MINI CASES']], 223),
 ([['ASSORTED COLOURS SILK FAN']], 187),
 ([['BAG SWIRLY MARBLES']], 313),
 ([['BAKING SET PIECE RETROSPOT']], 576),
 ([['BAKING SET SPACEBOY DESIGN']], 278),
 ([['BATH BUILDING BLOCK WORD']], 155),
 ([['BATHROOM METAL SIGN']], 218),
 ([['BEWARE OF THE CAT METAL SIGN']], 168),
 ([['BINGO SET']], 196),
 ([['BISCUIT TIN CHRISTMAS']], 230),
 ([['BISCUIT TIN VINTAGE CHRISTMAS']], 199),
 ([['BLACKBLUE POLKADOT UMBRELLA']], 169),
 ([['BLUE DINER WALL CLOCK']], 176),
 ([['BLUE HAPPY BIRTHDAY

In [21]:
# Mined again with the same settings because filter_closed changed the previous `result`
# in place; this time `result` is reduced to the maximal patterns.
result = aprioriall(baskets_sequence, min_sup=150, verbose=False)
filter_maximal(result)
result

[([['ABC TREASURE BOOK BOX']], 157),
 ([['ALARM CLOCK BAKELIKE CHOCOLATE']], 163),
 ([['ALARM CLOCK BAKELIKE ORANGE']], 166),
 ([['ANTIQUE SILVER TEA GLASS ENGRAVED']], 162),
 ([['ANTIQUE SILVER TLIGHT GLASS']], 346),
 ([['AREA PATROLLED METAL SIGN']], 172),
 ([['ASSORTED COLOUR MINI CASES']], 223),
 ([['ASSORTED COLOURS SILK FAN']], 187),
 ([['BAG SWIRLY MARBLES']], 313),
 ([['BATH BUILDING BLOCK WORD']], 155),
 ([['BATHROOM METAL SIGN']], 218),
 ([['BEWARE OF THE CAT METAL SIGN']], 168),
 ([['BINGO SET']], 196),
 ([['BISCUIT TIN CHRISTMAS']], 230),
 ([['BISCUIT TIN VINTAGE CHRISTMAS']], 199),
 ([['BLACKBLUE POLKADOT UMBRELLA']], 169),
 ([['BLUE DINER WALL CLOCK']], 176),
 ([['BLUE POLKADOT WRAP']], 166),
 ([['BLUE SPOT CERAMIC DRAWER KNOB']], 154),
 ([['BLUE STRIPE CERAMIC DRAWER KNOB']], 162),
 ([['BOX OF COCKTAIL PARASOLS']], 193),
 ([['BOX OF MINI CRACKERS']], 300),
 ([['BOX OF MINI VINTAGE CRACKERS']], 270),
 ([['BOX OF VINTAGE ALPHABET BLOCKS']], 154),
 ([['BREAD BIN DINER STYLE