# Task 4: Sequential Pattern Mining

In [55]:
import numpy as np
import pandas as pd
import copy

## Generalized Sequential Pattern (GSP) Mining

In [5]:
# Toy sequence database in list form: every sequence is a list of events and
# every event is a list of items (no explicit timestamps).
# NOTE: `dataset` is reassigned by the following cell, so when the cells are
# run in order this list-based variant is not the one used further down.
dataset = [
    # sequence: list of events
    [['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],  # event: {timestamp : list of strings} or [list of strings]
    [['a'], ['c'], ['b', 'c']], 
    [['a', 'b'], ['d'], ['c'], ['b'], ['c']], 
    [['a'], ['c'], ['b'], ['c']]
]

In [6]:
# Toy sequence database in dict form: every sequence maps a timestamp to the
# event (list of items) observed at that time. The four sequences hold the
# same events, in the same order, as the list-based variant defined earlier.
dataset = [
    # sequence: list of events
    {1: ['a'],  # event: {timestamp : list of strings} or [list of strings]
     2: ['a', 'b', 'c'], 
     3: ['a', 'c'], 
     4: ['c']}, 
    {1: ['a'], 
     2: ['c'], 
     3: ['b', 'c']}, 
    {1: ['a', 'b'], 
     2: ['d'], 
     3: ['c'], 
     4: ['b'], 
     5: ['c']}, 
    {1: ['a'], 
     2: ['c'], 
     3: ['b'], 
     4: ['c']}
]

In [7]:
def is_subsequence(main_sequence, subsequence):
    """
    Tells whether `subsequence` occurs, in order, inside `main_sequence`.

    Every itemset of `subsequence` has to be contained in its own event of
    `main_sequence`, and the matching events must appear in the same relative
    order. Each itemset is matched against the earliest event still available.
    Neither argument is modified.
    """
    position = 0  # index of the first event that may still be used for a match
    total = len(main_sequence)
    for itemset in subsequence:
        wanted = set(itemset)
        # skip events that do not contain the whole itemset
        while position < total and not set(main_sequence[position]).issuperset(wanted):
            position += 1
        if position == total:
            # ran out of events before placing this itemset
            return False
        # the next itemset has to be matched strictly after this event
        position += 1
    return True

In [8]:
sequence = [['a'], ['b', 'c'], ['d'], ['a', 'e']]

In [9]:
is_subsequence(sequence, [['a'], ['b', 'c'], ['e']])

True

In [10]:
is_subsequence(sequence, [['a'], ['b', 'd']])

False

In [11]:
def sequence_length(sequence):
    """
    Returns the number of items in the sequence, i.e. the sizes of all its
    itemsets added together (an empty sequence has length 0).
    """
    total = 0
    for itemset in sequence:
        total += len(itemset)
    return total

In [12]:
sequence_length([['a'], ['b', 'c'], ['a'], ['b', 'c', 'd']])

7

In [66]:
def count_support(dataset, cand_seq, max_span=np.inf, min_gap=0, max_gap=np.inf):
    """
    Computes the support of a sequence in a dataset provided time constraints

    Args:
        dataset: list of sequences. A sequence is either a dict
                 {timestamp: list of items} or a plain list of events; for a
                 plain list the position of the event (0, 1, 2, ...) is used
                 as its timestamp. Dict sequences are visited in increasing
                 timestamp order.
        cand_seq: candidate pattern, a list of itemsets (lists of items)
        max_span: largest allowed difference between the timestamps of the
                  last and the first matched event
        min_gap: smallest allowed difference between the timestamps of two
                 consecutively matched events (inclusive: gap >= min_gap)
        max_gap: largest allowed difference between the timestamps of two
                 consecutively matched events (inclusive: gap <= max_gap)

    Returns:
        The fraction of sequences in `dataset` containing at least one
        occurrence of `cand_seq` that satisfies all constraints
        (0.0 for an empty dataset).

    The constraints must be expressed in the same unit as the timestamp
    differences. With the default values no timestamp arithmetic is performed,
    so the result is the plain (unconstrained) subsequence support.
    """
    if not dataset:
        return 0.0

    pattern = [set(itemset) for itemset in cand_seq]
    # Without an upper bound the earliest match of every itemset is always the
    # best choice, so the search never needs to backtrack.
    unbounded_above = max_span == np.inf and max_gap == np.inf
    constrained = not (unbounded_above and min_gap <= 0)

    def timed_events(seq):
        # normalise both supported layouts to a time-ordered list of (timestamp, items)
        if isinstance(seq, dict):
            return sorted(seq.items(), key=lambda pair: pair[0])
        return list(enumerate(seq))

    def occurs(events, elem_idx, start, first_time, prev_time):
        # try to place pattern[elem_idx:] on events[start:]
        if elem_idx == len(pattern):
            return True
        for pos in range(start, len(events)):
            time, items = events[pos]
            if constrained and elem_idx > 0:
                # events are time-ordered: once an upper bound is exceeded,
                # every later event exceeds it as well
                if time - prev_time > max_gap or time - first_time > max_span:
                    break
                if time - prev_time < min_gap:
                    continue
            if not pattern[elem_idx].issubset(items):
                continue
            anchor = time if elem_idx == 0 else first_time
            if occurs(events, elem_idx + 1, pos + 1, anchor, time):
                return True
            if unbounded_above:
                # a later match of this itemset cannot succeed where the
                # earliest one failed
                return False
        return False

    hits = sum(1 for seq in dataset if occurs(timed_events(seq), 0, 0, None, None))
    return hits / len(dataset)

In [67]:
count_support(dataset, [['b']])

1.0

In [68]:
count_support(dataset, [['a'], ['b', 'c']])

0.5

In [16]:
def gen_cands_for_pair(cand1, cand2):
    """
    Generates one candidate of length k from two candidates of length (k-1)

    The two candidates are joined when `cand1` without its leftmost item
    equals `cand2` without its rightmost item. The joined candidate is `cand1`
    extended with the rightmost item of `cand2`: as a new event if that item
    formed an event of its own in `cand2`, otherwise merged into the last
    event of `cand1`.

    Args:
        cand1: sequence (list of events, each a list of items) of length k-1
        cand2: sequence (list of events, each a list of items) of length k-1

    Returns:
        The joined candidate, or an empty list if the pair cannot be joined.
        The inputs are not modified and the result shares no list with them.
    """
    cand1_clone = copy.deepcopy(cand1)
    cand2_clone = copy.deepcopy(cand2)
    # drop the leftmost item from cand1:
    if len(cand1[0]) == 1:
        cand1_clone.pop(0)
    else:
        cand1_clone[0] = cand1_clone[0][1:]
    # drop the rightmost item from cand2:
    if len(cand2[-1]) == 1:
        cand2_clone.pop(-1)
    else:
        cand2_clone[-1] = cand2_clone[-1][:-1]

    # if the remainders differ, the pair does not join
    if cand1_clone != cand2_clone:
        return []

    new_cand = copy.deepcopy(cand1)
    if len(cand2[-1]) == 1:
        # the extra item is an event of its own (copied so cand2 is not aliased)
        new_cand.append(copy.deepcopy(cand2[-1]))
    else:
        # append, not extend: the item is a single element even when it is a
        # multi-character string such as a product description
        new_cand[-1].append(cand2[-1][-1])
    return new_cand

In [17]:
candA = [['a'], ['b', 'c'], ['d']]
candB = [['b', 'c'], ['d', 'e']]
gen_cands_for_pair(candA, candB)

[['a'], ['b', 'c'], ['d', 'e']]

In [18]:
candA = [['a'], ['b', 'c'], ['d']]
candC = [['b', 'c'], ['d'], ['e']]
gen_cands_for_pair(candA, candC)

[['a'], ['b', 'c'], ['d'], ['e']]

In [19]:
candA = [['a'], ['b', 'c'], ['d']]
candD = [['a'], ['b', 'c'], ['e']]
gen_cands_for_pair(candA, candD)

[]

In [20]:
def gen_cands(last_lvl_cands):
    """
    Builds all candidate sequences of length k out of the frequent sequences
    of length (k-1) given in `last_lvl_cands`.

    For k == 2 every pair of frequent items is combined directly, both inside
    one event and as two consecutive events. For larger k every ordered pair
    of sequences is passed to `gen_cands_for_pair` and the successful joins
    are returned in sorted order.
    """
    target_len = sequence_length(last_lvl_cands[0]) + 1
    if target_len != 2:
        joined = [gen_cands_for_pair(left, right)
                  for left in last_lvl_cands
                  for right in last_lvl_cands]
        # an empty list signals that the pair could not be joined
        return sorted(cand for cand in joined if cand != [])

    singles = [item for seq in last_lvl_cands for event in seq for item in event]
    # both items in the same event (each unordered pair once) ...
    same_event = [[[first, second]] for first in singles for second in singles if second > first]
    # ... and every ordered pair, repetitions included, as two separate events
    two_events = [[[first], [second]] for first in singles for second in singles]
    return same_event + two_events

Let's assume we know the frequent sequences of level 2:

In [21]:
last_lvl_freq_patterns = [
    [['a', 'b']], 
    [['b', 'c']], 
    [['a'], ['b']], 
    [['a'], ['c']], 
    [['b'], ['c']], 
    [['c'], ['b']], 
    [['c'], ['c']]
]

Then we can generate the candidates for level 3:

In [22]:
new_cands = gen_cands(last_lvl_freq_patterns)
new_cands

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['a', 'b', 'c']],
 [['b'], ['c'], ['b']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['b']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [23]:
def gen_direct_subsequences(sequence):
    """
    Lists every direct subsequence of `sequence`.

    A direct subsequence is obtained by deleting exactly one item from one
    event of the original sequence; an event that held a single item is
    removed altogether. The input is left untouched.
    """
    subsequences = []
    for event_idx, event in enumerate(sequence):
        if len(event) == 1:
            # removing the only item makes the whole event disappear
            reduced = copy.deepcopy(sequence)
            del reduced[event_idx]
            subsequences.append(reduced)
            continue
        for item_idx, _ in enumerate(event):
            reduced = copy.deepcopy(sequence)
            del reduced[event_idx][item_idx]
            subsequences.append(reduced)
    return subsequences

def prune_cands(last_lvl_cands, cands_gen):
    """
    Filters the candidates of length k in `cands_gen`, keeping only those
    whose direct subsequences are all contained in `last_lvl_cands`
    (the frequent sequences of length k-1).
    """
    survivors = []
    for cand in cands_gen:
        for sub in gen_direct_subsequences(cand):
            if sub not in last_lvl_cands:
                # one infrequent subsequence is enough to discard the candidate
                break
        else:
            survivors.append(cand)
    return survivors

We apply this to the level-3 candidates generated above:

In [24]:
cands_pruned = prune_cands(last_lvl_freq_patterns, new_cands)
cands_pruned

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [69]:
min_sup = 0.5
cands_counts = [(s, count_support(dataset, s)) for s in cands_pruned]
result_lvl = [(i, count) for i, count in cands_counts if count >= min_sup]
result_lvl

[([['a'], ['b'], ['c']], 0.75),
 ([['a'], ['b', 'c']], 0.5),
 ([['a'], ['c'], ['b']], 0.75),
 ([['a'], ['c'], ['c']], 1.0),
 ([['a', 'b'], ['c']], 0.5),
 ([['b'], ['c'], ['c']], 0.5),
 ([['c'], ['b'], ['c']], 0.5)]

In [171]:
def gsp(dataset, min_sup, max_span=np.inf, min_gap=0, max_gap=np.inf, verbose=False):
    """
    The Generalized Sequential Pattern (GSP) algorithm.
    Computes the frequent sequences in a sequence dataset, level by level:
    the frequent sequences with k-1 items are joined into candidates with k
    items, the candidates are pruned, their support is counted and the ones
    below `min_sup` are dropped. Mining stops at the first level that yields
    no frequent sequence.

    Args:
        dataset: a list of sequences; each sequence is either a list of events
                 or a dict {timestamp: event}, an event being a list of items
        min_sup: the minimum support (as returned by `count_support`) that
                 makes a sequence frequent
        max_span: time constraint forwarded unchanged to `count_support`
        min_gap: time constraint forwarded unchanged to `count_support`
        max_gap: time constraint forwarded unchanged to `count_support`
        verbose: if greater than 0 the result of each level is printed; if
                 greater than 1 the generated and the pruned candidates of
                 each level are printed as well

    Returns:
        A list of tuples (s, c), where s is a frequent sequence and c is the
        support computed for it, sorted by decreasing support

    The three time constraints are only used when counting candidates with
    two or more items; single items are counted without them.

    NOTE(review): pruning checks every direct subsequence of a candidate.
    Under a finite `max_gap` only contiguous subsequences are guaranteed to be
    frequent, so this may prune too much if the counting step enforces
    `max_gap` - verify.
    """
    def keep_frequent(scored):
        # candidate elimination: drop everything below the support threshold
        return [(seq, support) for seq, support in scored if support >= min_sup]

    # first pass over the database: collect every distinct item ...
    alphabet = set()
    for sequence in dataset:
        events = sequence.values() if isinstance(sequence, dict) else sequence
        for event in events:
            alphabet.update(event)
    # ... and keep the frequent 1-item sequences
    singletons = [[[item]] for item in sorted(alphabet)]
    levels = [keep_frequent([(s, count_support(dataset, s)) for s in singletons])]
    if verbose > 0:
        print('Result, lvl 1: ' + str(levels[0]))

    while levels[-1]:
        level_no = len(levels) + 1
        frequent_prev = [entry[0] for entry in levels[-1]]
        # 1. candidate generation: join pairs of frequent sequences of the previous level
        cands_gen = gen_cands(frequent_prev)
        # 2. candidate pruning (Apriori principle)
        cands_pruned = prune_cands(frequent_prev, cands_gen)
        # 3. support counting: one more pass over the sequence database
        scored = [(cand, count_support(dataset, cand, max_span, min_gap, max_gap))
                  for cand in cands_pruned]
        # 4. candidate elimination
        result_lvl = keep_frequent(scored)
        if verbose > 0:
            print('Result, lvl ' + str(level_no) + ': ' + str(result_lvl))
        if verbose > 1:
            print('Candidates generated, lvl ' + str(level_no) + ': ' + str(cands_gen))
            print('Candidates pruned, lvl ' + str(level_no) + ': ' + str(cands_pruned))
        levels.append(result_lvl)

    # flatten the levels (the last one is empty) and order by decreasing support
    flat = [entry for level in levels for entry in level]
    return sorted(flat, key=lambda entry: entry[1], reverse=True)

In [172]:
gsp(dataset, min_sup=0.5, verbose=2)

Result, lvl 1: [([['a']], 1.0), ([['b']], 1.0), ([['c']], 1.0)]
Result, lvl 2: [([['a', 'b']], 0.5), ([['b', 'c']], 0.5), ([['a'], ['b']], 1.0), ([['a'], ['c']], 1.0), ([['b'], ['c']], 0.75), ([['c'], ['b']], 0.75), ([['c'], ['c']], 1.0)]
Candidates generated, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Candidates pruned, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Result, lvl 3: [([['a'], ['b'], ['c']], 0.75), ([['a'], ['b', 'c']], 0.5), ([['a'], ['c'], ['b']], 0.75), ([['a'], ['c'], ['c']], 1.0), ([['a', 'b'], ['c']], 0.5), ([['b'], ['c'], ['c']], 0.5), ([['c'], ['b'], ['c']], 0.5)]
Candidates generated, lvl 3: [[['a'], ['b'], ['c']], [['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c

[([['a']], 1.0),
 ([['b']], 1.0),
 ([['c']], 1.0),
 ([['a'], ['b']], 1.0),
 ([['a'], ['c']], 1.0),
 ([['c'], ['c']], 1.0),
 ([['a'], ['c'], ['c']], 1.0),
 ([['b'], ['c']], 0.75),
 ([['c'], ['b']], 0.75),
 ([['a'], ['b'], ['c']], 0.75),
 ([['a'], ['c'], ['b']], 0.75),
 ([['a', 'b']], 0.5),
 ([['b', 'c']], 0.5),
 ([['a'], ['b', 'c']], 0.5),
 ([['a', 'b'], ['c']], 0.5),
 ([['b'], ['c'], ['c']], 0.5),
 ([['c'], ['b'], ['c']], 0.5),
 ([['a'], ['c'], ['b'], ['c']], 0.5),
 ([['a', 'b'], ['c'], ['c']], 0.5)]

## Loading the new Customer Supermarket dataset

In [219]:
df = pd.read_csv('../dataset/new_customer_supermarket.csv', sep='\t', index_col=0)
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale
0,539993,2011-04-01 10:00:00,1.95,13313.0,22386,JUMBO BAG PINK POLKADOT,10,19.50
1,539993,2011-04-01 10:00:00,0.42,13313.0,21499,BLUE POLKADOT WRAP,25,10.50
2,539993,2011-04-01 10:00:00,0.42,13313.0,21498,RED RETROSPOT WRAP,25,10.50
3,539993,2011-04-01 10:00:00,2.10,13313.0,22379,RECYCLING BAG RETROSPOT,5,10.50
4,539993,2011-04-01 10:00:00,1.25,13313.0,20718,RED RETROSPOT SHOPPER BAG,10,12.50
...,...,...,...,...,...,...,...,...
363572,581587,2011-09-12 12:50:00,0.85,12680.0,22613,PACK OF SPACEBOY NAPKINS,12,10.20
363573,581587,2011-09-12 12:50:00,2.10,12680.0,22899,CHILDRENS APRON DOLLY GIRL,6,12.60
363574,581587,2011-09-12 12:50:00,4.15,12680.0,23254,CHILDRENS CUTLERY DOLLY GIRL,4,16.60
363575,581587,2011-09-12 12:50:00,4.15,12680.0,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,16.60


In [220]:
df.dtypes

BasketID        int64
BasketDate     object
Sale          float64
CustomerID    float64
ProdID         object
ProdDescr      object
Qta             int64
TotSale       float64
dtype: object

In [221]:
# Parse the basket timestamps and treat the two identifier columns as labels.
# The datetime unit is spelled out because pandas >= 2.0 refuses to cast to
# the unit-less 'datetime64' dtype.
df = df.astype({'BasketDate': 'datetime64[ns]',
                'BasketID': 'object',
                'CustomerID': 'object'})

## Data Modeling

In [222]:
# Put the rows in chronological order and derive, for every row, the day of
# the year on which the basket was bought.
df = (
    df.sort_values('BasketDate')
      .assign(BasketDayOfYear=lambda frame: frame['BasketDate'].dt.dayofyear)
)
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale,BasketDayOfYear
20767,542776,2011-01-02 08:23:00,0.30,15240,17021,NAMASTE SWAGAT INCENSE,36,10.80,2
20769,542776,2011-01-02 08:23:00,4.95,15240,21485,RETROSPOT HEART HOT WATER BOTTLE,3,14.85,2
20768,542776,2011-01-02 08:23:00,3.75,15240,21218,RED SPOTTY BISCUIT TIN,5,18.75,2
20766,542776,2011-01-02 08:23:00,2.95,15240,22083,PAPER CHAIN KIT RETROSPOT,12,35.40,2
20765,542776,2011-01-02 08:23:00,4.65,15240,22835,HOT WATER BOTTLE I AM SO POORLY,4,18.60,2
...,...,...,...,...,...,...,...,...,...
254281,570876,2011-12-10 17:19:00,3.75,16085,23394,POSTE FRANCE CUSHION COVER,1,3.75,344
254280,570876,2011-12-10 17:19:00,1.55,16085,46000M,POLYESTER FILLER PAD,1,1.55,344
254279,570876,2011-12-10 17:19:00,1.45,16085,46000S,POLYESTER FILLER PAD,1,1.45,344
254291,570876,2011-12-10 17:19:00,1.65,16085,22469,HEART OF WICKER SMALL,3,4.95,344


In [223]:
# Export (customer, day, product) rows as a tab-separated file for RapidMiner.
# A row is kept unless it has the same CustomerID, BasketDayOfYear and
# ProdDescr as the row immediately before it.
# NOTE(review): only *adjacent* duplicates are dropped, so the result depends
# on the current row order of `df`; repeated products that are not adjacent
# stay in the file - confirm this is intended.
baskets_sequences = df[['CustomerID', 'BasketDayOfYear', 'ProdDescr']]
baskets_sequences = baskets_sequences.loc[(baskets_sequences.CustomerID != baskets_sequences.CustomerID.shift()) | 
                                          (baskets_sequences.BasketDayOfYear != baskets_sequences.BasketDayOfYear.shift()) | 
                                          (baskets_sequences.ProdDescr != baskets_sequences.ProdDescr.shift())] # remove consecutive duplicate products from baskets
baskets_sequences.to_csv('./rapidminer/customer_supermarket.csv', sep='\t', index=False)

In [224]:
# Build one sequence per customer: a dict that maps each day of the year to
# the list of product descriptions the customer bought on that day.
# This is computed from `df` directly, not from the frame exported above, and
# it rebinds `baskets_sequences` to a Series indexed by CustomerID.
baskets_sequences = df.groupby('CustomerID').apply(lambda customer: customer.groupby('BasketDayOfYear')['ProdDescr'].apply(list).to_dict())
baskets_sequences

CustomerID
12347.0    {26: ['PACK OF SPACEBOY CAKE CASES', 'TEA TIME...
12348.0    {25: ['PACK OF SKULL TISSUES', 'MULTI HEARTS S...
12349.0    {325: ['PANTRY CHOPPING BOARD', 'SMALL WHITE R...
12350.0    {33: ['BLUE POLKADOT PASSPORT COVER', 'RED HAR...
12352.0    {3: ['DELUXE SEWING KIT', 'PINK HEART SHAPE EG...
                                 ...                        
18280.0    {184: ['SET OF ROUND TINS CAMEMBERT', 'KINGS C...
18281.0    {340: ['ROBOT BIRTHDAY CARD', 'CARD CIRCUS PAR...
18282.0    {43: ['REGENCY SUGAR BOWL GREEN', 'CARD CHRIST...
18283.0    {23: ['BLUE PIECE POLKADOT CUTLERY SET', 'TEA ...
18287.0    {142: ['SET SPRING FLOWER DECORATION', 'CERAMI...
Length: 4206, dtype: object

In [225]:
baskets_sequences = baskets_sequences.tolist()
baskets_sequences[:1]

[{26: ['PACK OF SPACEBOY CAKE CASES',
   'TEA TIME OVEN GLOVE',
   'RED RETROSPOT OVEN GLOVE',
   'RED RETROSPOT OVEN GLOVE DOUBLE',
   'SET RED RETROSPOT TEA TOWELS',
   'REGENCY CAKESTAND TIER',
   'TOOTHPASTE TUBE PEN',
   'MINI LADLE LOVE HEART RED',
   'CHOCOLATE CALCULATOR',
   'SET OF TINS VINTAGE BATHROOM',
   'RED TOADSTOOL LED NIGHT LIGHT',
   'DOG PICTURE PLAYING CARDS',
   'BOX OF ASSORTED COLOUR TEASPOONS',
   'TEATIME FAIRY CAKE CASES',
   'PACK OF MUSHROOM CAKE CASES',
   'SMALL HEART MEASURING SPOONS',
   'SWEETHEART FAIRY CAKE CASES',
   'BLUE NEW BAROQUE CANDLESTICK CANDLE',
   'BLACK CANDELABRA TLIGHT HOLDER',
   'WOODLAND CHARLOTTE BAG',
   'AIRLINE BAG VINTAGE JET SET BROWN',
   'AIRLINE BAG VINTAGE JET SET WHITE',
   'PINK NEW BAROQUECANDLESTICK CANDLE',
   'ALARM CLOCK BAKELIKE CHOCOLATE',
   'ALARM CLOCK BAKELIKE GREEN',
   'ALARM CLOCK BAKELIKE RED',
   'ALARM CLOCK BAKELIKE PINK',
   'ALARM CLOCK BAKELIKE ORANGE',
   'SANDWICH BATH SPONGE'],
  39: ['SWEETHEART

Assuming no timing constraints:

In [180]:
result_set = gsp(baskets_sequences, min_sup=0.05, max_span=np.inf, min_gap=0, max_gap=np.inf, verbose=False)
list(filter(lambda freq_pattern: sequence_length(freq_pattern[0]) >= 2, result_set))

[([['WHITE HANGING HEART TLIGHT HOLDER'],
   ['WHITE HANGING HEART TLIGHT HOLDER']],
  0.08701854493580599),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  0.07227769852591535),
 ([['JUMBO BAG RED RETROSPOT'], ['JUMBO BAG RED RETROSPOT']],
  0.07156443176414645),
 ([['REGENCY CAKESTAND TIER'], ['REGENCY CAKESTAND TIER']],
  0.07013789824060865),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER']],
  0.06704707560627675),
 ([['PAPER CHAIN KIT CHRISTMAS', 'PAPER CHAIN KIT VINTAGE CHRISTMAS']],
  0.06657156443176415),
 ([['ASSORTED COLOUR BIRD ORNAMENT'], ['ASSORTED COLOUR BIRD ORNAMENT']],
  0.06514503090822635),
 ([['PARTY BUNTING'], ['PARTY BUNTING']], 0.06466951973371374),
 ([['LUNCH BAG RED RETROSPOT'], ['LUNCH BAG RED RETROSPOT']],
  0.06324298621017593),
 ([['PINK REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  0.060865430337612936),
 ([['GARDENERS KNEELING PAD CUP OF TEA', 'GARDENERS KNEELING PAD KEEP CALM']],

RapidMiner frequent patterns:

Assuming timing constraints:

In [None]:
result_set = gsp(baskets_sequences, 
                 min_sup=0.05, 
                 max_span=30,  # the overall duration of the pattern instance must be at most of 1 month
                 min_gap=1,  # each element of the pattern instance must be at least 1 day after the previous one
                 max_gap=7,  # each element of the pattern instance must be at most 1 week after the previous one
                 verbose=False)
list(filter(lambda freq_pattern: sequence_length(freq_pattern[0]) >= 2, result_set))