# Task 4 extra: Sequential Pattern Mining

In [1]:
import copy
import pandas as pd
from operator import neg
from collections import defaultdict

## PrefixSpan: Sequential Pattern Mining by Pattern-Growth

In [9]:
# Toy sequence database used by the examples in this notebook.
# Structure: dataset -> sequences -> events (itemsets) -> items (strings).
dataset = [
    # sequence: list of events
    [['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],  # event: [list of strings]
    [['a'], ['c'], ['b', 'c']], 
    [['a', 'b'], ['d'], ['c'], ['b'], ['c']], 
    [['a'], ['c'], ['b'], ['c']]
]

In [58]:
def is_subsequence(main_sequence, subsequence):
    """
    Checks if `subsequence` is a subsequence of `main_sequence`.

    Every itemset of `subsequence` has to be a subset of a distinct itemset of
    `main_sequence`, and the matched itemsets have to appear in the same order.

    Args:
        main_sequence: the sequence (list of itemsets) that is searched
        subsequence: the candidate pattern (list of itemsets); it is not modified
    Returns:
        True if `subsequence` is contained in `main_sequence`, False otherwise.
        An empty `subsequence` is contained in every sequence.
    """
    # Greedy left-to-right scan: matching each itemset at the earliest possible
    # position leaves the largest remainder for the following itemsets.
    # Done iteratively so that long patterns cannot hit the recursion limit.
    position = 0
    end = len(main_sequence)
    for itemset in subsequence:
        wanted = set(itemset)
        # advance to the first event that contains all the wanted items
        while position < end and not set(main_sequence[position]).issuperset(wanted):
            position += 1
        if position >= end:
            return False
        # the next itemset has to be matched strictly after this event
        position += 1
    return True

In [59]:
# Example sequence of four events used to try out is_subsequence
sequence = [['a'], ['b', 'c'], ['d'], ['a', 'e']]

In [60]:
# ['a'], ['b', 'c'] and ['e'] are each covered by an event, in this order
is_subsequence(sequence, [['a'], ['b', 'c'], ['e']])

True

In [61]:
# 'b' and 'd' never occur together in a single event of the sequence
is_subsequence(sequence, [['a'], ['b', 'd']])

False

In [2]:
def project_sequence(sequence, prefix, new_event):
    """
    Builds the projection of a single sequence with respect to a prefix

    Args:
        sequence: the sequence to project (list of itemsets)
        prefix: the items that have to occur together in one itemset
        new_event: if True, the first itemset of the sequence is never matched
    Returns:
        None if no admissible itemset contains all items of the prefix.
        Otherwise a new list that starts with a copy of the first matching
        itemset, followed by copies of all the itemsets that come after it.
    """
    events = iter(sequence)
    for position, itemset in enumerate(events):
        # with new_event the itemset at position 0 is skipped
        if new_event and position == 0:
            continue
        if all(x in itemset for x in prefix):
            projection = [list(itemset)]
            # `events` now points right behind the match: copy the remainder
            projection.extend(copy.copy(later) for later in events)
            return projection
    return None

In [3]:
sequence = [['a'], ['b', 'c'], ['a', 'c'], ['c']]
# the projection starts at the first event that contains 'b'
project_sequence(sequence, ['b'], False)

[['b', 'c'], ['a', 'c'], ['c']]

In [4]:
# 'a' and 'c' have to occur in the same event for it to match the prefix
project_sequence(sequence, ['a', 'c'], False)

[['a', 'c'], ['c']]

In [5]:
# the very first event already contains 'a', so the whole sequence is returned
project_sequence(sequence, ['a'], False)

[['a'], ['b', 'c'], ['a', 'c'], ['c']]

In [6]:
# new_event=True skips the first event, so the match is the later ['a', 'c'] event
project_sequence(sequence, ['a'], True)

[['a', 'c'], ['c']]

In [7]:
def project_dataset(dataset, prefix, new_event):
    """
    Builds the projected database of a dataset with respect to a prefix

    Args:
        dataset: the sequences to project
        prefix: the items that are searched for in every sequence
        new_event: if True, the first itemset of every sequence is ignored
    Returns:
        The projections of the sequences that contain the prefix, in dataset
        order. Sequences without a match are left out, so the list may be empty.
    """
    projections = (project_sequence(sequence, prefix, new_event) for sequence in dataset)
    # project_sequence signals "prefix not found" with None
    return [projection for projection in projections if projection is not None]

In [10]:
# every sequence of the toy dataset contains 'c', so all four are projected
project_dataset(dataset, ['c'], False)

[[['a', 'b', 'c'], ['a', 'c'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b'], ['c']]]

In [14]:
def sequence_length(sequence):
    """
    Returns the total number of items of a sequence, i.e. the sizes of all its itemsets added up
    """
    return sum(map(len, sequence))

In [15]:
# 1 + 2 + 1 + 3 items
sequence_length([['a'], ['b', 'c'], ['a'], ['b', 'c', 'd']])

7

In [11]:
def generate_items(dataset):
    """
    Generates a sorted list of all distinct items that are contained in a dataset

    Args:
        dataset: list of sequences, each one a list of events (lists of items)
    Returns:
        The distinct items of the dataset in ascending order
    """
    # the innermost loop has to run over `event`, the loop variable of the middle loop
    return sorted({item for sequence in dataset for event in sequence for item in event})

def generate_item_supports(dataset, ignore_first_event=False, prefix=[]):
    """
    Counts, for every item, the number of sequences in which it co-occurs with the prefix

    Args:
        dataset: the sequences to scan
        ignore_first_event: if True, the first itemset of every sequence is skipped
        prefix: items that have to be present in an itemset for it to be considered;
            the items of the prefix themselves are not counted
    Returns:
        A list of (item, support) tuples sorted by item. Each sequence contributes
        at most 1 to the support of an item.
    """
    supports = defaultdict(int)
    for sequence in dataset:
        events = sequence[1:] if ignore_first_event else sequence
        # a set, so that an item is counted once per sequence
        items_seen = {
            item
            for itemset in events
            if all(x in itemset for x in prefix)
            for item in itemset
            if item not in prefix
        }
        for item in items_seen:
            supports[item] += 1
    return sorted(supports.items())

In [16]:
def prefixSpan(dataset, min_sup):
    """
    The PrefixSpan algorithm. Mines the frequent sequences of a sequence dataset by pattern growth.

    Args:
        dataset: a list of sequences, for which the frequent (sub-)sequences are computed
        min_sup: the minimum relative support, i.e. the fraction of the sequences of
            `dataset` in which a sequence has to occur to be frequent
    Returns:
        A list of tuples (s, sup), where s is a frequent sequence and sup is its
        support divided by the number of sequences in `dataset`. The list is ordered
        by decreasing support; sequences with the same support are ordered by
        increasing length.
    """
    n_sequences = len(dataset)
    # relative support -> absolute number of sequences
    threshold = min_sup * n_sequences

    def grow(projected_db, prefix):
        """
        Returns every frequent extension (with its absolute support) of `prefix`,
        mined from the database that was projected on that prefix
        """
        found = []
        last_event = prefix[-1]

        # 1) grow the last event of the prefix (the item happens at the same time)
        for item, support in generate_item_supports(projected_db, False, prefix=last_event):
            # only items greater than the last one are appended, so that every
            # itemset is generated in a single item order
            if support >= threshold and item > last_event[-1]:
                extended = copy.deepcopy(prefix)
                extended[-1].append(item)
                found.append((extended, support))
                found += grow(project_dataset(projected_db, extended[-1], False), extended)

        # 2) open a new event behind the prefix (the item happens later)
        for item, support in generate_item_supports(projected_db, True):
            if support >= threshold:
                extended = copy.deepcopy(prefix) + [[item]]
                found.append((extended, support))
                found += grow(project_dataset(projected_db, [item], True), extended)
        return found

    patterns = []
    # every frequent single item is the seed of a pattern-growth branch
    for item, support in generate_item_supports(dataset):
        if support >= threshold:
            seed = [[item]]
            patterns.append((seed, support))
            patterns.extend(grow(project_dataset(dataset, [item], False), seed))

    # highest support first, shorter sequences first among equal supports
    patterns.sort(key=lambda entry: (-entry[1], sequence_length(entry[0])))
    return [(pattern, support / n_sequences) for pattern, support in patterns]

In [67]:
# min_sup is a fraction of the dataset: 0.5 keeps sequences found in at least half of the sequences
prefixSpan(dataset, min_sup=0.5)

[([['a']], 1.0),
 ([['b']], 1.0),
 ([['c']], 1.0),
 ([['a'], ['b']], 1.0),
 ([['a'], ['c']], 1.0),
 ([['c'], ['c']], 1.0),
 ([['a'], ['c'], ['c']], 1.0),
 ([['b'], ['c']], 0.75),
 ([['c'], ['b']], 0.75),
 ([['a'], ['b'], ['c']], 0.75),
 ([['a'], ['c'], ['b']], 0.75),
 ([['a', 'b']], 0.5),
 ([['b', 'c']], 0.5),
 ([['a', 'b'], ['c']], 0.5),
 ([['a'], ['b', 'c']], 0.5),
 ([['b'], ['c'], ['c']], 0.5),
 ([['c'], ['b'], ['c']], 0.5),
 ([['a', 'b'], ['c'], ['c']], 0.5),
 ([['a'], ['c'], ['b'], ['c']], 0.5)]

In [68]:
def filter_closed(result):
    """
    Given a list of all frequent sequences and their supports, reduce it in place
    to the closed frequent sequences.

    A sequence is dropped if the list contains a different sequence that has it
    as a subsequence and has exactly the same support.

    Args:
        result: list of (sequence, support) tuples, e.g. as returned by prefixSpan.
            The list is modified in place; surviving entries keep their order.
    Returns:
        None
    """
    # A shallow snapshot is enough: the entries are only read, never modified.
    candidates = list(result)

    def is_absorbed(sequence, support):
        # cheap comparisons first, the subsequence test only when they pass
        return any(
            support == other_support
            and sequence != other
            and is_subsequence(other, sequence)
            for other, other_support in candidates
        )

    # a single in-place rebuild instead of repeated list.remove calls
    result[:] = [entry for entry in candidates if not is_absorbed(*entry)]

In [79]:
result = prefixSpan(dataset, min_sup=0.5)
# filter_closed edits the list in place and returns nothing, so `result` is displayed afterwards
filter_closed(result)
result

[([['a'], ['b']], 1.0),
 ([['a'], ['c'], ['c']], 1.0),
 ([['a'], ['b'], ['c']], 0.75),
 ([['a'], ['c'], ['b']], 0.75),
 ([['a'], ['b', 'c']], 0.5),
 ([['a', 'b'], ['c'], ['c']], 0.5),
 ([['a'], ['c'], ['b'], ['c']], 0.5)]

In [70]:
def filter_maximal(result):
    """
    Given a list of all frequent sequences and their supports, reduce it in place
    to the maximal frequent sequences.

    A sequence is dropped if the list contains a different sequence that has it
    as a subsequence, whatever the supports are.

    Args:
        result: list of (sequence, support) tuples, e.g. as returned by prefixSpan.
            The list is modified in place; surviving entries keep their order.
    Returns:
        None
    """
    # A shallow snapshot is enough: the entries are only read, never modified.
    candidates = list(result)

    def is_absorbed(sequence):
        # cheap inequality first, the subsequence test only when it passes
        return any(
            sequence != other and is_subsequence(other, sequence)
            for other, _ in candidates
        )

    # a single in-place rebuild instead of repeated list.remove calls
    result[:] = [entry for entry in candidates if not is_absorbed(entry[0])]

In [73]:
result = prefixSpan(dataset, min_sup=0.5)
# filter_maximal edits the list in place and returns nothing, so `result` is displayed afterwards
filter_maximal(result)
result

[([['a'], ['b', 'c']], 0.5),
 ([['a', 'b'], ['c'], ['c']], 0.5),
 ([['a'], ['c'], ['b'], ['c']], 0.5)]

## Loading the new Customer Supermarket dataset

In [50]:
# The file is tab-separated; its first column is used as the index of the DataFrame.
df = pd.read_csv('../dataset/new_customer_supermarket.csv', sep='\t', index_col=0)
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale
0,539993,2011-04-01 10:00:00,1.95,13313.0,22386,JUMBO BAG PINK POLKADOT,10,19.50
1,539993,2011-04-01 10:00:00,0.42,13313.0,21499,BLUE POLKADOT WRAP,25,10.50
2,539993,2011-04-01 10:00:00,0.42,13313.0,21498,RED RETROSPOT WRAP,25,10.50
3,539993,2011-04-01 10:00:00,2.10,13313.0,22379,RECYCLING BAG RETROSPOT,5,10.50
4,539993,2011-04-01 10:00:00,1.25,13313.0,20718,RED RETROSPOT SHOPPER BAG,10,12.50
...,...,...,...,...,...,...,...,...
363572,581587,2011-09-12 12:50:00,0.85,12680.0,22613,PACK OF SPACEBOY NAPKINS,12,10.20
363573,581587,2011-09-12 12:50:00,2.10,12680.0,22899,CHILDRENS APRON DOLLY GIRL,6,12.60
363574,581587,2011-09-12 12:50:00,4.15,12680.0,23254,CHILDRENS CUTLERY DOLLY GIRL,4,16.60
363575,581587,2011-09-12 12:50:00,4.15,12680.0,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,16.60


In [51]:
# column types as inferred by read_csv
df.dtypes

BasketID        int64
BasketDate     object
Sale          float64
CustomerID    float64
ProdID         object
ProdDescr      object
Qta             int64
TotSale       float64
dtype: object

In [52]:
# BasketDate becomes a real timestamp; the identifiers are treated as labels, not numbers.
# The resolution is spelled out ('datetime64[ns]') because recent pandas versions
# reject the unit-less 'datetime64' in astype.
df = df.astype({'BasketDate': 'datetime64[ns]',
                'BasketID': 'object',
                'CustomerID': 'object'})

## Data Modeling

In [53]:
# Order the purchases chronologically. A stable sort keeps the file order of the rows
# that share a timestamp, so the result is reproducible across runs.
df = df.sort_values('BasketDate', kind='mergesort')
# An event (basket) is a set of items, so a product that is repeated within the same
# customer basket carries no information: keep every
# (CustomerID, BasketDate, ProdDescr) combination only once. Unlike a comparison with
# the previous row, this does not depend on the duplicates being adjacent.
baskets_sequences = df[['CustomerID', 'BasketDate', 'ProdDescr']].drop_duplicates()

In [54]:
# One sequence per customer: every basket (CustomerID, BasketDate) is an event holding
# its product descriptions, and the events of a customer are ordered by date.
# The purchases are de-duplicated here, so that a product appears at most once per
# basket; starting from `df` also keeps this cell safe to re-run.
baskets_sequences = (
    df[['CustomerID', 'BasketDate', 'ProdDescr']]
    .drop_duplicates()
    .groupby(['CustomerID', 'BasketDate'])['ProdDescr'].apply(list)  # basket -> list of products
    .groupby(level='CustomerID').apply(list)                         # customer -> list of baskets
    .tolist()
)
baskets_sequences[:1]

[[['PACK OF SPACEBOY CAKE CASES',
   'TEA TIME OVEN GLOVE',
   'RED RETROSPOT OVEN GLOVE',
   'RED RETROSPOT OVEN GLOVE DOUBLE',
   'SET RED RETROSPOT TEA TOWELS',
   'REGENCY CAKESTAND TIER',
   'TOOTHPASTE TUBE PEN',
   'MINI LADLE LOVE HEART RED',
   'CHOCOLATE CALCULATOR',
   'SET OF TINS VINTAGE BATHROOM',
   'RED TOADSTOOL LED NIGHT LIGHT',
   'DOG PICTURE PLAYING CARDS',
   'BOX OF ASSORTED COLOUR TEASPOONS',
   'TEATIME FAIRY CAKE CASES',
   'PACK OF MUSHROOM CAKE CASES',
   'SMALL HEART MEASURING SPOONS',
   'SWEETHEART FAIRY CAKE CASES',
   'BLUE NEW BAROQUE CANDLESTICK CANDLE',
   'BLACK CANDELABRA TLIGHT HOLDER',
   'WOODLAND CHARLOTTE BAG',
   'AIRLINE BAG VINTAGE JET SET BROWN',
   'AIRLINE BAG VINTAGE JET SET WHITE',
   'PINK NEW BAROQUECANDLESTICK CANDLE',
   'ALARM CLOCK BAKELIKE CHOCOLATE',
   'ALARM CLOCK BAKELIKE GREEN',
   'ALARM CLOCK BAKELIKE RED',
   'ALARM CLOCK BAKELIKE PINK',
   'ALARM CLOCK BAKELIKE ORANGE',
   'SANDWICH BATH SPONGE'],
  ['SWEETHEART FAIRY C

In [75]:
%time freq_patterns = prefixSpan(baskets_sequences, min_sup=0.05)
list(filter(lambda freq_pattern: sequence_length(freq_pattern[0]) >= 2, freq_patterns))

CPU times: user 10.5 s, sys: 3.77 ms, total: 10.5 s
Wall time: 10.5 s


[([['WHITE HANGING HEART TLIGHT HOLDER'],
   ['WHITE HANGING HEART TLIGHT HOLDER']],
  0.08749405611031859),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  0.07203994293865906),
 ([['JUMBO BAG RED RETROSPOT'], ['JUMBO BAG RED RETROSPOT']],
  0.07180218735140276),
 ([['REGENCY CAKESTAND TIER'], ['REGENCY CAKESTAND TIER']],
  0.07085116500237755),
 ([['GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER']],
  0.06680932001902044),
 ([['PAPER CHAIN KIT CHRISTMAS', 'PAPER CHAIN KIT VINTAGE CHRISTMAS']],
  0.06609605325725154),
 ([['ASSORTED COLOUR BIRD ORNAMENT'], ['ASSORTED COLOUR BIRD ORNAMENT']],
  0.06538278649548264),
 ([['PARTY BUNTING'], ['PARTY BUNTING']], 0.06490727532097004),
 ([['LUNCH BAG RED RETROSPOT'], ['LUNCH BAG RED RETROSPOT']],
  0.06324298621017593),
 ([['PINK REGENCY TEACUP AND SAUCER', 'ROSES REGENCY TEACUP AND SAUCER']],
  0.06062767475035663),
 ([['GARDENERS KNEELING PAD CUP OF TEA', 'GARDENERS KNEELING PAD KEEP CALM']],
