# Task 4: Sequential Pattern Mining

In [107]:
import numpy as np
import pandas as pd
import copy

## Loading the new dataset

In [108]:
df = pd.read_csv('../dataset/new_customer_supermarket.csv', sep='\t', index_col=0)
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale
0,539993,2011-04-01 10:00:00,1.95,13313.0,22386,JUMBO BAG PINK POLKADOT,10,19.50
1,539993,2011-04-01 10:00:00,0.42,13313.0,21499,BLUE POLKADOT WRAP,25,10.50
2,539993,2011-04-01 10:00:00,0.42,13313.0,21498,RED RETROSPOT WRAP,25,10.50
3,539993,2011-04-01 10:00:00,2.10,13313.0,22379,RECYCLING BAG RETROSPOT,5,10.50
4,539993,2011-04-01 10:00:00,1.25,13313.0,20718,RED RETROSPOT SHOPPER BAG,10,12.50
...,...,...,...,...,...,...,...,...
363572,581587,2011-09-12 12:50:00,0.85,12680.0,22613,PACK OF SPACEBOY NAPKINS,12,10.20
363573,581587,2011-09-12 12:50:00,2.10,12680.0,22899,CHILDRENS APRON DOLLY GIRL,6,12.60
363574,581587,2011-09-12 12:50:00,4.15,12680.0,23254,CHILDRENS CUTLERY DOLLY GIRL,4,16.60
363575,581587,2011-09-12 12:50:00,4.15,12680.0,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,16.60


In [109]:
df.dtypes

BasketID        int64
BasketDate     object
Sale          float64
CustomerID    float64
ProdID         object
ProdDescr      object
Qta             int64
TotSale       float64
dtype: object

In [110]:
# Parse the basket timestamp and treat the identifiers as opaque labels.
# The datetime unit is spelled out because pandas >= 2.0 rejects the
# unit-less 'datetime64' dtype in astype().
df = df.astype({'BasketDate': 'datetime64[ns]',
                'BasketID': 'object',
                'CustomerID': 'object'})

In [111]:
df['ProdDescr'].nunique()

3678

In [112]:
import webcolors
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Newer webcolors releases (24.6+) no longer expose the CSS3_NAMES_TO_HEX mapping
# and provide names() instead; fall back to the mapping on older releases.
if hasattr(webcolors, 'names'):
    colors = webcolors.names('css3')
else:
    colors = webcolors.CSS3_NAMES_TO_HEX.keys()

def lemmatize_descr(descr):
    """
    Normalises a product description.

    The description is lower-cased and tokenized; stop words, tokens that contain
    a colour name and tokens shorter than 3 characters are discarded. The remaining
    tokens are lemmatized, joined by single spaces and returned in upper case.
    """
    kept_tokens = []
    for token in word_tokenize(descr.lower()):
        if token in stop_words:
            continue
        # substring test: the token is dropped when a colour name occurs anywhere inside it
        if any(color in token for color in colors):
            continue
        if len(token) < 3:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens).upper()

In [113]:
df['ProdDescr'] = df['ProdDescr'].apply(lemmatize_descr)
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale
0,539993,2011-04-01 10:00:00,1.95,13313,22386,JUMBO BAG POLKADOT,10,19.50
1,539993,2011-04-01 10:00:00,0.42,13313,21499,POLKADOT WRAP,25,10.50
2,539993,2011-04-01 10:00:00,0.42,13313,21498,RETROSPOT WRAP,25,10.50
3,539993,2011-04-01 10:00:00,2.10,13313,22379,RECYCLING BAG RETROSPOT,5,10.50
4,539993,2011-04-01 10:00:00,1.25,13313,20718,RETROSPOT SHOPPER BAG,10,12.50
...,...,...,...,...,...,...,...,...
363572,581587,2011-09-12 12:50:00,0.85,12680,22613,PACK SPACEBOY NAPKIN,12,10.20
363573,581587,2011-09-12 12:50:00,2.10,12680,22899,CHILDRENS APRON DOLLY GIRL,6,12.60
363574,581587,2011-09-12 12:50:00,4.15,12680,23254,CHILDRENS CUTLERY DOLLY GIRL,4,16.60
363575,581587,2011-09-12 12:50:00,4.15,12680,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,16.60


In [114]:
df['ProdDescr'].nunique()

3259

## AprioriAll

In [31]:
dataset = [
    # sequence: list of events
    [['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],  # event: {timestamp : list of strings} or [list of strings]
    [['a'], ['c'], ['b', 'c']], 
    [['a', 'b'], ['d'], ['c'], ['b'], ['c']], 
    [['a'], ['c'], ['b'], ['c']]
]

In [161]:
dataset = [
    # sequence: list of events
    {1: ['a'],  # event: {timestamp : list of strings} or [list of strings]
     2: ['a', 'b', 'c'], 
     3: ['a', 'c'], 
     4: ['c']}, 
    {1: ['a'], 
     2: ['c'], 
     3: ['b', 'c']}, 
    {1: ['a', 'b'], 
     2: ['d'], 
     3: ['c'], 
     4: ['b'], 
     5: ['c']}, 
    {1: ['a'], 
     2: ['c'], 
     3: ['b'], 
     4: ['c']}
]

In [23]:
"""
Recursive method that checks if `subsequence` is a subsequence of `main_sequence`
"""
def is_subsequence(main_sequence, subsequence):
    """
    Checks if `subsequence` is a subsequence of `main_sequence`.

    Args:
        main_sequence: iterable of events (each an iterable of items); any iterable
                       is accepted, e.g. a list or the `.values()` view of a dict
        subsequence: iterable of itemsets that must be found, in order, in distinct
                     events of `main_sequence`

    Returns:
        True if every itemset of `subsequence` is contained in an event of
        `main_sequence`, each one in a later event than the previous one;
        an empty `subsequence` is always contained.
    """
    # A single shared iterator guarantees that each itemset is searched only in the
    # events that follow the match of the previous itemset (earliest-match scan).
    remaining_events = iter(main_sequence)
    for itemset in subsequence:
        wanted = set(itemset)
        # any() consumes the iterator up to and including the first matching event
        if not any(wanted.issubset(event) for event in remaining_events):
            return False
    return True

In [24]:
sequence = [['a'], ['b', 'c'], ['d'], ['a', 'e']]

In [25]:
is_subsequence(sequence, [['a'], ['b', 'c'], ['e']])

True

In [26]:
is_subsequence(sequence, [['a'], ['b', 'd']])

False

In [27]:
"""
Computes the length of the sequence (sum of the length of the contained itemsets)
"""
def sequence_length(sequence):
    """Returns the total number of items in `sequence`, summed over all of its itemsets."""
    total_items = 0
    for itemset in sequence:
        total_items += len(itemset)
    return total_items

In [28]:
sequence_length([['a'], ['b', 'c'], ['a'], ['b', 'c', 'd']])

7

In [165]:
"""
Computes the support of a sequence in a dataset provided time constraints
"""
def count_support(dataset, cand_seq, max_span=np.inf, min_gap=1, max_gap=np.inf):
    """
    Counts the sequences of `dataset` that contain `cand_seq` under the time constraints.

    Args:
        dataset: list of sequences; a sequence is either a dict {timestamp: list of items}
                 or a plain list of events (lists of items), in which case the event
                 positions 0, 1, 2, ... are used as timestamps
        cand_seq: candidate pattern as a list of itemsets (a dict {timestamp: itemset}
                  is also accepted and read in timestamp order)
        max_span: maximum allowed difference between the timestamps of the last and
                  the first matched event
        min_gap: minimum allowed difference between the timestamps of two consecutive
                 matched events
        max_gap: maximum allowed difference between the timestamps of two consecutive
                 matched events

    Returns:
        The number of sequences in which the pattern occurs at least once.

    Timestamps must be numeric (they are subtracted and compared with the constraints).
    With the default constraints and integer timestamps every occurrence is accepted,
    i.e. the result is the unconstrained support.
    """
    if isinstance(cand_seq, dict):
        cand_seq = [cand_seq[key] for key in sorted(cand_seq)]
    pattern = [set(itemset) for itemset in cand_seq]

    support = 0
    for seq in dataset:
        if isinstance(seq, dict):
            # events are matched in chronological order, whatever the insertion order
            timed_events = sorted(seq.items(), key=lambda pair: pair[0])
        else:
            timed_events = enumerate(seq)
        events = [(timestamp, set(items)) for timestamp, items in timed_events]
        if _occurs_with_constraints(events, pattern, max_span, min_gap, max_gap):
            support += 1
    return support


def _occurs_with_constraints(events, pattern, max_span, min_gap, max_gap):
    """
    Tells whether `pattern` (list of sets) occurs in `events` (list of
    (timestamp, set of items) pairs sorted by timestamp) while respecting
    max_span, min_gap and max_gap. Uses backtracking, because with an upper
    bound on the gaps the earliest match of an element is not always the right one.
    """
    if not pattern:
        return True
    n_events = len(events)
    # (pattern index, index of the previously matched event) pairs known to fail
    dead_ends = set()

    def extend(pattern_idx, prev_idx, start_time):
        if pattern_idx == len(pattern):
            return True
        if (pattern_idx, prev_idx) in dead_ends:
            return False
        prev_time = events[prev_idx][0]
        for idx in range(prev_idx + 1, n_events):
            timestamp, items = events[idx]
            if timestamp - prev_time > max_gap or timestamp - start_time > max_span:
                break  # events are sorted: every later event violates the bound too
            if timestamp - prev_time >= min_gap and pattern[pattern_idx] <= items:
                if extend(pattern_idx + 1, idx, start_time):
                    return True
        dead_ends.add((pattern_idx, prev_idx))
        return False

    for start_idx, (start_time, items) in enumerate(events):
        if pattern[0] <= items:
            if max_span != np.inf:
                # with a finite span a failure depends on where the occurrence started
                dead_ends.clear()
            if extend(1, start_idx, start_time):
                return True
    return False

In [133]:
count_support(dataset, [['b']])

4

In [129]:
count_support(dataset, [['a'], ['b', 'c']])

2

In [40]:
"""
Generates one candidate of length k from two candidates of length (k-1)
"""
def gen_cands_for_pair(cand1, cand2):
    """
    Joins two sequences of length (k-1) into one candidate of length k.

    The join is possible when `cand1` without its leftmost item equals `cand2`
    without its rightmost item. The candidate is `cand1` extended with the last
    item of `cand2`: as a new event if that item is alone in the last event of
    `cand2`, otherwise inside the last event of `cand1`.

    Args:
        cand1: sequence (list of lists of items) of length (k-1)
        cand2: sequence (list of lists of items) of length (k-1)

    Returns:
        The new candidate (a fresh list sharing no event list with the inputs),
        or [] if the two sequences cannot be joined.
    """
    # cand1 without its leftmost item
    if len(cand1[0]) == 1:
        cand1_rest = cand1[1:]
    else:
        cand1_rest = [cand1[0][1:]] + cand1[1:]
    # cand2 without its rightmost item
    if len(cand2[-1]) == 1:
        cand2_rest = cand2[:-1]
    else:
        cand2_rest = cand2[:-1] + [cand2[-1][:-1]]

    # if the remainders differ, the pair cannot be joined
    if cand1_rest != cand2_rest:
        return []

    new_cand = [list(itemset) for itemset in cand1]
    last_item = cand2[-1][-1]
    if len(cand2[-1]) == 1:
        new_cand.append([last_item])
    else:
        # append, not extend: an item is a whole string and must not be split into characters
        new_cand[-1].append(last_item)
    return new_cand

In [41]:
candA = [['a'], ['b', 'c'], ['d']]
candB = [['b', 'c'], ['d', 'e']]
gen_cands_for_pair(candA, candB)

[['a'], ['b', 'c'], ['d', 'e']]

In [42]:
candA = [['a'], ['b', 'c'], ['d']]
candC = [['b', 'c'], ['d'], ['e']]
gen_cands_for_pair(candA, candC)

[['a'], ['b', 'c'], ['d'], ['e']]

In [43]:
candA = [['a'], ['b', 'c'], ['d']]
candD = [['a'], ['b', 'c'], ['e']]
gen_cands_for_pair(candA, candD)

[]

In [44]:
"""
Generates the set of candidates of length k from the set of frequent sequences with length (k-1)
"""
def gen_cands(last_lvl_cands):
    """
    Builds the candidates of length k from the frequent sequences of length (k-1).

    For k == 2 every pair of frequent items is combined both inside one event
    (items in increasing order) and as two consecutive events (every ordered pair,
    including an item followed by itself). For larger k every ordered pair of
    sequences is joined with `gen_cands_for_pair` and the successful joins are
    returned sorted.
    """
    k = sequence_length(last_lvl_cands[0]) + 1
    if k != 2:
        joined = [gen_cands_for_pair(left, right)
                  for left in last_lvl_cands
                  for right in last_lvl_cands]
        cands = [cand for cand in joined if cand != []]
        cands.sort()
        return cands

    items = [item for seq in last_lvl_cands for event in seq for item in event]
    same_event = [[[first, second]] for first in items for second in items if second > first]
    consecutive_events = [[[first], [second]] for first in items for second in items]
    return same_event + consecutive_events

Let's assume we know the frequent sequences of level 2:

In [45]:
last_lvl_freq_patterns = [
    [['a', 'b']], 
    [['b', 'c']], 
    [['a'], ['b']], 
    [['a'], ['c']], 
    [['b'], ['c']], 
    [['c'], ['b']], 
    [['c'], ['c']]
]

Then we can generate the candidates for level 3:

In [46]:
new_cands = gen_cands(last_lvl_freq_patterns)
new_cands

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['a', 'b', 'c']],
 [['b'], ['c'], ['b']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['b']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [47]:
"""
Computes all direct subsequence for a given sequence.
A direct subsequence is any sequence that originates from deleting exactly one item from any event in the original sequence.
"""
def gen_direct_subsequences(sequence):
    """
    Lists all direct subsequences of `sequence`, i.e. the sequences obtained by
    deleting exactly one item from one of its events. An event that holds a single
    item disappears entirely. Every returned sequence is an independent deep copy.
    """
    shortened = []
    for pos, itemset in enumerate(sequence):
        before, after = sequence[:pos], sequence[pos + 1:]
        if len(itemset) == 1:
            # deleting the only item deletes the whole event
            variants = [before + after]
        else:
            variants = [before + [itemset[:drop] + itemset[drop + 1:]] + after
                        for drop in range(len(itemset))]
        shortened.extend(copy.deepcopy(variant) for variant in variants)
    return shortened

"""
Prunes the set of candidates generated for length k given all frequent sequence of level (k-1)
"""
def prune_cands(cands_last_lvl, cands_gen):
    """
    Keeps only the generated candidates whose direct subsequences are all
    contained in `cands_last_lvl` (the frequent sequences of the previous level).
    """
    survivors = []
    for cand in cands_gen:
        for shorter in gen_direct_subsequences(cand):
            if shorter not in cands_last_lvl:
                break  # one missing subsequence is enough to discard the candidate
        else:
            survivors.append(cand)
    return survivors

We apply this to the example dataset:

In [48]:
cands_pruned = prune_cands(last_lvl_freq_patterns, new_cands)
cands_pruned

[[['a'], ['b'], ['c']],
 [['a'], ['b', 'c']],
 [['a'], ['c'], ['b']],
 [['a'], ['c'], ['c']],
 [['a', 'b'], ['c']],
 [['b'], ['c'], ['c']],
 [['b', 'c'], ['c']],
 [['c'], ['b'], ['c']],
 [['c'], ['b', 'c']],
 [['c'], ['c'], ['b']],
 [['c'], ['c'], ['c']]]

In [62]:
def check_cands(dataset, cands_pruned, min_sup, max_span=np.inf, min_gap=1, max_gap=np.inf):
    """
    Counts the support of every candidate and keeps the frequent ones.

    Args:
        dataset: list of sequences in which the candidates are counted
        cands_pruned: list of candidate sequences
        min_sup: minimum count a candidate needs to be kept
        max_span, min_gap, max_gap: time constraints, forwarded unchanged
                                    to `count_support` for every candidate

    Returns:
        A list of tuples (candidate, count) with count >= min_sup, in the
        order of `cands_pruned`.
    """
    cands_counts = ((cand, count_support(dataset, cand, max_span, min_gap, max_gap))
                    for cand in cands_pruned)
    return [(cand, count) for cand, count in cands_counts if count >= min_sup]

In [63]:
result_lvl = check_cands(dataset, cands_pruned, 2)
result_lvl

[([['a'], ['b'], ['c']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['c'], ['c']], 4),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2)]

In [166]:
"""
The AprioriAll algorithm with time constraints. Computes the frequent sequences in a sequence dataset.

Args:
    dataset: a list of sequences, for which the frequent (sub-)sequences are computed
    min_sup: the minimum support that makes a sequence frequent
    max_span: this constraint specifies the maximum allowed time difference in days between the latest 
              and the earliest occurrences of events in the entire sequence
    min_gap: this constraint specifies the minimum allowed time difference in days between two 
             consecutive elements of the pattern instance
    max_gap: this constraint specifies the maximum allowed time difference in days between two 
             consecutive elements of the pattern instance
    verbose: if True (or 1), the results of each level are printed; if greater than 1, the 
             candidates generated and pruned at each level are printed as well

Returns:
    A list of tuples (s, c), where s is a frequent sequence and c is the count for that sequence
"""
def aprioriall(dataset, min_sup, max_span=np.inf, min_gap=1, max_gap=np.inf, verbose=False):
    """
    Level-wise mining of the frequent sequences of `dataset`.

    Args:
        dataset: list of sequences; a sequence is a list of events or a dict
                 {timestamp: event}, an event being a list of items
        min_sup: minimum count that makes a sequence frequent
        max_span, min_gap, max_gap: time constraints, forwarded to `check_cands`
                                    when the candidates of length >= 2 are counted
        verbose: if greater than 0 the frequent sequences of each level are printed;
                 if greater than 1 the generated and the pruned candidates of each
                 level are printed as well

    Returns:
        A list of tuples (s, c), where s is a frequent sequence and c its count,
        sorted by decreasing count.
    """
    items = sorted({item for sequence in dataset
                    for event in (sequence.values() if isinstance(sequence, dict) else sequence)
                    for item in event})
    # level 1: single items, counted without time constraints
    single_item_counts = [(seq, count_support(dataset, seq)) for seq in ([[item]] for item in items)]
    levels = [[(seq, count) for seq, count in single_item_counts if count >= min_sup]]
    if verbose > 0:
        print('Result, lvl 1: ' + str(levels[0]))

    # Pruning by *all* direct subsequences assumes that every subsequence of a frequent
    # sequence is frequent. A finite max_gap breaks this for non-contiguous subsequences
    # (dropping a middle element widens a gap), so the pruning step is skipped then and
    # the candidates are filtered by counting only.
    can_prune = max_gap == np.inf

    k = 1
    while levels[-1]:
        # 1. candidate generation
        cands_last_lvl = [seq for seq, _ in levels[-1]]
        cands_gen = gen_cands(cands_last_lvl)
        # 2. candidate pruning (using a "containsall" subsequences)
        cands_pruned = prune_cands(cands_last_lvl, cands_gen) if can_prune else cands_gen
        # 3. candidate checking
        result_lvl = check_cands(dataset, cands_pruned, min_sup, max_span, min_gap, max_gap)
        if verbose > 0:
            print('Result, lvl ' + str(k + 1) + ': ' + str(result_lvl))
            if verbose > 1:
                print('Candidates generated, lvl ' + str(k + 1) + ': ' + str(cands_gen))
                print('Candidates pruned, lvl ' + str(k + 1) + ': ' + str(cands_pruned))
        levels.append(result_lvl)
        k += 1

    # the last level is the empty one that stopped the loop; flatten the others
    overall = [entry for level in levels[:-1] for entry in level]
    overall.sort(key=lambda entry: entry[1], reverse=True)
    return overall

In [167]:
aprioriall(dataset, min_sup=2, verbose=2)

Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]
Result, lvl 2: [([['a', 'b']], 2), ([['b', 'c']], 2), ([['a'], ['b']], 4), ([['a'], ['c']], 4), ([['b'], ['c']], 3), ([['c'], ['b']], 3), ([['c'], ['c']], 4)]
Candidates generated, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Candidates pruned, lvl 2: [[['a', 'b']], [['a', 'c']], [['b', 'c']], [['a'], ['a']], [['a'], ['b']], [['a'], ['c']], [['b'], ['a']], [['b'], ['b']], [['b'], ['c']], [['c'], ['a']], [['c'], ['b']], [['c'], ['c']]]
Result, lvl 3: [([['a'], ['b'], ['c']], 3), ([['a'], ['b', 'c']], 2), ([['a'], ['c'], ['b']], 3), ([['a'], ['c'], ['c']], 4), ([['a', 'b'], ['c']], 2), ([['b'], ['c'], ['c']], 2), ([['c'], ['b'], ['c']], 2)]
Candidates generated, lvl 3: [[['a'], ['b'], ['c']], [['a'], ['b', 'c']], [['a'], ['c'], ['b']], [['a'], ['c'], ['c']], [['a', 'b'], ['c']], [['a'

[([['a']], 4),
 ([['b']], 4),
 ([['c']], 4),
 ([['a'], ['b']], 4),
 ([['a'], ['c']], 4),
 ([['c'], ['c']], 4),
 ([['a'], ['c'], ['c']], 4),
 ([['b'], ['c']], 3),
 ([['c'], ['b']], 3),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['c'], ['b']], 3),
 ([['a', 'b']], 2),
 ([['b', 'c']], 2),
 ([['a'], ['b', 'c']], 2),
 ([['a', 'b'], ['c']], 2),
 ([['b'], ['c'], ['c']], 2),
 ([['c'], ['b'], ['c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

In [168]:
"""
Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence
"""
def filter_closed(result):
    """
    Reduces `result` (list of (sequence, count) tuples) in place to the closed
    sequences: an entry is removed when another, different sequence of `result`
    contains it and has the same count. Nothing is returned.
    """
    entries = list(result)

    def is_absorbed(sequence, count):
        # True when a different sequence with the same count contains `sequence`
        for other, other_count in entries:
            if is_subsequence(other, sequence) and other_count == count and sequence != other:
                return True
        return False

    result[:] = [entry for entry in entries if not is_absorbed(entry[0], entry[1])]

In [169]:
result = aprioriall(dataset, min_sup=2, verbose=False)
filter_closed(result)
result

[([['a'], ['b']], 4),
 ([['a'], ['c'], ['c']], 4),
 ([['a'], ['b'], ['c']], 3),
 ([['a'], ['c'], ['b']], 3),
 ([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

In [170]:
"""
Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence
"""
def filter_maximal(result):
    """
    Reduces `result` (list of (sequence, count) tuples) in place to the maximal
    sequences: an entry is removed when another, different sequence of `result`
    contains it, whatever the counts. Nothing is returned.
    """
    entries = list(result)

    def has_supersequence(sequence):
        # True when a different sequence of the list contains `sequence`
        for other, _ in entries:
            if is_subsequence(other, sequence) and sequence != other:
                return True
        return False

    result[:] = [entry for entry in entries if not has_supersequence(entry[0])]

In [171]:
result = aprioriall(dataset, min_sup=2, verbose=False)
filter_maximal(result)
result

[([['a'], ['b', 'c']], 2),
 ([['a'], ['c'], ['b'], ['c']], 2),
 ([['a', 'b'], ['c'], ['c']], 2)]

### Supermarket dataset

In [172]:
df.sort_values('BasketDate', inplace=True)
df['BasketDayOfYear'] = df['BasketDate'].dt.dayofyear
df

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,ProdID,ProdDescr,Qta,TotSale,BasketDayOfYear
20766,542776,2011-01-02 08:23:00,2.95,15240,22083,PAPER CHAIN KIT RETROSPOT,12,35.40,2
20765,542776,2011-01-02 08:23:00,4.65,15240,22835,HOT WATER BOTTLE POORLY,4,18.60,2
20764,542776,2011-01-02 08:23:00,10.95,15240,21843,RETROSPOT CAKE,1,10.95,2
20756,542776,2011-01-02 08:23:00,1.25,15240,21671,SPOT CERAMIC DRAWER KNOB,48,60.00,2
20757,542776,2011-01-02 08:23:00,1.25,15240,21668,STRIPE CERAMIC DRAWER KNOB,12,15.00,2
...,...,...,...,...,...,...,...,...,...
254280,570876,2011-12-10 17:19:00,1.55,16085,46000M,POLYESTER FILLER PAD,1,1.55,344
254288,570876,2011-12-10 17:19:00,9.95,16085,23397,FOOT STOOL HOME SWEET HOME,1,9.95,344
254289,570876,2011-12-10 17:19:00,2.95,16085,22470,HEART WICKER LARGE,3,8.85,344
254292,570876,2011-12-10 17:19:00,3.75,16085,23083,SET PAPER TABLE LANTERN STAR,2,7.50,344


In [173]:
baskets_sequences = df.groupby('CustomerID').apply(lambda customer: customer.groupby('BasketDayOfYear')['ProdDescr'].apply(list).to_dict())
baskets_sequences

CustomerID
12347.0    {26: ['PACK MUSHROOM CAKE CASE', 'BOX ASSORTED...
12348.0    {25: ['PACK SKULL TISSUE', 'PACK PAISLEY TISSU...
12349.0    {325: ['VINTAGE DOILY DELUXE SEWING KIT', 'ENA...
12350.0    {33: ['CALCULATOR', 'PLASTER TIN SPACEBOY', 'U...
12352.0    {3: ['DELUXE SEWING KIT', 'BAKING SET PIECE RE...
                                 ...                        
18280.0    {184: ['KING CHOICE TEA CADDY', 'ALARM CLOCK B...
18281.0    {340: ['ROBOT BIRTHDAY CARD', 'PENNY FARTHING ...
18282.0    {43: ['REGENCY TIER', 'REGENCY MILK JUG', 'REG...
18283.0    {23: ['PLASTER TIN SPACEBOY', 'EIGHT PIECE SNA...
18287.0    {142: ['GINGHAM CAT SCARF', 'STRAWBERRY BATH S...
Length: 4206, dtype: object

In [174]:
baskets_sequences = baskets_sequences.tolist()
baskets_sequences[:1]

[{26: ['PACK MUSHROOM CAKE CASE',
   'BOX ASSORTED COLOUR TEASPOON',
   'CALCULATOR',
   'TOOTHPASTE TUBE PEN',
   'SET TIN VINTAGE BATHROOM',
   'TOADSTOOL LED NIGHT LIGHT',
   'ALARM CLOCK BAKELIKE',
   'ALARM CLOCK BAKELIKE',
   'AIRLINE BAG VINTAGE JET SET',
   'SANDWICH BATH SPONGE',
   'MINI LADLE LOVE HEART',
   'REGENCY TIER',
   'DOG PICTURE PLAYING CARD',
   'ALARM CLOCK BAKELIKE',
   'SMALL HEART MEASURING SPOON',
   'RETROSPOT OVEN GLOVE DOUBLE',
   'PACK SPACEBOY CAKE CASE',
   'RETROSPOT OVEN GLOVE',
   'TEATIME FAIRY CAKE CASE',
   'TEA TIME OVEN GLOVE',
   'NEW BAROQUE CANDLESTICK CANDLE',
   'SWEETHEART FAIRY CAKE CASE',
   'AIRLINE BAG VINTAGE JET SET',
   'WOODLAND CHARLOTTE BAG',
   'CANDELABRA TLIGHT HOLDER',
   'SET RETROSPOT TEA TOWEL',
   'NEW BAROQUECANDLESTICK CANDLE',
   'ALARM CLOCK BAKELIKE',
   'ALARM CLOCK BAKELIKE'],
  39: ['AIRLINE BAG VINTAGE WORLD CHAMPION',
   'TRIPLE HOOK ANTIQUE ROSE',
   'ALARM CLOCK BAKELIKE',
   'WOODLAND CHARLOTTE BAG',
   'WOO

In [None]:
result = aprioriall(baskets_sequences, min_sup=200, max_span=np.inf, min_gap=1, max_gap=np.inf, verbose=False)
filter_closed(result)
result

In [None]:
result = aprioriall(baskets_sequences, min_sup=200, max_span=np.inf, min_gap=1, max_gap=np.inf, verbose=False)
filter_maximal(result)
result