In [2]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Transactions Dataset

In [3]:
# Read only the first 2000 rows of the basket/product table for a quick look.
df_tr = pd.read_csv(
    "./../data/small_transactions.csv",
    sep=',',
    skipinitialspace=True,
    nrows=2000,
)
df_tr.head()

Unnamed: 0,SCONTRINO_ID,COD_MKT_ID
0,2558064013053,1580
1,2558064013053,1661
2,2558064013053,2068
3,2558064013053,2556
4,2558064013053,2650


##### This is a relational representation: each row of the dataframe holds a basket ID and one product bought. For our analysis we need the transactions, i.e. the list of products bought in each basket.

In [4]:
# Collapse the one-row-per-product table into one list of product codes per basket ID.
products_by_basket = df_tr.groupby('SCONTRINO_ID')['COD_MKT_ID']
transactions = products_by_basket.apply(list)
transactions.head()

SCONTRINO_ID
2558064013053                 [1580, 1661, 2068, 2556, 2650, 4225]
2558064013054    [437, 1278, 1614, 2089, 2243, 2245, 2443, 2551...
2558064013055                         [151, 595, 2650, 4600, 4872]
2558064013056             [142, 437, 2499, 2515, 3458, 3675, 4044]
2558064013057                                          [437, 3087]
Name: COD_MKT_ID, dtype: object

In [5]:
# Plain NumPy object array holding one product list per basket.
baskets = transactions.to_numpy()

In [6]:
baskets[0:5]

array([list([1580, 1661, 2068, 2556, 2650, 4225]),
       list([437, 1278, 1614, 2089, 2243, 2245, 2443, 2551, 3448, 6172]),
       list([151, 595, 2650, 4600, 4872]),
       list([142, 437, 2499, 2515, 3458, 3675, 4044]), list([437, 3087])],
      dtype=object)

# Adult dataset: From tabular to Transactions

In [7]:
# Fetch the "income" configuration of the Adult dataset once and reuse it for
# both splits, instead of calling load_dataset twice.
adult = load_dataset("mstz/adult", "income")

train_dataset = adult["train"].to_pandas()
test_dataset = adult["test"].to_pandas()

# Keep the target column aside so it does not end up among the mined items.
train_labels = train_dataset["over_threshold"]
test_labels = test_dataset["over_threshold"]

train_dataset = train_dataset.drop("over_threshold", axis="columns")
test_dataset = test_dataset.drop("over_threshold", axis="columns")

# Stack both splits row-wise. Each split keeps its own row labels, so index
# values repeat in the combined frame.
df = pd.concat((train_dataset, test_dataset), axis="rows")

In [38]:
df

Unnamed: 0,age,capital_gain,capital_loss,education,final_weight,hours_worked_per_week,marital_status,native_country,occupation,race,relationship,is_male,workclass
0,43,0.0,0.0,10,34278,35,Married-civ-spouse,United-States,Sales,White,Husband,True,Private
1,23,0.0,0.0,3,244698,35,Never-married,Mexico,Farming-fishing,White,Other-relative,True,Private
2,39,0.0,0.0,10,118286,40,Married-civ-spouse,United-States,Sales,Black,Husband,True,Private
3,35,0.0,0.0,9,126675,46,Divorced,?,Craft-repair,White,Not-in-family,True,Private
4,66,0.0,0.0,13,28367,99,Married-civ-spouse,United-States,Priv-house-serv,White,Other-relative,True,Private
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12206,35,0.0,0.0,11,216256,60,Never-married,United-States,Other-service,White,Not-in-family,True,Private
12207,38,0.0,0.0,9,159179,40,Married-civ-spouse,United-States,Craft-repair,White,Husband,True,Private
12208,42,0.0,0.0,13,37997,40,Married-civ-spouse,United-States,Exec-managerial,White,Husband,True,Federal-gov
12209,29,0.0,0.0,9,183061,48,Married-civ-spouse,United-States,Craft-repair,Amer-Indian-Eskimo,Husband,True,Private


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 12210
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    48842 non-null  int64  
 1   capital_gain           48842 non-null  float64
 2   capital_loss           48842 non-null  float64
 3   education              48842 non-null  int8   
 4   final_weight           48842 non-null  int64  
 5   hours_worked_per_week  48842 non-null  int64  
 6   marital_status         48842 non-null  object 
 7   native_country         48842 non-null  object 
 8   occupation             48842 non-null  object 
 9   race                   48842 non-null  object 
 10  relationship           48842 non-null  object 
 11  is_male                48842 non-null  bool   
 12  workclass              48842 non-null  object 
dtypes: bool(1), float64(2), int64(3), int8(1), object(6)
memory usage: 4.6+ MB


In [39]:
# Re-type every string and boolean column as a pandas categorical.
categorical_columns = df.select_dtypes(include=["object", "bool"]).columns
df = df.astype(dict.fromkeys(categorical_columns, "category"))

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48842 entries, 0 to 12210
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   age                    48842 non-null  int64   
 1   capital_gain           48842 non-null  float64 
 2   capital_loss           48842 non-null  float64 
 3   education              48842 non-null  int8    
 4   final_weight           48842 non-null  int64   
 5   hours_worked_per_week  48842 non-null  int64   
 6   marital_status         48842 non-null  category
 7   native_country         48842 non-null  category
 8   occupation             48842 non-null  category
 9   race                   48842 non-null  category
 10  relationship           48842 non-null  category
 11  is_male                48842 non-null  category
 12  workclass              48842 non-null  category
dtypes: category(7), float64(2), int64(3), int8(1)
memory usage: 2.6 MB


# From tabular to transaction dataset

In [41]:
numerical_columns = ['age', 'capital_gain', 'capital_loss', 'education', 'final_weight', 'hours_worked_per_week']
N_BINS = 3  # number of equal-width intervals each numerical column is cut into

# Discretize every numerical column into N_BINS left-closed intervals and keep
# the interval label as a plain string, so each interval becomes one item.
discretized = {
    col + '_disc': pd.cut(df[col], N_BINS, right=False).astype('str')
    for col in numerical_columns
}

# Replace the raw numerical columns with their discretized versions in a single
# rebinding, so `df` is never left half-converted if one column fails.
df = df.drop(columns=numerical_columns).assign(**discretized)

In [42]:
# One list of attribute values per row of the discretized table.
baskets = df.to_numpy().tolist()

In [43]:
df

Unnamed: 0,marital_status,native_country,occupation,race,relationship,is_male,workclass,age_disc,capital_gain_disc,capital_loss_disc,education_disc,final_weight_disc,hours_worked_per_week_disc
0,Married-civ-spouse,United-States,Sales,White,Husband,True,Private,"[41.333, 65.667)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
1,Never-married,Mexico,Farming-fishing,White,Other-relative,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[1.0, 6.0)","[12285.0, 504990.0)","[33.667, 66.333)"
2,Married-civ-spouse,United-States,Sales,Black,Husband,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
3,Divorced,?,Craft-repair,White,Not-in-family,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
4,Married-civ-spouse,United-States,Priv-house-serv,White,Other-relative,True,Private,"[65.667, 90.073)","[0.0, 33333.0)","[0.0, 1452.0)","[11.0, 16.015)","[12285.0, 504990.0)","[66.333, 99.098)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12206,Never-married,United-States,Other-service,White,Not-in-family,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[11.0, 16.015)","[12285.0, 504990.0)","[33.667, 66.333)"
12207,Married-civ-spouse,United-States,Craft-repair,White,Husband,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
12208,Married-civ-spouse,United-States,Exec-managerial,White,Husband,True,Federal-gov,"[41.333, 65.667)","[0.0, 33333.0)","[0.0, 1452.0)","[11.0, 16.015)","[12285.0, 504990.0)","[33.667, 66.333)"
12209,Married-civ-spouse,United-States,Craft-repair,Amer-Indian-Eskimo,Husband,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"


In [44]:
# One-hot encode every column. prefix_sep='%' yields dummy names of the form
# "<column>%<value>" (e.g. "race%White"); filter_rows_by_itemset relies on this
# when it splits item names on '%'.
df_encoded = pd.get_dummies(df, prefix_sep='%')

In [45]:
df_encoded

Unnamed: 0,marital_status%Divorced,marital_status%Married-AF-spouse,marital_status%Married-civ-spouse,marital_status%Married-spouse-absent,marital_status%Never-married,marital_status%Separated,marital_status%Widowed,native_country%?,native_country%Cambodia,native_country%Canada,...,"capital_loss_disc%[2904.0, 4360.356)","education_disc%[1.0, 6.0)","education_disc%[11.0, 16.015)","education_disc%[6.0, 11.0)","final_weight_disc%[12285.0, 504990.0)","final_weight_disc%[504990.0, 997695.0)","final_weight_disc%[997695.0, 1491878.115)","hours_worked_per_week_disc%[1.0, 33.667)","hours_worked_per_week_disc%[33.667, 66.333)","hours_worked_per_week_disc%[66.333, 99.098)"
0,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,True,False
1,False,False,False,False,True,False,False,False,False,False,...,False,True,False,False,True,False,False,False,True,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,True,False
3,True,False,False,False,False,False,False,True,False,False,...,False,False,False,True,True,False,False,False,True,False
4,False,False,True,False,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12206,False,False,False,False,True,False,False,False,False,False,...,False,False,True,False,True,False,False,False,True,False
12207,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,True,False
12208,False,False,True,False,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,True,False
12209,False,False,True,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,True,False


# Apriori

In [46]:
from mlxtend.frequent_patterns import apriori

# Mine itemsets of at most three items whose support is at least min_support.
min_support = 0.7
frequent_itemsets = apriori(
    df_encoded,
    min_support=min_support,
    max_len=3,
    use_colnames=True,
)

In [47]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.897424,(native_country%United-States)
1,0.855043,(race%White)
2,0.99482,"(capital_gain_disc%[0.0, 33333.0))"
3,0.95594,"(capital_loss_disc%[0.0, 1452.0))"
4,0.989415,"(final_weight_disc%[12285.0, 504990.0))"
5,0.803837,"(hours_worked_per_week_disc%[33.667, 66.333))"
6,0.788113,"(native_country%United-States, race%White)"
7,0.892797,"(capital_gain_disc%[0.0, 33333.0), native_coun..."
8,0.857274,"(native_country%United-States, capital_loss_di..."
9,0.888498,"(native_country%United-States, final_weight_di..."


## Extract transactions with frequent itemsets

In [48]:
def filter_rows_by_itemset(df, itemset):
    """Return the rows of ``df`` that contain every item of ``itemset``.

    Parameters
    ----------
    df : pandas.DataFrame
        The table the one-hot encoding was built from (one column per attribute).
    itemset : iterable of str
        Items named ``"<column>%<value>"``; each item is split on the first
        ``'%'`` into a column name and the value that column must have.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` matching all items. An empty itemset yields an
        empty ``pd.DataFrame()``.

    Raises
    ------
    ValueError
        If an item does not contain ``'%'``.
    KeyError
        If an item names a column that is not in ``df``.
    """
    items = list(itemset)
    if not items:
        return pd.DataFrame()

    condition = None
    for item in items:
        column_name, value = item.split('%', 1)

        # The value parsed from an item name is always a string, so compare it
        # with the string form of the column. Otherwise non-string columns
        # (e.g. a boolean categorical, item "is_male%True") never match.
        matches = df[column_name].astype(str) == value
        condition = matches if condition is None else condition & matches

    return df[condition]

In [50]:
# Take the itemset stored at position 34 and list the rows that contain it.
itemset = frequent_itemsets['itemsets'].iloc[34]

matching_rows = filter_rows_by_itemset(df, itemset)
matching_rows

Unnamed: 0,marital_status,native_country,occupation,race,relationship,is_male,workclass,age_disc,capital_gain_disc,capital_loss_disc,education_disc,final_weight_disc,hours_worked_per_week_disc
0,Married-civ-spouse,United-States,Sales,White,Husband,True,Private,"[41.333, 65.667)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
1,Never-married,Mexico,Farming-fishing,White,Other-relative,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[1.0, 6.0)","[12285.0, 504990.0)","[33.667, 66.333)"
2,Married-civ-spouse,United-States,Sales,Black,Husband,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
3,Divorced,?,Craft-repair,White,Not-in-family,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
5,Divorced,United-States,Adm-clerical,White,Not-in-family,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12205,Widowed,South,Exec-managerial,Asian-Pac-Islander,Unmarried,False,Self-emp-not-inc,"[41.333, 65.667)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
12206,Never-married,United-States,Other-service,White,Not-in-family,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[11.0, 16.015)","[12285.0, 504990.0)","[33.667, 66.333)"
12207,Married-civ-spouse,United-States,Craft-repair,White,Husband,True,Private,"[17.0, 41.333)","[0.0, 33333.0)","[0.0, 1452.0)","[6.0, 11.0)","[12285.0, 504990.0)","[33.667, 66.333)"
12208,Married-civ-spouse,United-States,Exec-managerial,White,Husband,True,Federal-gov,"[41.333, 65.667)","[0.0, 33333.0)","[0.0, 1452.0)","[11.0, 16.015)","[12285.0, 504990.0)","[33.667, 66.333)"


# fpgrowth

In [51]:
from mlxtend.frequent_patterns import fpgrowth

# Same threshold and itemset-size cap as the apriori run, so the two results
# are directly comparable. Pass the variable rather than repeating the literal,
# so changing min_support actually changes the mining.
min_support = 0.7
frequent_itemsets2 = fpgrowth(df_encoded, min_support=min_support, use_colnames=True, max_len=3)

In [52]:
frequent_itemsets2

Unnamed: 0,support,itemsets
0,0.99482,"(capital_gain_disc%[0.0, 33333.0))"
1,0.989415,"(final_weight_disc%[12285.0, 504990.0))"
2,0.95594,"(capital_loss_disc%[0.0, 1452.0))"
3,0.897424,(native_country%United-States)
4,0.855043,(race%White)
5,0.803837,"(hours_worked_per_week_disc%[33.667, 66.333))"
6,0.984255,"(capital_gain_disc%[0.0, 33333.0), final_weigh..."
7,0.95076,"(capital_gain_disc%[0.0, 33333.0), capital_los..."
8,0.945723,"(capital_loss_disc%[0.0, 1452.0), final_weight..."
9,0.940563,"(capital_loss_disc%[0.0, 1452.0), capital_gain..."


Since FP-Growth doesn't need to generate candidate itemsets explicitly, it can be orders of magnitude faster than the alternative Apriori algorithm. The following cells compare the performance of the Apriori algorithm with that of FP-Growth.

In [102]:
%timeit -n 100 -r 10 apriori(df_encoded, min_support=0.7)

3.47 ms ± 229 μs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [103]:
# low_memory=True makes apriori use an iterator to search for combinations above min_support.
# It should only be used for large datasets when memory resources are limited,
# because this implementation is approx. 3-6x slower than the default.
%timeit -n 100 -r 10 apriori(df_encoded, min_support=0.7, low_memory=True)

124 ms ± 598 μs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [104]:
%timeit -n 100 -r 10 fpgrowth(df_encoded, min_support=0.7)

98.4 ms ± 911 μs per loop (mean ± std. dev. of 10 runs, 100 loops each)


The efficiency of the algorithm depends on the compaction factor of the dataset.

If the tree is bushy, the algorithm does not work well: the number of subproblems that need to be solved increases considerably.

In [118]:
# Every column of the one-hot encoded table is one distinct item.
n_unique_items = df_encoded.shape[1]
print('Number of unique items: ' + str(n_unique_items))

Number of unique items: 104


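The FP-Growth algorithm builds a prefix tree (the FP-tree) in which each node represents an item and each path from the root represents a set of items that occur together in some transactions. With 104 columns, the tree may become very large and difficult to navigate, slowing down the algorithm.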

In [65]:
%timeit -n 100 -r 10 apriori(df_encoded.iloc[0:400, 0:30], min_support=0.7)

381 μs ± 135 μs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [66]:
%timeit -n 100 -r 10 fpgrowth(df_encoded.iloc[0:400,0:30], min_support=0.7)

595 μs ± 126 μs per loop (mean ± std. dev. of 10 runs, 100 loops each)


# Extract Rules

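A high support of the consequent can simply mean that the consequent is very popular on its own, not that it is bought together with the antecedent, so there may be no real correlation. 
Confidence: how many times you find the consequent, given the antecedents, i.e. $\text{confidence}(A \rightarrow C) = \text{support}(A \cup C) / \text{support}(A)$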
    
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

![Screenshot%202024-10-25%20at%2009.28.11.png](attachment:Screenshot%202024-10-25%20at%2009.28.11.png)

![Screenshot%202024-10-25%20at%2009.27.54.png](attachment:Screenshot%202024-10-25%20at%2009.27.54.png)

In [78]:
# Derive association rules from the frequent itemsets mined with apriori.
from mlxtend.frequent_patterns import association_rules

# `metric` names the rule measure that `min_threshold` is applied to; here only
# rules with a confidence of at least 0.7 are kept. The result also reports
# lift, leverage, conviction and zhangs_metric for every rule.
# NOTE(review): the saved output of `rules` shows item names with '_' separators
# (e.g. race_White) while frequent_itemsets uses '%' -- the output looks stale;
# re-run after a kernel restart to confirm.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

In [79]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(race_White),(native_country_United-States),0.855043,0.897424,0.788113,0.921723,1.027076,0.020776,1.310420,0.181863
1,(native_country_United-States),(race_White),0.897424,0.855043,0.788113,0.878194,1.027076,0.020776,1.190066,0.257004
2,(native_country_United-States),"(capital_gain_disc_[0.0, 33333.0))",0.897424,0.994820,0.892797,0.994844,1.000024,0.000021,1.004639,0.000234
3,"(capital_gain_disc_[0.0, 33333.0))",(native_country_United-States),0.994820,0.897424,0.892797,0.897446,1.000024,0.000021,1.000210,0.004641
4,(native_country_United-States),"(capital_loss_disc_[0.0, 1452.0))",0.897424,0.955940,0.857274,0.955261,0.999290,-0.000609,0.984833,-0.006877
...,...,...,...,...,...,...,...,...,...,...
113,"(final_weight_disc_[12285.0, 504990.0), hours_...","(capital_loss_disc_[0.0, 1452.0))",0.795258,0.955940,0.757811,0.952912,0.996833,-0.002408,0.935700,-0.015282
114,"(capital_loss_disc_[0.0, 1452.0), hours_worked...","(final_weight_disc_[12285.0, 504990.0))",0.766103,0.989415,0.757811,0.989176,0.999759,-0.000183,0.977964,-0.001030
115,"(final_weight_disc_[12285.0, 504990.0))","(capital_loss_disc_[0.0, 1452.0), hours_worked...",0.989415,0.766103,0.757811,0.765918,0.999759,-0.000183,0.999211,-0.022272
116,"(capital_loss_disc_[0.0, 1452.0))","(final_weight_disc_[12285.0, 504990.0), hours_...",0.955940,0.795258,0.757811,0.792739,0.996833,-0.002408,0.987847,-0.067263


In [81]:
# Number of items on each side of every rule.
rules["antecedent_len"] = rules["antecedents"].apply(len)
rules["consequent_len"] = rules["consequents"].apply(len)

In [85]:
# Keep rules with at least two antecedent items, confidence above 0.75 and lift above 1.
has_compound_antecedent = rules['antecedent_len'] >= 2
is_confident = rules['confidence'] > 0.75
has_positive_lift = rules['lift'] > 1
rules[has_compound_antecedent & is_confident & has_positive_lift]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric,antecedent_len,consequent_len
29,"(race_White, capital_gain_disc_[0.0, 33333.0))",(native_country_United-States),0.850477,0.897424,0.783752,0.921544,1.026876,0.020513,1.307423,0.175041,2,1
30,"(native_country_United-States, capital_gain_di...",(race_White),0.892797,0.855043,0.783752,0.877861,1.026686,0.020372,1.18682,0.242464,2,1
35,"(race_White, capital_loss_disc_[0.0, 1452.0))",(native_country_United-States),0.816019,0.897424,0.751218,0.920589,1.025813,0.018903,1.291708,0.136769,2,1
36,"(native_country_United-States, capital_loss_di...",(race_White),0.857274,0.855043,0.751218,0.876287,1.024845,0.018212,1.171719,0.169858,2,1
40,"(final_weight_disc_[12285.0, 504990.0), native...",(race_White),0.888498,0.855043,0.782237,0.880404,1.02966,0.022533,1.212055,0.258344,2,1
41,"(final_weight_disc_[12285.0, 504990.0), race_W...",(native_country_United-States),0.847877,0.897424,0.782237,0.922583,1.028034,0.021331,1.324973,0.17926,2,1
42,"(race_White, native_country_United-States)","(final_weight_disc_[12285.0, 504990.0))",0.788113,0.989415,0.782237,0.992544,1.003163,0.002466,1.419701,0.014879,2,1
53,"(final_weight_disc_[12285.0, 504990.0), capita...",(native_country_United-States),0.984255,0.897424,0.883891,0.89803,1.000675,0.000596,1.00594,0.042841,2,1
54,"(native_country_United-States, capital_gain_di...","(final_weight_disc_[12285.0, 504990.0))",0.892797,0.989415,0.883891,0.990024,1.000616,0.000544,1.061095,0.005742,2,1
66,"(native_country_United-States, capital_loss_di...","(final_weight_disc_[12285.0, 504990.0))",0.857274,0.989415,0.848655,0.989945,1.000536,0.000455,1.052757,0.003754,2,1


# Generalized Sequential Pattern mining

In [12]:
# Build a synthetic sequence dataset with the gsp_python generator.
from gsp_python.dataset_gen import DatasetGenerator

# NOTE(review): no seed is passed, so if the generator is random the sequences
# (and the GSP output further down) will differ between runs -- confirm whether
# the API allows fixing a seed.
# NOTE(review): the saved output reports 14 sequences although size=10 here;
# presumably it comes from a run with different arguments -- re-run to confirm.
algo_dsgen = DatasetGenerator(size=10, nevents=8, maxevents=4, avgelems=16)
sequences = algo_dsgen.generate_sequence_dataset()

![Screenshot%202024-10-25%20at%2009.13.02.png](attachment:Screenshot%202024-10-25%20at%2009.13.02.png)

In [16]:
# Summarise the generated dataset: how many sequences there are, how many
# transactions (elements) each sequence has, and how many events (items) each
# transaction has. f-string fields already format their value as text, so the
# explicit str() calls were redundant; the printed output is unchanged.
print(f"Number of sequences {len(sequences)}")
for i, sq in enumerate(sequences):
    print(f"Sq. {i}, Number of transactions (elements): {len(sq)}")
    for j, elem in enumerate(sq):
        print(f"\t Transaction {j}, Number of events (items): {len(elem)}")

Number of sequences 14
Sq. 0, Number of transactions (elements): 14
	 Transaction 0, Number of events (items): 1
	 Transaction 1, Number of events (items): 3
	 Transaction 2, Number of events (items): 3
	 Transaction 3, Number of events (items): 4
	 Transaction 4, Number of events (items): 4
	 Transaction 5, Number of events (items): 2
	 Transaction 6, Number of events (items): 4
	 Transaction 7, Number of events (items): 3
	 Transaction 8, Number of events (items): 1
	 Transaction 9, Number of events (items): 2
	 Transaction 10, Number of events (items): 1
	 Transaction 11, Number of events (items): 2
	 Transaction 12, Number of events (items): 3
	 Transaction 13, Number of events (items): 2
Sq. 1, Number of transactions (elements): 17
	 Transaction 0, Number of events (items): 2
	 Transaction 1, Number of events (items): 4
	 Transaction 2, Number of events (items): 2
	 Transaction 3, Number of events (items): 2
	 Transaction 4, Number of events (items): 3
	 Transaction 5, Number of e

xg: max-gap --> Each element of the pattern instance must occur at most xg time units after the previous one

ng: min-gap --> Each element of the pattern instance must occur at least ng time units after the previous one

ms: maximum span --> The overall duration of the pattern instance must be at most ms time units

![Screenshot%202024-10-25%20at%2009.22.06.png](attachment:Screenshot%202024-10-25%20at%2009.22.06.png)

Example: xg = 2, ng = 0, ms = 4 → consecutive elements at most distance 2 & overall duration at most 4 time units

In [5]:
# Run Generalized Sequential Pattern mining on the generated sequences.
from gsp_python.gsp import GSP

# The timing constraints presumably correspond to the definitions in the notes
# of this section: mingap ~ ng, maxgap ~ xg, maxspan ~ ms; minsup is presumably
# the minimum support threshold -- TODO confirm against the gsp_python docs.
# NOTE(review): this cell's execution count is lower than that of the cell that
# creates `sequences`; re-run top to bottom to make sure it uses the intended data.
algo_gsp = GSP(sequences, minsup=0.3, mingap=1, maxgap=2, maxspan=5)
output = algo_gsp.run_gsp()

In [6]:
output

[([[2]], 10),
 ([[7]], 10),
 ([[5]], 10),
 ([[6]], 10),
 ([[8]], 10),
 ([[1]], 10),
 ([[3]], 10),
 ([[4]], 10),
 ([[2], [2]], 6),
 ([[2], [7]], 6),
 ([[2, 7]], 9),
 ([[2], [5]], 10),
 ([[2, 5]], 8),
 ([[2], [6]], 9),
 ([[2, 6]], 9),
 ([[2], [8]], 8),
 ([[2, 8]], 6),
 ([[2], [1]], 5),
 ([[2], [3]], 9),
 ([[2, 3]], 7),
 ([[2], [4]], 8),
 ([[2, 4]], 8),
 ([[7], [2]], 6),
 ([[7], [7]], 8),
 ([[7], [5]], 8),
 ([[7], [6]], 7),
 ([[7], [8]], 9),
 ([[7, 8]], 10),
 ([[7], [1]], 9),
 ([[7], [3]], 9),
 ([[7], [4]], 10),
 ([[5], [2]], 9),
 ([[5], [7]], 9),
 ([[5, 7]], 9),
 ([[5], [5]], 8),
 ([[5], [6]], 10),
 ([[5, 6]], 8),
 ([[5], [8]], 9),
 ([[5, 8]], 9),
 ([[5], [1]], 9),
 ([[5], [3]], 8),
 ([[5], [4]], 8),
 ([[6], [2]], 9),
 ([[6], [7]], 10),
 ([[6, 7]], 8),
 ([[6], [5]], 7),
 ([[6], [6]], 7),
 ([[6], [8]], 9),
 ([[6, 8]], 10),
 ([[6], [1]], 4),
 ([[6], [3]], 9),
 ([[6], [4]], 8),
 ([[8], [2]], 8),
 ([[8], [7]], 8),
 ([[8], [5]], 10),
 ([[8], [6]], 9),
 ([[8], [8]], 7),
 ([[8], [1]], 8),
 ([[8

---