## Importing Libraries

In [32]:
import pandas as pd
from pandas import option_context
from mlxtend.frequent_patterns import association_rules,apriori,fpgrowth
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

In [3]:
def frozenset_to_str(x):
    x = list(x)
    x = str(x).lstrip('[').rstrip(']').strip()
    return x
    

def display_sorted_rules(filtered_rules):
    # Print number of rules
    print("\n", filtered_rules.shape[0], "rules found! \n") 

    # Sort rules by confidence, support, and lift
    filtered_rules = filtered_rules.sort_values(
        by=['confidence', 'support', 'lift'], 
        ascending=False
    )

    # Save filtered rules to a csv file
    filtered_rules_csv = filtered_rules.copy()
    filtered_rules_csv['antecedents'] = filtered_rules_csv['antecedents'].apply(lambda x: frozenset_to_str(x))
    filtered_rules_csv['consequents'] = filtered_rules_csv['consequents'].apply(lambda x: frozenset_to_str(x))
    filtered_rules_csv.to_csv('filtered_rules.csv', index=False)

    # Display rules
    with option_context('display.max_colwidth', 100):
        display(filtered_rules)


In [8]:
df = pd.read_csv(r"M:\TOSS\WEEK 1\Week 1 kagle Example\Groceries_dataset.csv.zip") 
df.head() 


Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [9]:
print(f"Dataset shape: {df.shape}")

Dataset shape: (38765, 3)


In [10]:
df.drop("Date",axis=1,inplace=True)

In [12]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Member_number    38765 non-null  int64
 1   itemDescription  38765 non-null  str  
dtypes: int64(1), str(1)
memory usage: 1001.8 KB


## Data pre-processingÂ¶
- The apriori function expects data to be in the format of a one-hot encoded pandas DataFrame.

- One-hot encoding: Transform data into the format where 1 transaction is represented by a row and each product one-hot encoded as a column.

In [30]:
basket = (
    df.groupby(['Member_number', 'itemDescription'])['itemDescription']
      .count()
      .unstack()
      .reset_index()
      .set_index('Member_number')
)
basket

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,,,,,,,,,,,...,,,,,,,,2.0,1.0,
1001,,,,,,,,,1.0,,...,,,,1.0,,1.0,,2.0,,
1002,,,,,,,,,,,...,,,,,,,,1.0,,
1003,,,,,,,,,,,...,,,,,,,,,,
1004,,,,,,,,,,,...,,,,,,,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,,,,,,,,,,,...,,,,,,,,,,
4997,,,,,,,,,,,...,,,,,,,1.0,1.0,,
4998,,,,,,,,,,,...,,,,,,,,,,
4999,,,,,,,,,,2.0,...,,,,1.0,,,,,1.0,


In [33]:
imputer = SimpleImputer(strategy="mean")
basket_imputed = pd.DataFrame(
    imputer.fit_transform(basket),
    columns=basket.columns,
    index=basket.index
)
basket_imputed

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,1.051447,...,1.025641,1.02,1.040892,1.097844,1.0,1.046243,1.023256,2.000000,1.000000,1.0
1001,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.000000,1.051447,...,1.025641,1.02,1.040892,1.000000,1.0,1.000000,1.023256,2.000000,1.209429,1.0
1002,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,1.051447,...,1.025641,1.02,1.040892,1.097844,1.0,1.046243,1.023256,1.000000,1.209429,1.0
1003,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,1.051447,...,1.025641,1.02,1.040892,1.097844,1.0,1.046243,1.023256,1.400896,1.209429,1.0
1004,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,1.051447,...,1.025641,1.02,1.040892,1.097844,1.0,1.046243,1.023256,3.000000,1.209429,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,1.051447,...,1.025641,1.02,1.040892,1.097844,1.0,1.046243,1.023256,1.400896,1.209429,1.0
4997,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,1.051447,...,1.025641,1.02,1.040892,1.097844,1.0,1.046243,1.000000,1.000000,1.209429,1.0
4998,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,1.051447,...,1.025641,1.02,1.040892,1.097844,1.0,1.046243,1.023256,1.400896,1.209429,1.0
4999,1.0,1.055556,1.0,1.0,1.0,1.0,1.008264,1.0,1.107296,2.000000,...,1.025641,1.02,1.040892,1.000000,1.0,1.046243,1.023256,1.400896,1.000000,1.0


### Binary Encoding of Basket Data for Association Rule Mining

In [35]:
# Convert positive values to True, or False otherwise. 
def encode_units(x):
    if x >= 1:
        return True
    else:
        return False
        
basket_sets = basket_imputed.map(encode_units)

# Print the data shape to see how many sales transactions and how many products are involved.
print(basket_sets.shape) 

# Print some example records of the transformed dataset.
basket_sets

(3898, 167)


itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1001,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1002,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1003,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1004,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4997,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4998,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4999,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


<h2 style="color:#2E86C1;">ðŸ“Š Generating Association Rule Metrics: Support, Confidence, Lift</h2>


In [40]:
frequent_itemsets_ap = apriori(
    basket_sets,
    min_support=0.07,
    use_colnames=True
)
frequent_itemsets_ap


MemoryError: Unable to allocate 8.30 GiB for an array with shape (762355, 3, 3898) and data type bool