In [2]:
import pandas as pd
from io import StringIO

# --- 1a: Define the raw data as a multiline string ---
# A sample of the grocery dataset in CSV form: one transaction (basket) per line,
# items separated by commas. Embedding it keeps the notebook independent of external URLs.
csv_data = """shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
burgers,meatballs,eggs
chutney
turkey,avocado
mineral water,milk,energy bar,whole wheat rice,green tea
low fat yogurt
whole wheat pasta,french fries
soup,light cream,shallot
frozen vegetables,spaghetti,green tea
french fries
eggs,pet food
cookies
turkey,burgers,mineral water,eggs,cooking oil
spaghetti,champagne,cookies
mineral water,salmon
mineral water
shrimp,chocolate,chicken,honey,oil,cooking oil,low fat yogurt
turkey,eggs
turkey,fresh tuna,tomatoes,spaghetti,mineral water,black tea,salmon,eggs,chicken,extra dark chocolate
meatballs,milk,honey,french fries,protein bar
red wine,shrimp,pasta,pepper,eggs,chocolate,shampoo
rice,sparkling water
spaghetti,mineral water,ham,body spray,pancakes,green tea
burgers,grated cheese,shrimp,pasta,avocado,honey,white wine,toothpaste
eggs
parmesan cheese,spaghetti,soup,avocado,milk,fresh bread
ground beef,spaghetti,mineral water,milk,olive oil,energy bar,whole wheat rice,green tea
milk
soup,frozen vegetables,total salad,shampoo,salmon,rice,sparkling water
cookies
escalope,green tea
chocolate
grated cheese,yogurt cake
mint gum,fresh tuna,mineral water,chocolate,eggs,french fries
low fat yogurt
whole wheat pasta,salad
frozen vegetables,yogurt cake
ham,french fries,tomatoes,milk,spaghetti,mineral water,avocado,eggs
burgers,avocado
mineral water,chicken,cereals,clothes accessories,tonico,melons,ground beef
shallot
bacon,bars,grated cheese,burgers,avocado
turkey
frozen vegetables,tomatoes,mineral water,spaghetti
ground beef,mineral water
fresh bread
"""


def load_ragged_csv(csv_text):
    """Read header-less CSV text whose rows have different lengths into a DataFrame.

    Parameters
    ----------
    csv_text : str
        CSV content with one transaction per line and no header row.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, columns labelled 0..n-1, where n is the
        number of fields on the widest line. Shorter rows are padded with NaN.
    """
    # Without explicit column names pandas takes the table width from the first
    # line and raises a ParserError on any later line that is longer. Sizing the
    # columns from the widest line removes that dependency on row order.
    # Counting commas can only over-estimate (e.g. a comma inside a quoted field),
    # which merely adds an all-NaN column.
    widest = max((line.count(",") for line in csv_text.splitlines()), default=0) + 1
    return pd.read_csv(StringIO(csv_text), header=None, names=list(range(widest)))


def rows_to_transactions(frame):
    """Convert each DataFrame row into a list of item strings, skipping missing cells.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with one transaction per row; missing cells mark the end of a
        shorter transaction.

    Returns
    -------
    list[list[str]]
        One inner list per row, in row order, holding the non-missing items.
    """
    # pd.notna tests for missing values directly instead of comparing the
    # text form of each cell against the string 'nan'.
    return [
        [str(item) for item in row if pd.notna(item)]
        for row in frame.to_numpy()
    ]


# --- 1b: Load the string data into a pandas DataFrame ---
# 'io.StringIO' wraps a plain string in a file-like object, so pandas can read it
# exactly as it would read a file on disk.
data = load_ragged_csv(csv_data)

print("--- Data loaded successfully from embedded string. Here's a preview: ---")
print(data.head())


# --- 1c: Preprocess the data into a list of lists (same as before) ---
transactions = rows_to_transactions(data)

print("\n--- Preprocessing Complete ---")
print(f"There are {len(transactions)} transactions in our sample dataset.")
print("\nHere are the first 5 transactions in the correct format:")
# Slicing (rather than indexing 0..4) also works when there are fewer than 5 transactions.
for transaction in transactions[:5]:
    print(transaction)

--- Data loaded successfully from embedded string. Here's a preview: ---
              0          1           2                 3             4   \
0         shrimp    almonds     avocado    vegetables mix  green grapes   
1        burgers  meatballs        eggs               NaN           NaN   
2        chutney        NaN         NaN               NaN           NaN   
3         turkey    avocado         NaN               NaN           NaN   
4  mineral water       milk  energy bar  whole wheat rice     green tea   

                 5     6               7             8             9   \
0  whole weat flour  yams  cottage cheese  energy drink  tomato juice   
1               NaN   NaN             NaN           NaN           NaN   
2               NaN   NaN             NaN           NaN           NaN   
3               NaN   NaN             NaN           NaN           NaN   
4               NaN   NaN             NaN           NaN           NaN   

               10         11     12  

In [3]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import pandas as pd

# --- 2a: One-Hot Encode the Transaction Data ---
# First let the encoder learn the vocabulary of unique items, then turn every
# transaction into a row of booleans (True = the item is in that basket).
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)

# Wrap the boolean array in a DataFrame, labelling each column with its item name.
df = pd.DataFrame(data=te_ary, columns=te.columns_)

print("--- One-Hot Encoded DataFrame (first 5 rows): ---")
print(df.head())


# --- 2b: Run the Apriori Algorithm ---
# Keep only the itemsets found in at least 5% of the transactions (min_support=0.05);
# use_colnames=True reports them by item name instead of column index.
frequent_itemsets = apriori(
    df,
    min_support=0.05,
    use_colnames=True,
)

print("\n--- Discovered Frequent Itemsets (with support >= 5%): ---")
itemsets_by_support = frequent_itemsets.sort_values(by='support', ascending=False)
print(itemsets_by_support)

--- One-Hot Encoded DataFrame (first 5 rows): ---
   almonds  antioxydant juice  avocado  bacon   bars  black tea  body spray  \
0     True               True     True  False  False      False       False   
1    False              False    False  False  False      False       False   
2    False              False    False  False  False      False       False   
3    False              False     True  False  False      False       False   
4    False              False    False  False  False      False       False   

   burgers  cereals  champagne  ...  toothpaste  total salad  turkey  \
0    False    False      False  ...       False        False   False   
1     True    False      False  ...       False        False   False   
2    False    False      False  ...       False        False   False   
3    False    False      False  ...       False        False    True   
4    False    False      False  ...       False        False   False   

   vegetables mix  white wine  whole weat 

In [4]:
from mlxtend.frequent_patterns import association_rules

# --- 3a: Generate the association rules ---
# Derive rules from the 'frequent_itemsets' DataFrame, keeping only those
# whose 'confidence' is at least 0.2 (i.e. 20%).
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.2,
)


# --- 3b: Display the results ---
# Narrow the table down to the columns that matter most for reading a rule.
display_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules_simplified = rules[display_columns]

# Highest 'lift' first, so the strongest relationships appear at the top.
print("--- Discovered Association Rules ---")
rules_by_lift = rules_simplified.sort_values(by='lift', ascending=False)
print(rules_by_lift)

--- Discovered Association Rules ---
                   antecedents                 consequents   support  \
29                  (tomatoes)  (spaghetti, mineral water)  0.065217   
28  (spaghetti, mineral water)                  (tomatoes)  0.065217   
13                    (shrimp)                     (honey)  0.065217   
12                     (honey)                    (shrimp)  0.065217   
30                 (spaghetti)   (tomatoes, mineral water)  0.065217   
27   (tomatoes, mineral water)                 (spaghetti)  0.065217   
25                 (spaghetti)                  (tomatoes)  0.065217   
24                  (tomatoes)                 (spaghetti)  0.065217   
0                    (burgers)                   (avocado)  0.065217   
1                    (avocado)                   (burgers)  0.065217   
26       (tomatoes, spaghetti)             (mineral water)  0.065217   
23             (mineral water)                  (tomatoes)  0.065217   
22                  (tomato