<a href="https://colab.research.google.com/github/guilhermelaviola/BIArchitectureAndBigData/blob/main/Class05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Mining Techniques and Algorithms**
Data mining is a process that uses machine learning, statistical, and database systems to analyze large amounts of data. It involves classification algorithms, association rules, anomaly and outlier detection, and is crucial for informed decision-making in business contexts. It helps anticipate future trends, improve operations, and reduce costs. Data mining also aids in customer segmentation, enabling detailed analysis of customer behavior, which is essential for effective marketing strategies and customer satisfaction.

In [1]:
# Importing all the necessary libraries:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules

In [4]:
# Importing and displaying the DataFrame:
url = 'https://raw.githubusercontent.com/Sarvandani/Machine_learning_Association-rule-learning_Market-Basket-Analysis/refs/heads/main/Groceries_dataset.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [6]:
# Filtering the rows with Member_number = 1808:
df[df['Member_number']== 1808]

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
4355,1808,04-02-2015,long life bakery product
9090,1808,29-11-2014,meat
11488,1808,15-12-2014,sugar
16149,1808,21-07-2015,rolls/buns
20504,1808,04-02-2015,semi-finished bread
25239,1808,29-11-2014,whole milk
27637,1808,15-12-2014,citrus fruit
36088,1808,21-07-2015,candy
38731,1808,15-12-2014,napkins


In [11]:
# Creating a new Dataframe to group the filtered result above by 'Date':
grouping = df[df['Member_number'] == 1808].groupby('Date')['itemDescription'].apply(list)
grouping.head()

Unnamed: 0_level_0,itemDescription
Date,Unnamed: 1_level_1
04-02-2015,"[long life bakery product, semi-finished bread]"
15-12-2014,"[sugar, citrus fruit, napkins]"
21-07-2015,"[tropical fruit, rolls/buns, candy]"
29-11-2014,"[meat, whole milk]"


In [12]:
# Creating a new Dataframe to group the filtered result above by 'Date':
grouped_transactions = df.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).reset_index(name='items')['items']
grouped_transactions.head()

Unnamed: 0,items
0,"[sausage, whole milk, semi-finished bread, yog..."
1,"[whole milk, pastry, salty snack]"
2,"[canned beer, misc. beverages]"
3,"[sausage, hygiene articles]"
4,"[soda, pickled vegetables]"


In [13]:
# Instantiating the transaction encoder:
te = TransactionEncoder()

# Transforming the data into the adequate format:
te_ary = te.fit(grouped_transactions).transform(grouped_transactions)
df_te = pd.DataFrame(te_ary, columns=te.columns_)
df_te.head()

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
# Applying the FP-Growth algorithm:
frequent_itemsets = fpgrowth(df_te, min_support=0.005, use_colnames=True)

# Displaying the most frequent item collections:
print(frequent_itemsets)

      support                     itemsets
0    0.157923                 (whole milk)
1    0.085879                     (yogurt)
2    0.060349                    (sausage)
3    0.009490        (semi-finished bread)
4    0.051728                     (pastry)
..        ...                          ...
121  0.007151   (whole milk, bottled beer)
122  0.005280  (whole milk, domestic eggs)
123  0.005614     (whole milk, newspapers)
124  0.007151   (citrus fruit, whole milk)
125  0.005012           (whole milk, pork)

[126 rows x 2 columns]


In [15]:
# Generating a DataFrame of association rules with the 'score' metric:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.005)
soda_rules = rules[rules['antecedents'].apply(lambda x: 'soda' in x)]
soda_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
3,(soda),(yogurt),0.097106,0.085879,0.005814,0.059876,0.697219,1.0,-0.002525,0.972342,-0.324769,0.032818,-0.028445,0.06379
15,(soda),(sausage),0.097106,0.060349,0.005948,0.061253,1.014975,1.0,8.8e-05,1.000963,0.016341,0.039259,0.000962,0.079906
23,(soda),(whole milk),0.097106,0.157923,0.011629,0.119752,0.758296,1.0,-0.003707,0.956636,-0.260917,0.047776,-0.045329,0.096694
25,(soda),(other vegetables),0.097106,0.122101,0.009691,0.099794,0.817302,1.0,-0.002166,0.975219,-0.198448,0.046252,-0.02541,0.089579
27,(soda),(rolls/buns),0.097106,0.110005,0.008087,0.083276,0.757022,1.0,-0.002596,0.970843,-0.262257,0.040631,-0.030032,0.078394
40,(soda),(tropical fruit),0.097106,0.067767,0.005413,0.055747,0.822622,1.0,-0.001167,0.98727,-0.192778,0.033948,-0.012894,0.067814
51,(soda),(root vegetables),0.097106,0.069572,0.00528,0.05437,0.781501,1.0,-0.001476,0.983925,-0.236442,0.032712,-0.016338,0.065129


In [16]:
# Generating a sorted DataFrame of association rules with the 'score' metric:
soda_rules_sorted = soda_rules.sort_values(by='confidence', ascending=False)
soda_rules_sorted

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
23,(soda),(whole milk),0.097106,0.157923,0.011629,0.119752,0.758296,1.0,-0.003707,0.956636,-0.260917,0.047776,-0.045329,0.096694
25,(soda),(other vegetables),0.097106,0.122101,0.009691,0.099794,0.817302,1.0,-0.002166,0.975219,-0.198448,0.046252,-0.02541,0.089579
27,(soda),(rolls/buns),0.097106,0.110005,0.008087,0.083276,0.757022,1.0,-0.002596,0.970843,-0.262257,0.040631,-0.030032,0.078394
15,(soda),(sausage),0.097106,0.060349,0.005948,0.061253,1.014975,1.0,8.8e-05,1.000963,0.016341,0.039259,0.000962,0.079906
3,(soda),(yogurt),0.097106,0.085879,0.005814,0.059876,0.697219,1.0,-0.002525,0.972342,-0.324769,0.032818,-0.028445,0.06379
40,(soda),(tropical fruit),0.097106,0.067767,0.005413,0.055747,0.822622,1.0,-0.001167,0.98727,-0.192778,0.033948,-0.012894,0.067814
51,(soda),(root vegetables),0.097106,0.069572,0.00528,0.05437,0.781501,1.0,-0.001476,0.983925,-0.236442,0.032712,-0.016338,0.065129
