In [1]:
import pandas as pd
import random

from datetime import datetime, timedelta

from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

random.seed(42)

In [2]:
# List of different products (17 items)
all_items = ['BREAD', 'TEA', 'COKE', 'JAM', 'MAGGI', 'MILK', 'ORANGE', 'CORNFLAKES', 'SUGAR', 'COFFEE',
             'RICE', 'FLOUR', 'SALT', 'PEPPER', 'BUTTER', 'EGGS', 'YOGURT']

# Istanbul districts
istanbul_districts = ['Kadıköy', 'Beşiktaş', 'Şişli', 'Bakırköy', 'Üsküdar', 'Sarıyer', 'Ataşehir', 'Fatih', 'Eyüp', 'Bağcılar']

# Payment methods
payment_methods = ['Credit Card', 'Debit Card', 'Cash']

# Create a dataset
data = {'OrderNumber': [], 'OrderDate': [], 'OrderTime': [], 'District': [], 'Customer_ID': [], 'PaymentMethod': [], 'Items': []}

num_transactions = 10000  # Total number of rows
max_items_per_transaction = 10  # Maximum number of items per row
max_rows_per_customer = 4  # Maximum number of rows per customer

# Add customer data for a total of 400 customers
customer_ids = [str(random.randint(10000, 99999)) for _ in range(10000)]
random.shuffle(customer_ids)

for customer_id in customer_ids:
    customer_rows = min(random.randint(1, max_rows_per_customer), num_transactions - len(data['OrderNumber']))
    
    for _ in range(customer_rows):
        # Randomly select a number of items for each row
        num_items = random.randint(3, max_items_per_transaction)
        items = random.sample(all_items, k=num_items)

        # Add a random order date and time
        order_date = datetime(2023, 1, 1) + timedelta(days=random.randint(1, 365))
        order_time = datetime.strptime(f"{random.randint(0, 23):02d}:{random.randint(0, 59):02d}", "%H:%M")

        data['OrderDate'].append(order_date.strftime("%Y-%m-%d"))
        data['OrderTime'].append(order_time.strftime("%H:%M"))

        # Add an Istanbul district
        data['District'].append(random.choice(istanbul_districts))
        data['OrderNumber'].append(random.randint(1000, 9999))
        data['Customer_ID'].append(customer_id)
        data['PaymentMethod'].append(random.choice(payment_methods))
        data['Items'].append(','.join(items))

        if len(data['OrderNumber']) == num_transactions:
            break

# Create a DataFrame
df = pd.DataFrame(data)

# Sort the DataFrame by "OrderNumber"
# df.sort_values(by='OrderNumber', inplace=True)

# Reorder the columns
df = df[['Customer_ID', 'OrderNumber', 'OrderDate', 'OrderTime', 'District', 'PaymentMethod', 'Items']]

# save the dataframe as a csv file
df.to_csv("market_transaction_data.csv", index=False)

In [3]:
# Display the created DataFrame
df.head()

Unnamed: 0,Customer_ID,OrderNumber,OrderDate,OrderTime,District,PaymentMethod,Items
0,34371,4508,2023-08-17,08:20,Üsküdar,Cash,"ORANGE,BUTTER,YOGURT,COKE,COFFEE"
1,34371,2889,2023-07-13,17:29,Beşiktaş,Cash,"ORANGE,SALT,MAGGI"
2,34371,3215,2023-02-28,17:30,Ataşehir,Credit Card,"TEA,CORNFLAKES,FLOUR,BUTTER,SUGAR"
3,34371,4956,2023-11-08,17:56,Beşiktaş,Cash,"CORNFLAKES,MAGGI,JAM,BREAD,SUGAR"
4,25970,3289,2023-01-24,13:31,Bakırköy,Credit Card,"YOGURT,COKE,MAGGI,BUTTER,MILK,SUGAR,EGGS,RICE"


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Customer_ID    10000 non-null  object
 1   OrderNumber    10000 non-null  int64 
 2   OrderDate      10000 non-null  object
 3   OrderTime      10000 non-null  object
 4   District       10000 non-null  object
 5   PaymentMethod  10000 non-null  object
 6   Items          10000 non-null  object
dtypes: int64(1), object(6)
memory usage: 547.0+ KB


In [6]:
product_data = list(df["Items"].apply(lambda x:x.split(",")))
product_data

[['ORANGE', 'BUTTER', 'YOGURT', 'COKE', 'COFFEE'],
 ['ORANGE', 'SALT', 'MAGGI'],
 ['TEA', 'CORNFLAKES', 'FLOUR', 'BUTTER', 'SUGAR'],
 ['CORNFLAKES', 'MAGGI', 'JAM', 'BREAD', 'SUGAR'],
 ['YOGURT', 'COKE', 'MAGGI', 'BUTTER', 'MILK', 'SUGAR', 'EGGS', 'RICE'],
 ['BUTTER', 'COKE', 'ORANGE', 'FLOUR', 'RICE'],
 ['BREAD', 'TEA', 'SALT', 'YOGURT', 'BUTTER', 'PEPPER', 'RICE', 'FLOUR'],
 ['EGGS', 'JAM', 'CORNFLAKES', 'SALT', 'SUGAR', 'RICE'],
 ['FLOUR', 'RICE', 'ORANGE', 'EGGS', 'TEA'],
 ['EGGS', 'COFFEE', 'CORNFLAKES', 'SUGAR', 'SALT'],
 ['SALT', 'EGGS', 'BREAD', 'SUGAR', 'ORANGE', 'CORNFLAKES', 'BUTTER', 'RICE'],
 ['COKE', 'FLOUR', 'EGGS', 'SUGAR', 'MAGGI', 'PEPPER', 'YOGURT', 'ORANGE'],
 ['YOGURT', 'CORNFLAKES', 'MILK', 'BUTTER', 'RICE', 'BREAD', 'PEPPER'],
 ['TEA', 'EGGS', 'BREAD'],
 ['YOGURT', 'MILK', 'COKE'],
 ['YOGURT', 'JAM', 'BUTTER', 'SUGAR', 'PEPPER', 'FLOUR', 'BREAD'],
 ['EGGS',
  'CORNFLAKES',
  'SUGAR',
  'JAM',
  'ORANGE',
  'FLOUR',
  'PEPPER',
  'BUTTER',
  'TEA',
  'SALT'],
 ['M

In [7]:
te = TransactionEncoder()
te_data = te.fit(product_data).transform(product_data)

model_data = pd.DataFrame(te_data,columns=te.columns_)
model_data

Unnamed: 0,BREAD,BUTTER,COFFEE,COKE,CORNFLAKES,EGGS,FLOUR,JAM,MAGGI,MILK,ORANGE,PEPPER,RICE,SALT,SUGAR,TEA,YOGURT
0,False,True,True,True,False,False,False,False,False,False,True,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,False
2,False,True,False,False,True,False,True,False,False,False,False,False,False,False,True,True,False
3,True,False,False,False,True,False,False,True,True,False,False,False,False,False,True,False,False
4,False,True,False,True,False,True,False,False,True,True,False,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,False,True,False,True,True,True,False,False,False,False,False,False,True,True,False,False,True
9996,True,False,False,True,False,False,True,True,True,False,True,True,False,False,False,True,True
9997,False,False,True,False,False,False,True,True,False,False,False,True,True,True,False,False,True
9998,False,False,False,False,False,True,False,False,True,True,False,False,False,False,False,False,False


In [8]:
model_data.sum()

BREAD         3853
BUTTER        3812
COFFEE        3803
COKE          3798
CORNFLAKES    3803
EGGS          3811
FLOUR         3774
JAM           3783
MAGGI         3794
MILK          3702
ORANGE        3879
PEPPER        3806
RICE          3792
SALT          3797
SUGAR         3801
TEA           3880
YOGURT        3791
dtype: int64

In [9]:
df_pattern = apriori(model_data, min_support=0.15, use_colnames=True)
df_pattern

Unnamed: 0,support,itemsets
0,0.3853,(BREAD)
1,0.3812,(BUTTER)
2,0.3803,(COFFEE)
3,0.3798,(COKE)
4,0.3803,(CORNFLAKES)
...,...,...
73,0.1531,"(SALT, SUGAR)"
74,0.1504,"(SALT, TEA)"
75,0.1506,"(YOGURT, SALT)"
76,0.1520,"(TEA, SUGAR)"


In [12]:
association_rules(df_pattern, metric="confidence", min_threshold=0.10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BUTTER),(BREAD),0.3812,0.3853,0.1524,0.399790,1.037607,0.005524,1.024142,0.058572
1,(BREAD),(BUTTER),0.3853,0.3812,0.1524,0.395536,1.037607,0.005524,1.023717,0.058963
2,(CORNFLAKES),(BREAD),0.3803,0.3853,0.1558,0.409677,1.063266,0.009270,1.041294,0.096017
3,(BREAD),(CORNFLAKES),0.3853,0.3803,0.1558,0.404360,1.063266,0.009270,1.040394,0.096798
4,(JAM),(BREAD),0.3783,0.3853,0.1540,0.407084,1.056539,0.008241,1.036741,0.086075
...,...,...,...,...,...,...,...,...,...,...
117,(SALT),(YOGURT),0.3797,0.3791,0.1506,0.396629,1.046238,0.006656,1.029052,0.071247
118,(TEA),(SUGAR),0.3880,0.3801,0.1520,0.391753,1.030657,0.004521,1.019158,0.048603
119,(SUGAR),(TEA),0.3801,0.3880,0.1520,0.399895,1.030657,0.004521,1.019821,0.047983
120,(YOGURT),(TEA),0.3791,0.3880,0.1567,0.413347,1.065328,0.009609,1.043207,0.098764
