In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [8]:
# load data
data = pd.read_csv('../data/basket/Assignment-1_Data.csv', delimiter=';')
data.head()

Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01.12.2010 08:26,255,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01.12.2010 08:26,339,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01.12.2010 08:26,275,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01.12.2010 08:26,339,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01.12.2010 08:26,339,17850.0,United Kingdom


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522064 entries, 0 to 522063
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   BillNo      522064 non-null  object 
 1   Itemname    520609 non-null  object 
 2   Quantity    522064 non-null  int64  
 3   Date        522064 non-null  object 
 4   Price       522064 non-null  object 
 5   CustomerID  388023 non-null  float64
 6   Country     522064 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 27.9+ MB


In [9]:
# 数据清洗
data.dropna(subset=['Itemname'], inplace=True)
data['Price'] = data['Price'].str.replace(',','.').astype('float64')
# sns.boxplot(data[['price']])
data = data[data['Price'] > 0]
data = data[data['Country'] == 'United Kingdom']
data.info()
#data = data.dropna()
#data = data.drop_duplicates()

<class 'pandas.core.frame.DataFrame'>
Index: 519551 entries, 0 to 522063
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   BillNo      519551 non-null  object 
 1   Itemname    519551 non-null  object 
 2   Quantity    519551 non-null  int64  
 3   Date        519551 non-null  object 
 4   Price       519551 non-null  float64
 5   CustomerID  387985 non-null  float64
 6   Country     519551 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 31.7+ MB


In [12]:
data['Country'].value_counts()

Country
United Kingdom          485123
Germany                   9040
France                    8407
Spain                     2484
Netherlands               2359
Belgium                   2031
Switzerland               1966
Portugal                  1501
Australia                 1182
Norway                    1071
Italy                      758
Sweden                     451
Unspecified                446
Austria                    398
Poland                     330
Japan                      321
Israel                     295
Hong Kong                  284
Singapore                  222
Iceland                    182
USA                        179
Greece                     145
Malta                      112
United Arab Emirates        68
RSA                         57
Lebanon                     45
Lithuania                   35
Brazil                      32
Bahrain                     18
Saudi Arabia                 9
Name: count, dtype: int64

In [3]:
# 使用value_counts()对Category列进行分类统计
category_counts = data['Itemname'].value_counts().reset_index()
category_counts.columns = ['Itemname', 'Count']

# 按照分类统计结果创建新的DataFrame
category_counts_df = pd.DataFrame(category_counts)

# 将分类统计结果保存到本地文件，比如CSV格式
category_counts_df.to_csv('category_counts.csv', index=False)

In [4]:
data.head()

Unnamed: 0,BillNo,Itemname,Quantity,Date,Price,CustomerID,Country
0,536365,WHITE HANGING HEART T-LIGHT HOLDER,6,01.12.2010 08:26,255,17850.0,United Kingdom
1,536365,WHITE METAL LANTERN,6,01.12.2010 08:26,339,17850.0,United Kingdom
2,536365,CREAM CUPID HEARTS COAT HANGER,8,01.12.2010 08:26,275,17850.0,United Kingdom
3,536365,KNITTED UNION FLAG HOT WATER BOTTLE,6,01.12.2010 08:26,339,17850.0,United Kingdom
4,536365,RED WOOLLY HOTTIE WHITE HEART.,6,01.12.2010 08:26,339,17850.0,United Kingdom


In [5]:
data = data[data['Country']=='United Kingdom']
transactions = data.groupby('BillNo')['Itemname'].apply(list).tolist()
transactions = [transaction for transaction in transactions if len(transaction) >= 2]

In [6]:
# One-hot encode transactions
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

In [7]:
# Apply Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.03, use_colnames=True, low_memory=True)

In [8]:
# Print frequent itemsets
print(frequent_itemsets)

      support                                           itemsets
0    0.040814                           (6 RIBBONS RUSTIC CHARM)
1    0.038149                      (60 TEATIME FAIRY CAKE CASES)
2    0.044518                       (ALARM CLOCK BAKELIKE GREEN)
3    0.031910                        (ALARM CLOCK BAKELIKE PINK)
4    0.049262                         (ALARM CLOCK BAKELIKE RED)
..        ...                                                ...
103  0.055371                (WOODEN PICTURE FRAME WHITE FINISH)
104  0.030935  (ROSES REGENCY TEACUP AND SAUCER, GREEN REGENC...
105  0.032885  (JUMBO BAG RED RETROSPOT, JUMBO BAG PINK POLKA...
106  0.031455  (LUNCH BAG RED RETROSPOT, LUNCH BAG  BLACK SKU...
107  0.030610  (LUNCH BAG RED RETROSPOT, LUNCH BAG PINK POLKA...

[108 rows x 2 columns]


In [10]:
# 使用关联规则查找频繁项集
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(ROSES REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),0.043868,0.039774,0.030935,0.705185,17.729877,0.02919,3.257049,0.986891
1,(GREEN REGENCY TEACUP AND SAUCER),(ROSES REGENCY TEACUP AND SAUCER),0.039774,0.043868,0.030935,0.777778,17.729877,0.02919,4.302593,0.982683
2,(JUMBO BAG RED RETROSPOT),(JUMBO BAG PINK POLKADOT),0.093131,0.052642,0.032885,0.353105,6.707694,0.027982,1.464471,0.938302
3,(JUMBO BAG PINK POLKADOT),(JUMBO BAG RED RETROSPOT),0.052642,0.093131,0.032885,0.624691,6.707694,0.027982,2.41633,0.8982
4,(LUNCH BAG RED RETROSPOT),(LUNCH BAG BLACK SKULL.),0.072789,0.0646,0.031455,0.432143,6.689519,0.026753,1.647245,0.91728
5,(LUNCH BAG BLACK SKULL.),(LUNCH BAG RED RETROSPOT),0.0646,0.072789,0.031455,0.486922,6.689519,0.026753,1.807153,0.90925
6,(LUNCH BAG RED RETROSPOT),(LUNCH BAG PINK POLKADOT),0.072789,0.055046,0.03061,0.420536,7.639649,0.026603,1.630736,0.937331
7,(LUNCH BAG PINK POLKADOT),(LUNCH BAG RED RETROSPOT),0.055046,0.072789,0.03061,0.55608,7.639649,0.026603,2.088691,0.919732
