In [106]:
import pandas as pd

# Load the raw transactions. InvoiceDate is parsed into datetimes, and the
# two identifier columns are kept as strings (InvoiceNo is later searched
# with string methods).
data = pd.read_csv(
    'data.csv',
    encoding='unicode_escape',
    parse_dates=['InvoiceDate'],
    dtype={'CustomerID': str, 'InvoiceNo': str},
)
# Preview the first rows.
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [107]:
def check_data(dataframe, quantiles=(0, 0.05, 0.5, 0.95, 0.99, 1)):
    """Print a quick structural summary of a DataFrame.

    Sections printed, in order: shape, column dtypes, missing values per
    column, number of fully duplicated rows, and quantiles of the numeric
    columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to inspect. It is not modified.
    quantiles : sequence of float, optional
        Quantile levels (between 0 and 1) reported in the last section.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print(' shape '.center(55, '-'))
    print(' rows: {}' .format(dataframe.shape[0]))
    print(' columns: {}' .format(dataframe.shape[1]))
    print(' types '.center(55, '-'))
    print(dataframe.dtypes)
    print(' missing values '.center(55, '-'))
    print(dataframe.isnull().sum())
    print(' duplicated values '.center(55, '-'))
    print(dataframe.duplicated().sum())
    print(' quantiles '.center(55, '-'))
    # numeric_only=True is required on pandas >= 2.0, where the default became
    # False and string/object columns make quantile() raise a TypeError.
    print(dataframe.quantile(list(quantiles), numeric_only=True).T)

# Print the structural summary of the raw transactions.
check_data(data)


------------------------ shape ------------------------
 rows: 541909
 columns: 8
------------------------ types ------------------------
InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID             object
Country                object
dtype: object
-------------------- missing values -------------------
InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64
------------------ duplicated values ------------------
5268
---------------------- quantiles ----------------------
               0.00  0.05  0.50   0.95   0.99     1.00
Quantity  -80995.00  1.00  3.00  29.00  100.0  80995.0
UnitPrice -11062.06  0.42  2.08   9.95   18.0  38970.0


In [108]:
# Summary statistics of the raw data.
data.describe()

Unnamed: 0,Quantity,UnitPrice
count,541909.0,541909.0
mean,9.55225,4.611114
std,218.081158,96.759853
min,-80995.0,-11062.06
25%,1.0,1.25
50%,3.0,2.08
75%,10.0,4.13
max,80995.0,38970.0


##### 初步探索結果
##### 1. Description跟CustomerID有缺失值
##### 2. Quantity跟UnitPrice有極端值
##### 3. Quantity 出現負數，表示資料中含有退貨（取消）訂單

## Data Preprocessing

In [109]:
def replace_with_thresholds(dataframe, variable, q1 = 0.25, q3 = 0.75):
    """Cap the outliers of one numeric column at IQR-style fences.

    The fences are ``Q(q1) - 1.5 * IQR`` and ``Q(q3) + 1.5 * IQR`` where
    ``IQR = Q(q3) - Q(q1)``. Values outside the fences are replaced by the
    nearest fence; all other values (including NaN) are left untouched.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is copied, never modified.
    variable : str
        Name of the numeric column to cap.
    q1, q3 : float, optional
        Lower and upper quantile levels used to build the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with ``variable`` capped.
    """
    df = dataframe.copy()
    quartile1 = df[variable].quantile(q1)
    quartile3 = df[variable].quantile(q3)
    iqr = quartile3 - quartile1
    up_limit = quartile3 + 1.5*iqr
    low_limit = quartile1 - 1.5*iqr
    # clip() upcasts an integer column when a fence is fractional. Writing a
    # float fence into an int64 column through .loc is deprecated since
    # pandas 2.1 and becomes an error in later versions.
    df[variable] = df[variable].clip(lower=low_limit, upper=up_limit)

    return df

def preprocess(dataframe):
    """Clean the raw transactions and add a TotalPrice column.

    Steps, in order:
      1. drop every row that has a missing value in any column;
      2. drop invoices whose InvoiceNo contains 'C' (cancelled orders);
      3. keep only rows with a positive Quantity;
      4. cap Quantity and UnitPrice outliers using fences built from the
         1st and 99th percentiles;
      5. compute TotalPrice = Quantity * UnitPrice.

    The input frame is not modified; a new frame is returned.
    """
    # dropna() already returns a new frame, so the input stays untouched.
    cleaned = dataframe.dropna()

    is_cancelled = cleaned['InvoiceNo'].str.contains('C', na = False)
    cleaned = cleaned.loc[~is_cancelled]
    cleaned = cleaned.loc[cleaned['Quantity'] > 0]

    for column in ('Quantity', 'UnitPrice'):
        cleaned = replace_with_thresholds(cleaned, column, q1 = 0.01, q3 = 0.99)

    cleaned['TotalPrice'] = cleaned['Quantity'] * cleaned['UnitPrice']
    return cleaned

# Build the cleaned transaction table from the raw data.
df = preprocess(data)


In [110]:
# Drop the rows whose UnitPrice is exactly 0.
df = df.loc[df['UnitPrice'].ne(0)]

In [111]:
# Summary statistics of the cleaned transactions.
df.describe()

Unnamed: 0,Quantity,UnitPrice,TotalPrice
count,397884.0,397884.0,397884.0
mean,11.830797,2.893454,20.629824
std,25.523078,3.22709,51.828592
min,1.0,0.001,0.001
25%,2.0,1.25,4.68
50%,6.0,1.95,11.8
75%,12.0,3.75,19.8
max,298.5,37.06,3268.575


## RFM Analysis

In [112]:
# Latest invoice timestamp in the cleaned data.
print(df['InvoiceDate'].max())

2011-12-09 12:50:00


In [113]:
import datetime as dt

# Snapshot date used as the reference point when computing Recency.
today = dt.datetime(year=2011, month=12, day=11)

In [114]:
# Build one row per customer:
#   Recency   - whole days between the snapshot date `today` and the last invoice
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalPrice
rfm = (
    df.groupby('CustomerID')
    .agg({
        'InvoiceDate': lambda dates: (today - dates.max()).days,
        'InvoiceNo': lambda invoices: invoices.nunique(),
        'TotalPrice': lambda amounts: amounts.sum(),
    })
    .rename(columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalPrice': 'Monetary',
    })
)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,326,1,310.44
12347,3,7,4310.0
12348,76,4,1770.78
12349,19,1,1491.72
12350,311,1,331.46


In [115]:
# Summary statistics of the per-customer RFM table.
rfm.describe()

Unnamed: 0,Recency,Frequency,Monetary
count,4338.0,4338.0,4338.0
mean,93.059474,4.272015,1892.180055
std,100.012264,7.697998,7706.207355
min,1.0,1.0,3.75
25%,18.0,1.0,303.3075
50%,51.0,2.0,663.1
75%,142.75,5.0,1631.1075
max,374.0,209.0,266163.525


## Assigning RFM Scores

In [116]:
# Split each dimension into three quantile bins labelled 1..3.
# Recency labels are reversed so that the smallest recency values get 3.
rfm['recency_score'] = pd.qcut(rfm['Recency'], q=3, labels=[3, 2, 1])
# Frequency is ranked first (ties broken by order of appearance) before binning.
rfm['frequency_score'] = pd.qcut(
    rfm['Frequency'].rank(method="first"), q=3, labels=[1, 2, 3]
)
rfm['monetary_score'] = pd.qcut(rfm['Monetary'], q=3, labels=[1, 2, 3])
# Concatenate the three digits into a code such as '333'.
rfm['RFM_score'] = (
    rfm['recency_score'].astype(str)
    + rfm['frequency_score'].astype(str)
    + rfm['monetary_score'].astype(str)
)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,recency_score,frequency_score,monetary_score,RFM_score
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346,326,1,310.44,1,1,1,111
12347,3,7,4310.0,3,3,3,333
12348,76,4,1770.78,2,2,3,223
12349,19,1,1491.72,3,1,3,313
12350,311,1,331.46,1,1,1,111


In [117]:
def segment(score):
    """Map a three-digit RFM score string to a customer segment name.

    Parameters
    ----------
    score : str
        Concatenated recency/frequency/monetary digits, e.g. '323'.

    Returns
    -------
    str or None
        The segment label, or None when the score belongs to no group.
    """
    groups = (
        ('Champion', ('333', '332', '323')),
        ('Loyal', ('321', '322', '331', '232', '233')),
        ('Recent', ('312', '313', '311', '222', '223')),
        ('Needs attention', ('213', '221', '123', '132', '133')),
        ('At risk', ('231', '212', '122', '131', '211')),
        ('Inactive', ('111', '112', '113', '121')),
    )
    # The first group containing the score wins, as in an if/elif chain.
    for label, codes in groups:
        if score in codes:
            return label
    return None

# Label every customer with the segment that matches their RFM score.
rfm['Segment'] = rfm['RFM_score'].map(segment)

In [118]:
# Turn the CustomerID index into a regular column so it can be counted.
# The guard keeps this cell idempotent: an unconditional in-place
# reset_index would insert a stray 'index' column on every re-run.
if 'CustomerID' not in rfm.columns:
    rfm = rfm.reset_index()
# Number of customers in each segment.
rfm.groupby('Segment').agg({'CustomerID':'count'})

Unnamed: 0_level_0,CustomerID
Segment,Unnamed: 1_level_1
At risk,710
Champion,961
Inactive,976
Loyal,776
Needs attention,326
Recent,589


### KMeans Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# 1. Standardize the three RFM features
scaler = StandardScaler().fit(rfm[['Recency', 'Frequency', 'Monetary']])
rfm_scaled = scaler.transform(rfm[['Recency', 'Frequency', 'Monetary']])

In [None]:
# 2. Look for the best number of clusters (elbow method)
sse = {}
for k in range(1, 11):  # try k = 1 .. 10
    kmeans = KMeans(n_clusters=k, random_state=1).fit(rfm_scaled)
    sse[k] = kmeans.inertia_  # SSE for this number of clusters

# Plot SSE against the number of clusters.
plt.plot(list(sse), list(sse.values()))
plt.title('Elbow Curve')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
# Fit the final model with 3 clusters and attach each customer's label.
kmeans = KMeans(n_clusters=3, random_state=1)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)
# Average only the numeric RFM features. rfm also holds string and
# categorical columns (CustomerID, the score columns, Segment), and taking
# their mean raises a TypeError on pandas >= 2.0.
print(rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean())

In [None]:
# Non-null value counts of every column within each cluster.
rfm.groupby('Cluster').count()

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Assumes the KMeans model has already been fitted; copy its labels into the
# DataFrame so every customer carries a cluster id.
rfm['Cluster'] = kmeans.labels_

# 3D scatter of the raw (unscaled) RFM values, coloured by cluster.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(rfm['Recency'], rfm['Frequency'], rfm['Monetary'], 
                     c=rfm['Cluster'], cmap='viridis')

# Add the title and axis labels
ax.set_title('3D Scatter Plot of RFM Clusters')
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')

# Add a legend with one entry per cluster colour
legend1 = ax.legend(*scatter.legend_elements(), title="Clusters")
ax.add_artist(legend1)

In [None]:
import seaborn as sns

# Pairwise scatter plots of the RFM features, coloured by cluster.
sns.pairplot(
    data=rfm[['Recency', 'Frequency', 'Monetary', 'Cluster']],
    hue='Cluster',
    palette='viridis',
)