In [3]:
import pandas as pd
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
from qrytool import load_data_into_dataframe
import numpy as np


def add_percentage_annotations(fig, data_column, unit):
    counts, bins = np.histogram(data_column, bins='auto')  # 데이터에 따라 자동으로 구간을 정합니다.
    total = sum(counts)  # 전체 개수를 계산합니다.

    # 각 구간(bin)마다 주석을 추가합니다.
    for count, bin_left, bin_right in zip(counts, bins[:-1], bins[1:]):
        bin_center = (bin_left + bin_right) / 2  # 구간의 중간 지점을 계산합니다.
        percentage = (count / total) * 100  # 해당 구간의 비율(퍼센트)을 계산합니다.
        annotation_text = f"{count}{unit}\n({percentage:.2f}%)"  # 주석 텍스트를 준비합니다.
        fig.add_annotation(x=bin_center, y=count, text=annotation_text, showarrow=False, yshift=10)


qry = """
    SELECT all_id, 주문id,결제일시::date,MAX(세후순결제금액) as 결제액
FROM salesmatrix
WHERE all_id IS NOT NULL AND 결제일시::date >= TO_DATE('2018-01-01', 'YYYY-MM-DD')
    AND 정산='합산'
GROUP BY all_id,주문id,결제일시::date;
    """
df = load_data_into_dataframe(qry)

# Convert '결제일시' to datetime
df['결제일시'] = pd.to_datetime(df['결제일시'])

# Calculate RFM metrics
NOW = datetime.now()
rfm = df.groupby('all_id').agg({
    '결제일시': lambda x: (NOW - x.max()).days,
    '주문id': 'count',
    '결제액': 'sum'
}).rename(columns={'결제일시': 'Recency', '주문id': 'Frequency', '결제액': 'Monetary'})

for col in ['Recency', 'Frequency', 'Monetary']:
    total_count = rfm[col].count()
    rfm[f'{col}_text'] = rfm[col].apply(lambda x: f"{x}\n({(x/total_count*100):.2f}%)")

# For Recency, the sorting issue does not apply as it's numerical
fig1 = px.histogram(rfm, x='Recency', title='최근성 분포 (단위: 일)')
fig1.update_layout(xaxis_title='최근성 (일)', yaxis_title='고객 수', bargap=0.2)

add_percentage_annotations(fig1, rfm['Recency'], '명')

# Sort 'Frequency' for plotting
rfm = rfm.sort_values(by=['Frequency'], ascending=False)

# Convert to categorical with ordered categories
rfm['Frequency_cat'] = pd.Categorical(rfm['Frequency'], categories=rfm['Frequency'].unique(), ordered=True)

# Frequency Distribution
fig2 = px.histogram(rfm, x='Frequency_cat', title='빈도 분포')
fig2.update_layout(xaxis_title='빈도', yaxis_title='고객 수', bargap=0.2)
add_percentage_annotations(fig2, rfm['Frequency'], '명')

# Sort 'Monetary' for plotting
rfm = rfm.sort_values(by=['Monetary'])

# Convert to categorical with ordered categories
rfm['Monetary_cat'] = pd.Categorical(rfm['Monetary'], categories=rfm['Monetary'].unique(), ordered=True)

# Monetary Distribution
fig3 = px.histogram(rfm, x='Monetary_cat', title='금액 분포 (단위: 원)')
fig3.update_layout(xaxis_title='금액', yaxis_title='고객 수', bargap=0.2)
add_percentage_annotations(fig3, rfm['Monetary'],'명')
fig1.show()
fig2.show()
fig3.show()

# Optional: Scatter plot for RFM segments (e.g., Recency vs Monetary)
fig4 = px.scatter(rfm, x='Recency', y='Monetary', size='Frequency', color='Frequency',
                  hover_name=rfm.index, title='RFM Scatter Plot: Recency vs Monetary')
fig4.show()

In [5]:
import plotly.graph_objs as go
import pandas as pd
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
from qrytool import load_data_into_dataframe
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

qry = """
    SELECT all_id, 주문id,결제일시::date,MAX(세후순결제금액) as 결제액
FROM salesmatrix
WHERE all_id IS NOT NULL AND 결제일시::date >= TO_DATE('2018-01-01', 'YYYY-MM-DD')
    AND 정산='합산'
GROUP BY all_id,주문id,결제일시::date;
    """
df = load_data_into_dataframe(qry)

# Convert '결제일시' to datetime
df['결제일시'] = pd.to_datetime(df['결제일시'])

NOW = pd.to_datetime('now')

# Recency 계산
df['결제일시'] = pd.to_datetime(df['결제일시'])
recency = df.groupby('all_id')['결제일시'].max().reset_index()
recency['Recency'] = (NOW - recency['결제일시']).dt.days

# Frequency 계산
frequency = df.groupby('all_id').size().reset_index(name='Frequency')

# Monetary 계산
monetary = df.groupby('all_id')['결제액'].sum().reset_index(name='Monetary')

# RFM 데이터프레임 합치기
rfm = recency.merge(frequency, on='all_id').merge(monetary, on='all_id')
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])

kmeans = KMeans(n_clusters=4, random_state=0).fit(rfm_scaled)
rfm['Cluster'] = kmeans.labels_

cluster_summary = rfm.groupby('Cluster').mean()
print(cluster_summary)

import plotly.graph_objects as go

# 3D 산점도 생성
fig = go.Figure()

# 클러스터마다 다른 색상으로 산점도 추가
for cluster in rfm['Cluster'].unique():
    clustered_data = rfm[rfm['Cluster'] == cluster]

    fig.add_trace(go.Scatter3d(
        x=clustered_data['Recency'],
        y=clustered_data['Frequency'],
        z=clustered_data['Monetary'],
        mode='markers',
        marker=dict(size=5),
        name=f'Cluster {cluster}'
    ))

# 차트 레이아웃 설정
fig.update_layout(
    title='RFM 클러스터링 결과',
    scene=dict(
        xaxis_title='Recency',
        yaxis_title='Frequency',
        zaxis_title='Monetary'
    ),
    margin=dict(l=0, r=0, b=0, t=30)
)

fig.show()






               all_id                          결제일시     Recency  Frequency  \
Cluster                                                                      
0         1489.936146 2022-01-28 18:59:31.265678592  779.208666   1.685291   
1        41480.837209 2023-11-30 14:47:26.511628032  108.383721  21.139535   
2        29983.118881 2023-08-20 10:24:20.139859968  210.566434   7.659674   
3        31622.067146 2023-09-15 03:07:37.553956864  184.869704   2.423661   

             Monetary  
Cluster                
0        1.157242e+05  
1        1.556451e+06  
2        8.284516e+05  
3        1.542311e+05  


In [1]:
import plotly.express as px

# Recency & Frequency
fig_rf = px.scatter(rfm, x='Recency', y='Frequency', color='Cluster', title="Recency & Frequency", labels={"Cluster": "Cluster"})
fig_rf.update_layout(xaxis_title='Recency', yaxis_title='Frequency', legend_title="Cluster")
fig_rf.show()

# Frequency & Monetary
fig_fm = px.scatter(rfm, x='Frequency', y='Monetary', color='Cluster', title="Frequency & Monetary", labels={"Cluster": "Cluster"})
fig_fm.update_layout(xaxis_title='Frequency', yaxis_title='Monetary', legend_title="Cluster")
fig_fm.show()

# Recency & Monetary
fig_rm = px.scatter(rfm, x='Recency', y='Monetary', color='Cluster', title="Recency & Monetary", labels={"Cluster": "Cluster"})
fig_rm.update_layout(xaxis_title='Recency', yaxis_title='Monetary', legend_title="Cluster")
fig_rm.show()

NameError: name 'rfm' is not defined

In [24]:
import plotly.graph_objs as go
import pandas as pd
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
from qrytool import load_data_into_dataframe
import numpy as np

qry = """
    SELECT all_id, 주문id,결제일시::date,MAX(세후순결제금액) as 결제액
FROM salesmatrix
WHERE all_id IS NOT NULL AND 결제일시::date >= TO_DATE('2018-01-01', 'YYYY-MM-DD')
    AND 정산='합산'
GROUP BY all_id,주문id,결제일시::date;
    """
df = load_data_into_dataframe(qry)

# Convert '결제일시' to datetime
df['결제일시'] = pd.to_datetime(df['결제일시'])

# Calculate RFM metrics
NOW = datetime.now()
rfm = df.groupby('all_id').agg({
    '결제일시': lambda x: (NOW - x.max()).days,
    '주문id': 'count',
    '결제액': 'sum'
}).rename(columns={'결제일시': 'Recency', '주문id': 'Frequency', '결제액': 'Monetary'})


# Recency 분포
fig_recency = px.histogram(rfm, x='Recency', title='Recency 분포')
fig_recency.add_vline(x=quantiles['Recency'][0.25], line_dash="dash", line_color="green", annotation_text="25%")
fig_recency.add_vline(x=quantiles['Recency'][0.50], line_dash="dash", line_color="blue", annotation_text="50%")
fig_recency.add_vline(x=quantiles['Recency'][0.75], line_dash="dash", line_color="red", annotation_text="75%")
fig_recency.update_layout(xaxis_title='Recency (일)', yaxis_title='고객 수')
fig_recency.show()

# Frequency 분포
fig_frequency = px.histogram(rfm, x='Frequency', title='Frequency 분포')
fig_frequency.add_vline(x=quantiles['Frequency'][0.25], line_dash="dash", line_color="green", annotation_text="25%")
fig_frequency.add_vline(x=quantiles['Frequency'][0.50], line_dash="dash", line_color="blue", annotation_text="50%")
fig_frequency.add_vline(x=quantiles['Frequency'][0.75], line_dash="dash", line_color="red", annotation_text="75%")
fig_frequency.update_layout(xaxis_title='Frequency (횟수)', yaxis_title='고객 수')
fig_frequency.show()

# Monetary 분포
fig_monetary = px.histogram(rfm, x='Monetary', title='Monetary 분포')
fig_monetary.add_vline(x=quantiles['Monetary'][0.25], line_dash="dash", line_color="green", annotation_text="25%")
fig_monetary.add_vline(x=quantiles['Monetary'][0.50], line_dash="dash", line_color="blue", annotation_text="50%")
fig_monetary.add_vline(x=quantiles['Monetary'][0.75], line_dash="dash", line_color="red", annotation_text="75%")
fig_monetary.update_layout(xaxis_title='Monetary (금액)', yaxis_title='고객 수')
fig_monetary.show()

In [34]:
import plotly.graph_objs as go
import pandas as pd
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
from qrytool import load_data_into_dataframe
import numpy as np

qry = """
    SELECT all_id, 주문id,결제일시::date,MAX(세후순결제금액) as 결제액
FROM salesmatrix
WHERE all_id IS NOT NULL AND 결제일시::date >= TO_DATE('2018-01-01', 'YYYY-MM-DD')
    AND 정산='합산'
GROUP BY all_id,주문id,결제일시::date;
    """
df = load_data_into_dataframe(qry)

# Convert '결제일시' to datetime
df['결제일시'] = pd.to_datetime(df['결제일시'])

# Calculate RFM metrics
NOW = datetime.now()
rfm = df.groupby('all_id').agg({
    '결제일시': lambda x: (NOW - x.max()).days,
    '주문id': 'count',
    '결제액': 'sum'
}).rename(columns={'결제일시': 'Recency', '주문id': 'Frequency', '결제액': 'Monetary'})


def create_quantile_chart(df, column, title, xaxis_title):
    # 사분위수 계산
    q25, q50, q75 = df[column].quantile([0.25, 0.5, 0.75])
    # 구간별 라벨
    labels = [f"<= {q25:.0f}", f"{q25:.0f} - {q50:.0f}", f"{q50:.0f} - {q75:.0f}", f"> {q75:.0f}"]
    # 구간별 고객 수 계산
    df[f"{column}Quartile"] = pd.cut(df[column], bins=[-np.inf, q25, q50, q75, np.inf], labels=labels)
    counts = df[f"{column}Quartile"].value_counts().reindex(labels)

    # 차트 생성
    fig = go.Figure()
    total_max_count = counts.max() * 1.1  # 최대 고객 수 기준
    for label, count in counts.items():
        fig.add_trace(go.Bar(x=[label], y=[count], name=label, text=[f"{count}명"], textposition='auto'))

    # 사분위 경계 표시 및 값 라벨링 조정
    quantiles = [q25, q50, q75]
    for i, q in enumerate(quantiles):
        # 사분위 경계에 세로선 추가
        fig.add_vline(x=i + 0.5, line_dash="dash", line_color="lightgrey")
        # 배경색이 있는 텍스트 박스 추가
        # fig.add_shape(type="rect",
        #               x0=i + 0.5, y0=-0.04 * total_max_count, x1=i + 0.7, y1=0,
        #               line=dict(color="white"),
        #               fillcolor="white")
        # 사분위 경계값 표시
        if column == 'Recency':
            fig.add_annotation(x=i + 0.5, y=-0.01 * total_max_count, text=f"{q/7:.1f}주 - {q/30:.1f}개월", showarrow=False, font=dict(color="blue", size=12), xshift=-15)
        elif column == 'Frequency':
            fig.add_annotation(x=i + 0.5, y=-0.01 * total_max_count, text=f"{q:.0f}회", showarrow=False, font=dict(color="blue", size=12), xshift=-15)
        else:
            fig.add_annotation(x=i + 0.5, y=-0.01 * total_max_count, text=f"{q:.0f}원", showarrow=False, font=dict(color="blue", size=12), xshift=-15)


    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title='고객 수', bargap=0.2)
    return fig

# 예시 데이터 생성 및 차트 생성 호출 예시
# rfm 데이터프레임 생성 코드는 생략됩니다. create_quantile_chart 함수 호출 전에 해당 데이터프레임을 준비해야 합니다.


fig_recency = create_quantile_chart(rfm, 'Recency', 'Recency 분포 - 사분위 별 구간 및 고객 수', 'Recency (일)')
fig_frequency = create_quantile_chart(rfm, 'Frequency', 'Frequency 분포 - 사분위 별 구간 및 고객 수', 'Frequency (횟수)')
fig_monetary = create_quantile_chart(rfm, 'Monetary', 'Monetary 분포 - 사분위 별 구간 및 고객 수', 'Monetary (금액)')

fig_recency.show()
fig_frequency.show()
fig_monetary.show()

In [47]:
import plotly.graph_objs as go
import pandas as pd
from datetime import datetime
import plotly.graph_objects as go
import plotly.express as px
from qrytool import load_data_into_dataframe
import numpy as np

qry = """
    SELECT all_id, 주문id,결제일시::date,MAX(세후순결제금액) as 결제액
FROM salesmatrix
WHERE all_id IS NOT NULL AND 결제일시::date >= TO_DATE('2018-01-01', 'YYYY-MM-DD')
    AND 정산='합산'
GROUP BY all_id,주문id,결제일시::date;
    """
df = load_data_into_dataframe(qry)

# Convert '결제일시' to datetime
df['결제일시'] = pd.to_datetime(df['결제일시'])

# Calculate RFM metrics
NOW = datetime.now()
rfm = df.groupby('all_id').agg({
    '결제일시': lambda x: (NOW - x.max()).days,
    '주문id': 'count',
    '결제액': 'sum'
}).rename(columns={'결제일시': 'Recency', '주문id': 'Frequency', '결제액': 'Monetary'})
# 사분위수 계산
quantiles = rfm.quantile(q=[0.25, 0.5, 0.75])


# R, F, M 각 요소별 점수 구간 표 생성
score_intervals = pd.DataFrame({
    'Score': [1, 2, 3, 4],
    'Recency': [
        f" > {quantiles['Recency'][0.75]} days",
        f"{quantiles['Recency'][0.50]} - {quantiles['Recency'][0.75]} days",
        f"{quantiles['Recency'][0.25]} - {quantiles['Recency'][0.50]} days",
        f"<= {quantiles['Recency'][0.25]} days"
    ],
    'Frequency': [
        f"<= {quantiles['Frequency'][0.25]}",
        f"{quantiles['Frequency'][0.25]} - {quantiles['Frequency'][0.50]}",
        f"{quantiles['Frequency'][0.50]} - {quantiles['Frequency'][0.75]}",
        f" > {quantiles['Frequency'][0.75]}"
    ],
    'Monetary': [
        f"<= {quantiles['Monetary'][0.25]}",
        f"{quantiles['Monetary'][0.25]} - {quantiles['Monetary'][0.50]}",
        f"{quantiles['Monetary'][0.50]} - {quantiles['Monetary'][0.75]}",
        f" > {quantiles['Monetary'][0.75]}"
    ]
})

# 점수 구간 표 출력
print("R, F, M 각 요소별 점수 구간:")
display(score_intervals)

# RFM 점수 계산 함수
def r_score(x, p, d):
    if x <= d[p][0.25]:
        return 4
    elif x <= d[p][0.50]:
        return 3
    elif x <= d[p][0.75]:
        return 2
    else:
        return 1


def fm_score(x, p, d):
    if x <= d[p][0.25]:
        return 1
    elif x <= d[p][0.50]:
        return 2
    elif x <= d[p][0.75]:
        return 3
    else:
        return 4


# 점수 부여
rfm['R_Score'] = rfm['Recency'].apply(r_score, args=('Recency', quantiles,))
rfm['F_Score'] = rfm['Frequency'].apply(fm_score, args=('Frequency', quantiles,))
rfm['M_Score'] = rfm['Monetary'].apply(fm_score, args=('Monetary', quantiles,))

# 종합 RFM 점수 계산
rfm['RFM_Score'] = rfm[['R_Score', 'F_Score', 'M_Score']].mean(axis=1)

# 점수에 따른 고객 등급 분류


def assign_rfm_level(df):
    if df['RFM_Score'] > 3:
        return '1등급(Green)'
    elif df['RFM_Score'] > 2:
        return '2등급(Blue)'
    elif df['RFM_Score'] > 1:
        return '3등급(Orange)'
    else:
        return '4등급(Red)'


rfm_level_colors = {
    '1등급(Green)': 'green',
    '2등급(Blue)': 'blue',
    '3등급(Orange)': 'orange',
    '4등급(Red)': 'red'
}

rfm['RFM_Level'] = rfm.apply(assign_rfm_level, axis=1)
display(rfm)

# RFM_Level 별로 고객 수 계산
rfm_level_summary = rfm['RFM_Level'].value_counts().sort_index()

# 전체 고객 수 계산
total_customers = rfm_level_summary.sum()
# RFM_Level 별 고객 수 및 퍼센트 계산
rfm_level_counts = rfm['RFM_Level'].value_counts().sort_index()
rfm_level_percent = (rfm_level_counts / total_customers * 100).round(2)

# 요약된 데이터 프레임 생성
summary_df = pd.DataFrame({
    'Count': rfm_level_counts,
    'Percent': rfm_level_percent
})

# 요약된 데이터 프린트
print("RFM 등급별 요약:")
print(summary_df)
# RFM_Level 별 고객 수 계산
rfm_level_counts = rfm['RFM_Level'].value_counts().sort_index()

# 바 차트 생성
fig = go.Figure()


# RFM_Level 별로 반복하여 바 추가 및 텍스트 설정
for level in rfm_level_counts.index:
    count = rfm_level_counts[level]
    percent = rfm_level_percent[level]
    fig.add_trace(go.Bar(
        x=[level],
        y=[count],
        name=level,
        marker_color=rfm_level_colors[level],  # 색상 적용
        text=f"{count}명 ({percent}%)",  # 총 고객 수 및 퍼센트 표시
        textposition='outside'  # 텍스트 위치
    ))
# # RFM_Level 별로 반복하여 바 추가
# for level, count in rfm_level_counts.items():
#     fig.add_trace(go.Bar(x=[level], y=[count], name=level))

# 차트 레이아웃 설정
fig.update_layout(
    title='RFM 등급별 고객 수',
    xaxis=dict(title='RFM 등급'),
    yaxis=dict(title='고객 수'),
    bargap=0.2,
        yaxis_range=[0, max(rfm_level_counts) * 1.2]

)

fig.show()

R, F, M 각 요소별 점수 구간:


Unnamed: 0,Score,Recency,Frequency,Monetary
0,1,> 621.0 days,<= 1.0,<= 36000.0
1,2,328.5 - 621.0 days,1.0 - 2.0,36000.0 - 122375.0
2,3,57.0 - 328.5 days,2.0 - 4.0,122375.0 - 394200.0
3,4,<= 57.0 days,> 4.0,> 394200.0


Unnamed: 0_level_0,Recency,Frequency,Monetary,R_Score,F_Score,M_Score,RFM_Score,RFM_Level
all_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,275,9,1940000.0,3,4,4,3.666667,1등급(Green)
2.0,915,7,2070000.0,1,4,4,3.000000,2등급(Blue)
3.0,671,14,1894000.0,1,4,4,3.000000,2등급(Blue)
4.0,1125,6,751500.0,1,4,4,3.000000,2등급(Blue)
7.0,536,5,1166750.0,2,4,4,3.333333,1등급(Green)
...,...,...,...,...,...,...,...,...
52615.0,3,1,26500.0,4,1,1,2.000000,3등급(Orange)
52616.0,3,1,53000.0,4,1,2,2.333333,2등급(Blue)
52617.0,3,1,53000.0,4,1,2,2.333333,2등급(Blue)
52620.0,3,1,53000.0,4,1,2,2.333333,2등급(Blue)


RFM 등급별 요약:
             Count  Percent
1등급(Green)     710    26.89
2등급(Blue)      726    27.50
3등급(Orange)    905    34.28
4등급(Red)       299    11.33
