Clean data scrapped from Octuparse

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import re
import os
import itertools
from tqdm.auto import tqdm
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.io as pio
from typing import Any, Dict, List, Union

In [2]:
data_book_links = pd.read_excel("kindle/fixed_full_book_url_list v2-2 usable.xlsx")
data_book_info = pd.read_excel("kindle/1dep_book_info_raw.xlsx")
data_book_info_hardcover = pd.read_excel("kindle/1dep_book_info_raw hardcover.xlsx")
data_book_info = data_book_info.drop(columns=["format", "hardcover_list_price", "hardcover_selling_price"])
price_columns = ["kindle_selling_price", "kindle_original_price", "paperback_selling_price", "paperback_original_price"]
price_columns_hardcover = ["hardcover_list_price", "hardcover_selling_price"]


def convert_price(price: str):
    try:
        return float(price.replace("US$", "").replace("$", ""))
    except ValueError:
        return None


for col in price_columns:
    data_book_info[col] = data_book_info[col].astype(str).apply(convert_price).astype("Float64")

for col in price_columns_hardcover:
    data_book_info_hardcover[col] = data_book_info_hardcover[col].astype(str).apply(convert_price).astype("Float64")


def extract_kindle_rank(rank_str):
    match = re.search(r"暢銷商品排名:  #([\d,]+) 在 Kindle Store|Best Sellers Rank:  #([\d,]+) in Kindle Store", rank_str)
    if match:
        rank_num = match.group(1) or match.group(2)
        rank_num = rank_num.replace(",", "")
        return int(rank_num)
    return None


def extract_physical_rank(rank_str):
    match = re.search(r"暢銷商品排名:  #([\d,]+) 在 圖書|Best Sellers Rank:  #([\d,]+) in Books", rank_str)
    if match:
        rank_num = match.group(1) or match.group(2)
        rank_num = rank_num.replace(",", "")
        return int(rank_num)
    return None


data_book_info["kindle-whole_rank"] = data_book_info["kindle_whole_rank"].astype(str).apply(extract_kindle_rank).astype("Float64")
data_book_info["paperback-whole_rank"] = data_book_info["paperback_info"].astype(str).apply(extract_physical_rank).astype("Float64")
data_book_info_hardcover["hardcover-whole_rank"] = data_book_info_hardcover["hardcover_info"].astype(str).apply(extract_physical_rank).astype("Float64")

data_book_info_hardcover

Unnamed: 0,Original_URL,Text,hardcover_list_price,hardcover_selling_price,hardcover_link,hardcover_info,hardcover-whole_rank
0,https://www.amazon.com/Notes-Underground-Origi...,Product details\n \n A...,,,,,
1,https://www.amazon.com/fascinating-readable-hi...,Product details\n \n A...,,,,,
2,https://www.amazon.com/Old-Hollywood-Untold-st...,Product details\n \n A...,,,,,
3,https://www.amazon.com/One-Last-Breath-psychol...,Product details\n \n A...,,,,,
4,https://www.amazon.com/North-Country-Murder-Ir...,Product details\n \n A...,31.99,27.07,https://www.amazon.com/North-Country-Murder-Ir...,Product details\n \n P...,3602909.0
...,...,...,...,...,...,...,...
2841,https://www.amazon.com/Cruise-Ship-SOS-life-sa...,,,,,,
2842,https://www.amazon.com/Neither-here-nor-there-...,,,9.9,https://www.amazon.com/Neither-Here-Nor-There-...,Product details\n \n P...,605606.0
2843,https://www.amazon.com/Duck-Season-Drinking-Mi...,,,,,,
2844,https://www.amazon.com/Born-Run-Christopher-Mc...,,,,,,


In [3]:
print(data_book_links.info())
print(data_book_info.info())
data_kindle_1dep_ = pd.merge(
    data_book_links[["title", "category", "book_info_link"]],
    data_book_info[["kindle_selling_price", "kindle_original_price", "kindle-whole_rank", "paperback_selling_price", "paperback_original_price", "paperback-whole_rank", "original_kindle_link"]],
    left_on="book_info_link",
    right_on="original_kindle_link",
    how="left",
)

data_kindle_1dep = pd.merge(
    data_kindle_1dep_,
    data_book_info_hardcover[["hardcover_selling_price", "hardcover_list_price", "hardcover-whole_rank", "Original_URL"]],
    left_on="book_info_link",
    right_on="Original_URL",
    how="left",
)

print(data_kindle_1dep.info())
data_kindle_1dep

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2885 entries, 0 to 2884
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   title                        2885 non-null   object
 1   category_rank                2885 non-null   object
 2   format_kindle                2885 non-null   object
 3   book_info_link               2885 non-null   object
 4   located_category_page_link   2885 non-null   object
 5   located_category_page_title  2885 non-null   object
 6   Current_time                 2885 non-null   object
 7   category                     2885 non-null   object
 8   rank                         2885 non-null   int64 
dtypes: int64(1), object(8)
memory usage: 203.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2873 entries, 0 to 2872
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -

Unnamed: 0,title,category,book_info_link,kindle_selling_price,kindle_original_price,kindle-whole_rank,paperback_selling_price,paperback_original_price,paperback-whole_rank,original_kindle_link,hardcover_selling_price,hardcover_list_price,hardcover-whole_rank,Original_URL
0,"Da Vinci's Ghost: Genius, Obsession, and How L...",Arts & Photography,https://www.amazon.com/Vincis-Ghost-Obsession-...,9.99,18.0,60324.0,13.58,18.0,918176.0,https://www.amazon.com/Vincis-Ghost-Obsession-...,18.91,26.99,1397691.0,https://www.amazon.com/Vincis-Ghost-Obsession-...
1,My Effin' Life,Arts & Photography,https://www.amazon.com/My-Effin-Life-Geddy-Lee...,21.99,40.0,12518.0,20.97,40.0,4007.0,https://www.amazon.com/My-Effin-Life-Geddy-Lee...,20.97,40.0,6391.0,https://www.amazon.com/My-Effin-Life-Geddy-Lee...
2,London Uncovered: Sixty Unusual Places to Explore,Arts & Photography,https://www.amazon.com/London-Uncovered-Mark-D...,36.03,49.99,280649.0,37.93,50.0,2084598.0,https://www.amazon.com/London-Uncovered-Mark-D...,39.23,50.0,306891.0,https://www.amazon.com/London-Uncovered-Mark-D...
3,Brat: An '80s Story,Arts & Photography,https://www.amazon.com/Brat-80s-Story-Andrew-M...,11.99,18.99,9790.0,6.99,18.99,12944.0,https://www.amazon.com/Brat-80s-Story-Andrew-M...,14.63,28.0,61487.0,https://www.amazon.com/Brat-80s-Story-Andrew-M...
4,The Storyteller: Tales of Life and Music: A Me...,Arts & Photography,https://www.amazon.com/Storyteller-Tales-Life-...,2.99,21.99,5797.0,14.47,21.99,8208.0,https://www.amazon.com/Storyteller-Tales-Life-...,14.49,29.99,3979.0,https://www.amazon.com/Storyteller-Tales-Life-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2880,Orca: Sailing Around the World (Five Oceans),Travel,https://www.amazon.com/Orca-John-Pennington-eb...,3.99,,72952.0,,,,https://www.amazon.com/Orca-John-Pennington-eb...,,,,https://www.amazon.com/Orca-John-Pennington-eb...
2881,Neither here nor there: Travels in Europe,Travel,https://www.amazon.com/Neither-here-nor-there-...,14.99,,130307.0,6.99,18.99,,https://www.amazon.com/Neither-here-nor-there-...,9.9,,605606.0,https://www.amazon.com/Neither-here-nor-there-...
2882,Death in Yellowstone: Accidents and Foolhardin...,Travel,https://www.amazon.com/Death-Yellowstone-Accid...,8.99,9.99,70687.0,,,,https://www.amazon.com/Death-Yellowstone-Accid...,,,,https://www.amazon.com/Death-Yellowstone-Accid...
2883,FORTY NIGHTS IN ARUBA,Travel,https://www.amazon.com/FORTY-NIGHTS-ARUBA-BEA-...,2.99,11.95,75871.0,,,,https://www.amazon.com/FORTY-NIGHTS-ARUBA-BEA-...,,,,https://www.amazon.com/FORTY-NIGHTS-ARUBA-BEA-...


In [4]:
data_kindle_1dep = data_kindle_1dep.rename(
    columns={
        "kindle_selling_price": "kindle-selling_price",
        "kindle_original_price": "kindle-original_price",
        "paperback_selling_price": "paperback-selling_price",
        "paperback_original_price": "paperback-original_price",
        "hardcover_selling_price": "hardcover-selling_price",
        "hardcover_list_price": "hardcover-original_price",
    }
)
data_kindle_1dep.head()

Unnamed: 0,title,category,book_info_link,kindle-selling_price,kindle-original_price,kindle-whole_rank,paperback-selling_price,paperback-original_price,paperback-whole_rank,original_kindle_link,hardcover-selling_price,hardcover-original_price,hardcover-whole_rank,Original_URL
0,"Da Vinci's Ghost: Genius, Obsession, and How L...",Arts & Photography,https://www.amazon.com/Vincis-Ghost-Obsession-...,9.99,18.0,60324.0,13.58,18.0,918176.0,https://www.amazon.com/Vincis-Ghost-Obsession-...,18.91,26.99,1397691.0,https://www.amazon.com/Vincis-Ghost-Obsession-...
1,My Effin' Life,Arts & Photography,https://www.amazon.com/My-Effin-Life-Geddy-Lee...,21.99,40.0,12518.0,20.97,40.0,4007.0,https://www.amazon.com/My-Effin-Life-Geddy-Lee...,20.97,40.0,6391.0,https://www.amazon.com/My-Effin-Life-Geddy-Lee...
2,London Uncovered: Sixty Unusual Places to Explore,Arts & Photography,https://www.amazon.com/London-Uncovered-Mark-D...,36.03,49.99,280649.0,37.93,50.0,2084598.0,https://www.amazon.com/London-Uncovered-Mark-D...,39.23,50.0,306891.0,https://www.amazon.com/London-Uncovered-Mark-D...
3,Brat: An '80s Story,Arts & Photography,https://www.amazon.com/Brat-80s-Story-Andrew-M...,11.99,18.99,9790.0,6.99,18.99,12944.0,https://www.amazon.com/Brat-80s-Story-Andrew-M...,14.63,28.0,61487.0,https://www.amazon.com/Brat-80s-Story-Andrew-M...
4,The Storyteller: Tales of Life and Music: A Me...,Arts & Photography,https://www.amazon.com/Storyteller-Tales-Life-...,2.99,21.99,5797.0,14.47,21.99,8208.0,https://www.amazon.com/Storyteller-Tales-Life-...,14.49,29.99,3979.0,https://www.amazon.com/Storyteller-Tales-Life-...


In [5]:
def fill_prices(row):
    for format in ["kindle", "paperback", "hardcover"]:
        selling_price_col = f"{format}-selling_price"
        original_price_col = f"{format}-original_price"

        if pd.isna(row[original_price_col]) and not pd.isna(row[selling_price_col]):
            row[original_price_col] = row[selling_price_col]
        elif not pd.isna(row[original_price_col]) and not pd.isna(row[selling_price_col]) and row[original_price_col] < row[selling_price_col]:
            row[original_price_col] = row[selling_price_col]

    return row


data_kindle_1dep = data_kindle_1dep.apply(fill_prices, axis=1)

In [10]:
data_kindle_1dep["kindle_over_paperback-selling_price"] = data_kindle_1dep["kindle-selling_price"] / data_kindle_1dep["paperback-selling_price"]
data_kindle_1dep["kindle_over_paperback-original_price"] = data_kindle_1dep["kindle-original_price"] / data_kindle_1dep["paperback-original_price"]

# 計算 Kindle Over Hardcover 比率
data_kindle_1dep["kindle_over_hardcover-selling_price"] = data_kindle_1dep["kindle-selling_price"] / data_kindle_1dep["hardcover-selling_price"]
data_kindle_1dep["kindle_over_hardcover-original_price"] = data_kindle_1dep["kindle-original_price"] / data_kindle_1dep["hardcover-original_price"]

# 計算 Hardcover Over Paperback 比率
data_kindle_1dep["hardcover_over_paperback-selling_price"] = data_kindle_1dep["hardcover-selling_price"] / data_kindle_1dep["paperback-selling_price"]
data_kindle_1dep["hardcover_over_paperback-original_price"] = data_kindle_1dep["hardcover-original_price"] / data_kindle_1dep["paperback-original_price"]


# discount rate

data_kindle_1dep["kindle-discount_rate"] = data_kindle_1dep["kindle-selling_price"] / data_kindle_1dep["kindle-original_price"]
data_kindle_1dep["paperback-discount_rate"] = data_kindle_1dep["paperback-selling_price"] / data_kindle_1dep["paperback-original_price"]
data_kindle_1dep["hardcover-discount_rate"] = data_kindle_1dep["hardcover-selling_price"] / data_kindle_1dep["hardcover-original_price"]

In [10]:
data_kindle_1dep.iloc[0]

title                                   Da Vinci's Ghost: Genius, Obsession, and How L...
category                                                               Arts & Photography
book_info_link                          https://www.amazon.com/Vincis-Ghost-Obsession-...
kindle-selling_price                                                                 9.99
kindle-original_price                                                                18.0
kindle-whole_rank                                                                 60324.0
paperback-selling_price                                                             13.58
paperback-original_price                                                             18.0
paperback-whole_rank                                                             918176.0
original_kindle_link                    https://www.amazon.com/Vincis-Ghost-Obsession-...
hardcover-selling_price                                                             18.91
hardcover-

In [11]:
def plot_category_bar_plot(data: pd.DataFrame):
    category_counts = data["category"].value_counts().reset_index()
    category_counts.columns = ["category", "count"]

    # 繪製長條圖
    fig = px.bar(category_counts, x="category", y="count", labels={"category": "Category", "count": "Frequency"}, title="Distribution of Category 1dep in data_2dep_top100")

    # 顯示圖表
    fig.show()


plot_category_bar_plot(data_kindle_1dep)

In [14]:
def plot_book_price_scatter(data: pd.DataFrame):
    # 調整價格的方法
    def adjust_price(df, column):
        adjusted_column = f"{column}-adjusted"
        df[adjusted_column] = df[column].apply(lambda x: min(x, 100))
        return adjusted_column

    # 設置軸的範圍和單位長
    min_val = 0
    max_val = 100
    buffer = 5  # 預留邊緣緩衝區
    min_val -= buffer
    max_val += buffer

    # 各種版本的組合
    combinations = [("paperback", "kindle"), ("hardcover", "kindle"), ("paperback", "hardcover")]

    for version1, version2 in combinations:
        # 篩選掉缺少必要數據的行
        filtered_data = data.dropna(subset=[f"{version1}-original_price", f"{version2}-original_price"])

        # 調整價格
        adjusted_data = filtered_data.copy()
        adjusted_version1_original = adjust_price(adjusted_data, f"{version1}-original_price")
        adjusted_version2_original = adjust_price(adjusted_data, f"{version2}-original_price")

        # 創建子圖（調整價格）
        fig = make_subplots(rows=1, cols=2, subplot_titles=("Original Price", "Selling Price"), x_title=f"{version1.capitalize()} Price", y_title=f"{version2.capitalize()} Price")

        # 原價-調整
        scatter1 = px.scatter(adjusted_data, x=adjusted_version1_original, y=adjusted_version2_original)
        scatter1.update_traces(showlegend=False)
        fig.add_traces(scatter1.data, rows=1, cols=1)

        # 售價-調整
        scatter2 = px.scatter(adjusted_data, x=f"{version1}-selling_price", y=f"{version2}-selling_price")
        scatter2.update_traces(showlegend=False)
        fig.add_traces(scatter2.data, rows=1, cols=2)

        # 更新子圖佈局，設置軸範圍和單位長
        fig.update_layout(
            height=600,
            width=1200,
            title_text=f"Comparison of {version1.capitalize()} and {version2.capitalize()} Prices",
            xaxis1=dict(range=[min_val, max_val]),
            yaxis1=dict(range=[min_val, max_val]),
            xaxis2=dict(range=[min_val, max_val]),
            yaxis2=dict(range=[min_val, max_val]),
        )

        # 添加45度線
        for i in range(1, 3):
            fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val], mode="lines", line=dict(color="red", dash="dash"), showlegend=False), row=1, col=i)

        fig.write_image(f"0808_figs knidle_1dep 3formats/price scatter - {version1} - {version2}.svg")
        fig.show()


# 假設你的數據存在 data_kindle_1dep
plot_book_price_scatter(data_kindle_1dep)

In [38]:
def plot_discount_rate_scatter(data: pd.DataFrame):
    filtered_data = data.dropna(subset=["paperback-discount_rate", "kindle-discount_rate"])

    # 設置軸的範圍和單位長
    min_val = 0
    max_val = max(filtered_data["paperback-discount_rate"].max(), filtered_data["kindle-discount_rate"].max())
    buffer = (max_val - min_val) * 0.05  # 預留邊緣緩衝區
    max_val += buffer

    # 創建散佈圖
    fig = px.scatter(
        filtered_data,
        x="paperback-discount_rate",
        y="kindle-discount_rate",
        labels={"paperback-discount_rate": "Paperback Discount Rate", "kindle-discount_rate": "Kindle Discount Rate"},
        title="Discount Rates for Paperback and kindle",
    )

    fig.update_layout(xaxis=dict(scaleanchor="y", scaleratio=1, range=[min_val, max_val]), yaxis=dict(range=[min_val, max_val]), width=600, height=600)  # 保證圖形是正方形

    # 添加45度線
    fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val], mode="lines", line=dict(color="red", dash="dash"), showlegend=False))
    fig.write_image("0808_figs knidle_1dep 3formats/discount_rate_scatter.svg")
    fig.show()


plot_discount_rate_scatter(data_kindle_1dep)

In [21]:
def plot_discount_rate_scatter(data: pd.DataFrame):
    # 設置軸的範圍和單位長
    min_val = -0.05
    max_val = 1.05

    # 創建子圖布局
    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=("Kindle vs Paperback", "Kindle vs Hardcover", "Hardcover vs Paperback", ""),
        x_title="Discount Rate",
        y_title="Discount Rate",
        vertical_spacing=0.1,
        horizontal_spacing=0.1,
    )

    # Kindle vs Paperback
    filtered_data_1 = data.dropna(subset=["paperback-discount_rate", "kindle-discount_rate"])
    scatter1 = px.scatter(filtered_data_1, x="paperback-discount_rate", y="kindle-discount_rate")
    scatter1.update_traces(showlegend=False)
    fig.add_traces(scatter1.data, rows=1, cols=1)

    # Kindle vs Hardcover
    filtered_data_2 = data.dropna(subset=["hardcover-discount_rate", "kindle-discount_rate"])
    scatter2 = px.scatter(filtered_data_2, x="hardcover-discount_rate", y="kindle-discount_rate")
    scatter2.update_traces(showlegend=False)
    fig.add_traces(scatter2.data, rows=1, cols=2)

    # Hardcover vs Paperback
    filtered_data_3 = data.dropna(subset=["paperback-discount_rate", "hardcover-discount_rate"])
    scatter3 = px.scatter(filtered_data_3, x="paperback-discount_rate", y="hardcover-discount_rate")
    scatter3.update_traces(showlegend=False)
    fig.add_traces(scatter3.data, rows=2, cols=1)

    # 更新子圖佈局，設置軸範圍和單位長
    fig.update_layout(
        height=1200,
        width=1200,
        title_text="Discount Rates Comparison for Different Book Formats",
        xaxis1=dict(range=[min_val, max_val]),
        yaxis1=dict(range=[min_val, max_val]),
        xaxis2=dict(range=[min_val, max_val]),
        yaxis2=dict(range=[min_val, max_val]),
        xaxis3=dict(range=[min_val, max_val]),
        yaxis3=dict(range=[min_val, max_val]),
    )

    # 添加45度線
    for i in range(1, 4):
        fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val], mode="lines", line=dict(color="red", dash="dash"), showlegend=False), row=(i + 1) // 2, col=(i + 1) % 2 + 1)

    fig.write_image("0808_figs knidle_1dep 3formats/discount_rate_scatter.svg")
    fig.show()


# 假設你的數據存在 data_kindle_1dep
plot_discount_rate_scatter(data_kindle_1dep)

In [30]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [7]:
def plot_discount_rate_scatter(data: pd.DataFrame):
    # 設置軸的範圍和單位長
    min_val = -0.05
    max_val = 1.05

    # 創建子圖布局
    fig = make_subplots(rows=2, cols=2, subplot_titles=("Kindle vs Paperback", "Kindle vs Hardcover", "Hardcover vs Paperback", ""), vertical_spacing=0.15, horizontal_spacing=0.1)

    # Kindle vs Paperback
    filtered_data_1 = data.dropna(subset=["paperback-discount_rate", "kindle-discount_rate"])
    scatter1 = px.scatter(filtered_data_1, x="paperback-discount_rate", y="kindle-discount_rate")
    scatter1.update_traces(showlegend=False)
    fig.add_traces(scatter1.data, rows=1, cols=1)

    fig.update_xaxes(title_text="Paperback Discount Rate", row=1, col=1)
    fig.update_yaxes(title_text="Kindle Discount Rate", row=1, col=1)

    # Kindle vs Hardcover
    filtered_data_2 = data.dropna(subset=["hardcover-discount_rate", "kindle-discount_rate"])
    scatter2 = px.scatter(filtered_data_2, x="hardcover-discount_rate", y="kindle-discount_rate")
    scatter2.update_traces(showlegend=False)
    fig.add_traces(scatter2.data, rows=1, cols=2)

    fig.update_xaxes(title_text="Hardcover Discount Rate", row=1, col=2)
    fig.update_yaxes(title_text="Kindle Discount Rate", row=1, col=2)

    # Hardcover vs Paperback
    filtered_data_3 = data.dropna(subset=["paperback-discount_rate", "hardcover-discount_rate"])
    scatter3 = px.scatter(filtered_data_3, x="paperback-discount_rate", y="hardcover-discount_rate")
    scatter3.update_traces(showlegend=False)
    fig.add_traces(scatter3.data, rows=2, cols=1)

    fig.update_xaxes(title_text="Paperback Discount Rate", row=2, col=1)
    fig.update_yaxes(title_text="Hardcover Discount Rate", row=2, col=1)

    # 更新子圖佈局，設置軸範圍和單位長
    fig.update_layout(
        height=800,
        width=800,
        title_text="Discount Rates Comparison for Different Book Formats",
        xaxis1=dict(range=[min_val, max_val]),
        yaxis1=dict(range=[min_val, max_val]),
        xaxis2=dict(range=[min_val, max_val]),
        yaxis2=dict(range=[min_val, max_val]),
        xaxis3=dict(range=[min_val, max_val]),
        yaxis3=dict(range=[min_val, max_val]),
    )

    # 添加45度線
    for i in range(1, 4):
        fig.add_trace(go.Scatter(x=[min_val, max_val], y=[min_val, max_val], mode="lines", line=dict(color="red", dash="dash"), showlegend=False), row=(i + 1) // 2, col=(i + 1) % 2 + 1)

    fig.write_image("0808_figs knidle_1dep 3formats/discount_rate_scatter.svg")
    fig.show()


# 假設你的數據存在 data_kindle_1dep
plot_discount_rate_scatter(data_kindle_1dep)

In [52]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 篩選掉缺少必要數據的行
filtered_data = data_kindle_1dep.dropna(subset=["kindle_over_paperback-selling_price", "kindle_over_paperback-original_price"])
percentiles = [0.2, 0.4, 0.6, 0.8]
quantile_values = filtered_data["kindle_over_paperback-selling_price"].quantile(percentiles)
print(quantile_values)

# 設置軸的範圍和單位長
min_val = min(filtered_data["kindle_over_paperback-selling_price"].min(), filtered_data["kindle_over_paperback-original_price"].min())
max_val = max(filtered_data["kindle_over_paperback-selling_price"].max(), filtered_data["kindle_over_paperback-original_price"].max())
buffer = (max_val - min_val) * 0.10  # 預留邊緣緩衝區
min_val -= buffer
max_val += buffer

# 創建子圖
fig = make_subplots(rows=1, cols=2, subplot_titles=("Kindle Over Paperback Selling Price", "Kindle Over Paperback Original Price"), x_title="Ratio", y_title="Count")

# 售價直方圖
hist1 = px.histogram(filtered_data, x="kindle_over_paperback-selling_price", nbins=30, text_auto=True)
hist1.update_traces(showlegend=False)
fig.add_traces(hist1.data, rows=1, cols=1)

# 原價直方圖
hist2 = px.histogram(filtered_data, x="kindle_over_paperback-original_price", nbins=30, text_auto=True)
hist2.update_traces(showlegend=False)
fig.add_traces(hist2.data, rows=1, cols=2)

# 更新子圖佈局，設置軸範圍和單位長
fig.update_layout(height=600, width=1200, title_text="Histograms of Kindle Over Paperback Prices")

fig.update_xaxes(range=[min_val, max_val], row=1, col=1)
fig.update_xaxes(range=[min_val, max_val], row=1, col=2)
fig.update_yaxes(range=[0, 850], row=1, col=1)
fig.update_yaxes(range=[0, 850], row=1, col=2)
fig.write_image("0808_figs knidle_1dep 3formats/kindle_over_paperback_histogram_initial.svg")
fig.show()

0.2    0.321896
0.4    0.553138
0.6    0.769065
0.8    0.972184
Name: kindle_over_paperback-selling_price, dtype: object


In [14]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd


def plot_price_histogram(data: pd.DataFrame):
    # 篩選掉缺少必要數據的行
    filtered_data = data.dropna(
        subset=[
            "kindle_over_paperback-selling_price",
            "kindle_over_paperback-original_price",
            "kindle_over_hardcover-selling_price",
            "kindle_over_hardcover-original_price",
            "hardcover_over_paperback-selling_price",
            "hardcover_over_paperback-original_price",
        ]
    )

    # 設置軸的範圍和單位長
    all_prices = pd.concat(
        [
            filtered_data["kindle_over_paperback-selling_price"],
            filtered_data["kindle_over_paperback-original_price"],
            filtered_data["kindle_over_hardcover-selling_price"],
            filtered_data["kindle_over_hardcover-original_price"],
            filtered_data["hardcover_over_paperback-selling_price"],
            filtered_data["hardcover_over_paperback-original_price"],
        ]
    )
    min_val = all_prices.min()
    max_val = all_prices.max()
    buffer = (max_val - min_val) * 0.10  # 預留邊緣緩衝區
    min_val -= buffer
    max_val += buffer

    # 創建子圖
    fig = make_subplots(
        rows=3,
        cols=2,
        subplot_titles=(
            "Kindle Over Paperback Selling Price",
            "Kindle Over Paperback Original Price",
            "Kindle Over Hardcover Selling Price",
            "Kindle Over Hardcover Original Price",
            "Hardcover Over Paperback Selling Price",
            "Hardcover Over Paperback Original Price",
        ),
        vertical_spacing=0.1,
        horizontal_spacing=0.1,
    )

    # 售價直方圖
    hist1 = px.histogram(filtered_data, x="kindle_over_paperback-selling_price", nbins=30, text_auto=True)
    hist1.update_traces(showlegend=False)
    fig.add_traces(hist1.data, rows=1, cols=1)

    # 原價直方圖
    hist2 = px.histogram(filtered_data, x="kindle_over_paperback-original_price", nbins=30, text_auto=True)
    hist2.update_traces(showlegend=False)
    fig.add_traces(hist2.data, rows=1, cols=2)

    # 售價直方圖
    hist3 = px.histogram(filtered_data, x="kindle_over_hardcover-selling_price", nbins=30, text_auto=True)
    hist3.update_traces(showlegend=False)
    fig.add_traces(hist3.data, rows=2, cols=1)

    # 原價直方圖
    hist4 = px.histogram(filtered_data, x="kindle_over_hardcover-original_price", nbins=30, text_auto=True)
    hist4.update_traces(showlegend=False)
    fig.add_traces(hist4.data, rows=2, cols=2)

    # 售價直方圖
    hist5 = px.histogram(filtered_data, x="hardcover_over_paperback-selling_price", nbins=30, text_auto=True)
    hist5.update_traces(showlegend=False)
    fig.add_traces(hist5.data, rows=3, cols=1)

    # 原價直方圖
    hist6 = px.histogram(filtered_data, x="hardcover_over_paperback-original_price", nbins=30, text_auto=True)
    hist6.update_traces(showlegend=False)
    fig.add_traces(hist6.data, rows=3, cols=2)

    # 更新子圖佈局，設置軸範圍和單位長
    fig.update_layout(height=1200, width=1200, title_text="Histograms of Prices for Different Book Format Comparisons")

    for i in range(1, 4):
        fig.update_xaxes(range=[min_val, 10], row=i, col=1)
        fig.update_xaxes(range=[min_val, 10], row=i, col=2)
        fig.update_yaxes(range=[0, 1000], row=i, col=1)
        fig.update_yaxes(range=[0, 1000], row=i, col=2)

    fig.write_image("0808_figs knidle_1dep 3formats/price_histogram_combined.svg")
    fig.show()


# 假設你的數據存在 data_kindle_1dep
plot_price_histogram(data_kindle_1dep)

In [54]:
# 篩選掉缺少必要數據的行
filtered_data = data_kindle_1dep.dropna(subset=["kindle_over_paperback-selling_price", "kindle_over_paperback-original_price"])

# 定義區間範圍
bins = [0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2, 2.5, 3, 3.5, 4, float("inf")]

# 根據區間範圍計算頻率
filtered_data["selling_price_binned"] = pd.cut(filtered_data["kindle_over_paperback-selling_price"], bins=bins)
filtered_data["original_price_binned"] = pd.cut(filtered_data["kindle_over_paperback-original_price"], bins=bins)

selling_price_counts = filtered_data["selling_price_binned"].value_counts().sort_index()
original_price_counts = filtered_data["original_price_binned"].value_counts().sort_index()

# 設置區間標籤
labels = ["0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1.0", "1.0-1.25", "1.25-1.5", "1.5-1.75", "1.75-2.0", "2.0-2.5", "2.5-3.0", "3.0-3.5", "3.5-4.0", ">4"]

# 創建子圖
fig = make_subplots(rows=1, cols=2, subplot_titles=("Kindle Over Paperback Selling Price", "Kindle Over Paperback Original Price"), x_title="Ratio", y_title="Count")

# 售價長條圖
fig.add_trace(go.Bar(x=labels, y=selling_price_counts.values, text=selling_price_counts.values, textposition="auto", name="Selling Price"), row=1, col=1)

# 原價長條圖
fig.add_trace(
    go.Bar(x=labels, y=original_price_counts.values, text=original_price_counts.values, textposition="auto", name="Original Price", marker_color="#636EFA"), row=1, col=2  # 設置原價長條圖的顏色為藍色
)

# 設置相同的刻度範圍
max_y = max(selling_price_counts.max(), original_price_counts.max())
buffer = max_y * 0.1  # 預留邊緣緩衝區

fig.update_yaxes(range=[0, max_y + buffer], row=1, col=1)
fig.update_yaxes(range=[0, max_y + buffer], row=1, col=2)

# 更新子圖佈局
fig.update_layout(height=600, width=1200, title_text="Bar Charts of Kindle Over Paperback Prices", showlegend=False)
fig.write_image("0808_figs knidle_1dep 3formats/kindle_over_paperback_histogram.svg")
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def plot_price_histogram(data: pd.DataFrame):
    # 篩選掉缺少必要數據的行
    filtered_data = data.dropna(
        subset=[
            "kindle_over_paperback-selling_price",
            "kindle_over_paperback-original_price",
            "kindle_over_hardcover-selling_price",
            "kindle_over_hardcover-original_price",
            "hardcover_over_paperback-selling_price",
            "hardcover_over_paperback-original_price",
        ]
    )

    # 定義區間範圍
    bins = [0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2, 2.5, 3, 3.5, 4, float("inf")]
    labels = ["0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1.0", "1.0-1.25", "1.25-1.5", "1.5-1.75", "1.75-2.0", "2.0-2.5", "2.5-3.0", "3.0-3.5", "3.5-4.0", ">4"]

    # 計算 Kindle Over Paperback
    filtered_data["selling_price_binned_kindle_paperback"] = pd.cut(filtered_data["kindle_over_paperback-selling_price"], bins=bins)
    filtered_data["original_price_binned_kindle_paperback"] = pd.cut(filtered_data["kindle_over_paperback-original_price"], bins=bins)
    selling_price_counts_kindle_paperback = filtered_data["selling_price_binned_kindle_paperback"].value_counts().sort_index()
    original_price_counts_kindle_paperback = filtered_data["original_price_binned_kindle_paperback"].value_counts().sort_index()

    # 計算 Kindle Over Hardcover
    filtered_data["selling_price_binned_kindle_hardcover"] = pd.cut(filtered_data["kindle_over_hardcover-selling_price"], bins=bins)
    filtered_data["original_price_binned_kindle_hardcover"] = pd.cut(filtered_data["kindle_over_hardcover-original_price"], bins=bins)
    selling_price_counts_kindle_hardcover = filtered_data["selling_price_binned_kindle_hardcover"].value_counts().sort_index()
    original_price_counts_kindle_hardcover = filtered_data["original_price_binned_kindle_hardcover"].value_counts().sort_index()

    # 計算 Hardcover Over Paperback
    filtered_data["selling_price_binned_hardcover_paperback"] = pd.cut(filtered_data["hardcover_over_paperback-selling_price"], bins=bins)
    filtered_data["original_price_binned_hardcover_paperback"] = pd.cut(filtered_data["hardcover_over_paperback-original_price"], bins=bins)
    selling_price_counts_hardcover_paperback = filtered_data["selling_price_binned_hardcover_paperback"].value_counts().sort_index()
    original_price_counts_hardcover_paperback = filtered_data["original_price_binned_hardcover_paperback"].value_counts().sort_index()

    # 設置相同的刻度範圍
    max_y = max(
        selling_price_counts_kindle_paperback.max(),
        original_price_counts_kindle_paperback.max(),
        selling_price_counts_kindle_hardcover.max(),
        original_price_counts_kindle_hardcover.max(),
        selling_price_counts_hardcover_paperback.max(),
        original_price_counts_hardcover_paperback.max(),
    )
    buffer = max_y * 0.1  # 預留邊緣緩衝區

    # 創建子圖
    fig = make_subplots(
        rows=3,
        cols=2,
        subplot_titles=(
            "Kindle Over Paperback Selling Price",
            "Kindle Over Paperback Original Price",
            "Kindle Over Hardcover Selling Price",
            "Kindle Over Hardcover Original Price",
            "Hardcover Over Paperback Selling Price",
            "Hardcover Over Paperback Original Price",
        ),
        vertical_spacing=0.1,
        horizontal_spacing=0.1,
    )

    # Kindle Over Paperback
    fig.add_trace(
        go.Bar(x=labels, y=selling_price_counts_kindle_paperback.values, text=selling_price_counts_kindle_paperback.values, textposition="auto", name="Selling Price", marker_color="#636EFA"),
        row=1,
        col=1,
    )
    fig.add_trace(
        go.Bar(x=labels, y=original_price_counts_kindle_paperback.values, text=original_price_counts_kindle_paperback.values, textposition="auto", name="Original Price", marker_color="#636EFA"),
        row=1,
        col=2,
    )

    # Kindle Over Hardcover
    fig.add_trace(
        go.Bar(x=labels, y=selling_price_counts_kindle_hardcover.values, text=selling_price_counts_kindle_hardcover.values, textposition="auto", name="Selling Price", marker_color="#636EFA"),
        row=2,
        col=1,
    )
    fig.add_trace(
        go.Bar(x=labels, y=original_price_counts_kindle_hardcover.values, text=original_price_counts_kindle_hardcover.values, textposition="auto", name="Original Price", marker_color="#636EFA"),
        row=2,
        col=2,
    )

    # Hardcover Over Paperback
    fig.add_trace(
        go.Bar(x=labels, y=selling_price_counts_hardcover_paperback.values, text=selling_price_counts_hardcover_paperback.values, textposition="auto", name="Selling Price", marker_color="#636EFA"),
        row=3,
        col=1,
    )
    fig.add_trace(
        go.Bar(x=labels, y=original_price_counts_hardcover_paperback.values, text=original_price_counts_hardcover_paperback.values, textposition="auto", name="Original Price", marker_color="#636EFA"),
        row=3,
        col=2,
    )

    fig.update_yaxes(range=[0, max_y + buffer], row=1, col=1)
    fig.update_yaxes(range=[0, max_y + buffer], row=1, col=2)
    fig.update_yaxes(range=[0, max_y + buffer], row=2, col=1)
    fig.update_yaxes(range=[0, max_y + buffer], row=2, col=2)
    fig.update_yaxes(range=[0, max_y + buffer], row=3, col=1)
    fig.update_yaxes(range=[0, max_y + buffer], row=3, col=2)

    # 更新子圖佈局
    fig.update_layout(height=1800, width=1200, title_text="Bar Charts of Prices for Different Book Format Comparisons", showlegend=False)
    fig.write_image("0808_figs knidle_1dep 3formats/price_histogram_combined.svg")
    fig.show()


# 假設你的數據存在 data_kindle_1dep
plot_price_histogram(data_kindle_1dep)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [50]:
data_kindle_1dep.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2885 entries, 0 to 2884
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   title                                 2885 non-null   object
 1   category                              2885 non-null   object
 2   book_info_link                        2885 non-null   object
 3   kindle-selling_price                  2851 non-null   object
 4   kindle-original_price                 2851 non-null   object
 5   kindle-whole_rank                     2761 non-null   object
 6   paperback-selling_price               1531 non-null   object
 7   paperback-original_price              1537 non-null   object
 8   paperback-whole_rank                  1687 non-null   object
 9   original_kindle_link                  2873 non-null   object
 10  kindle_over_paperback-selling_price   1523 non-null   object
 11  kindle_over_paperback-original

In [43]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def plot_rank_price_ratio(data_con: pd.DataFrame) -> None:
    # 提取相關欄位
    paperback_rank = data_con["paperback-whole_rank"]
    kindle_rank = data_con["kindle-whole_rank"]
    paperback_price = data_con["paperback-selling_price"]
    kindle_price = data_con["kindle-selling_price"]
    title = data_con["title"]

    # 計算價格比例
    price_ratio = kindle_price / paperback_price

    # 剔除缺失值
    valid = ~(paperback_rank.isna() | kindle_rank.isna() | price_ratio.isna())
    paperback_rank = paperback_rank[valid]
    kindle_rank = kindle_rank[valid]
    price_ratio = price_ratio[valid]
    title = title[valid]

    # 計算百分位數
    quantiles = price_ratio.quantile([0, 0.2, 0.4, 0.6, 0.8, 1.0])

    # 分類價格比例
    def get_price_category(ratio):
        if ratio <= quantiles[0.2]:
            return f"0.00~{quantiles[0.2]:.2f}"
        elif quantiles[0.2] < ratio <= quantiles[0.4]:
            return f"{quantiles[0.2]:.2f}~{quantiles[0.4]:.2f}"
        elif quantiles[0.4] < ratio <= quantiles[0.6]:
            return f"{quantiles[0.4]:.2f}~{quantiles[0.6]:.2f}"
        elif quantiles[0.6] < ratio <= quantiles[0.8]:
            return f"{quantiles[0.6]:.2f}~{quantiles[0.8]:.2f}"
        else:
            return f">{quantiles[0.8]:.2f}"

    price_category = price_ratio.apply(get_price_category)

    # 自定義色彩，按範圍大小排序
    color_map = {
        f"0.00~{quantiles[0.2]:.2f}": "purple",
        f"{quantiles[0.2]:.2f}~{quantiles[0.4]:.2f}": "blue",
        f"{quantiles[0.4]:.2f}~{quantiles[0.6]:.2f}": "green",
        f"{quantiles[0.6]:.2f}~{quantiles[0.8]:.2f}": "orange",
        f">{quantiles[0.8]:.2f}": "red",
    }

    # 圖表範圍列表
    ranges = [(0, 4_000_000), (0, 1_000_000), (0, 100_000), (0, 10_000), (0, 1_000)]

    for min_val, max_val in ranges:
        buffer = max_val * 0.05  # 加5%的buffer
        actual_max_val = max_val + buffer

        # 建立子圖
        fig = make_subplots(rows=1, cols=1)

        # 添加主要散點圖
        for category in sorted(color_map.keys()):
            mask = price_category == category
            fig.add_trace(
                go.Scatter(
                    x=paperback_rank[mask],
                    y=kindle_rank[mask],
                    mode="markers",
                    marker=dict(color=color_map[category]),
                    name=category,
                    hovertemplate="Paperback Rank: %{x}<br>kindle Rank: %{y}<br>Price Ratio: %{customdata[0]:.2f}<br>Title: %{customdata[1]}<extra></extra>",
                    customdata=list(zip(price_ratio[mask], title[mask])),
                )
            )

        # 增加 x=y 的直線
        fig.add_trace(go.Scatter(x=[min_val, actual_max_val], y=[min_val, actual_max_val], mode="lines", line=go.scatter.Line(color="black", dash="dash"), showlegend=False))

        # 標示超出範圍的點
        out_of_range = (paperback_rank > max_val) | (kindle_rank > max_val)
        fig.add_trace(
            go.Scatter(
                x=paperback_rank[out_of_range].clip(upper=max_val),
                y=kindle_rank[out_of_range].clip(upper=max_val),
                mode="markers",
                marker=dict(symbol="x", color=price_category[out_of_range].map(color_map)),
                showlegend=False,
                hovertemplate="Paperback Rank: %{x}<br>kindle Rank: %{y}<br>Price Ratio: %{customdata[0]:.2f}<br>Title: %{customdata[1]}<extra></extra>",
                customdata=list(zip(price_ratio[out_of_range], title[out_of_range])),
            )
        )

        # 更新圖表布局
        fig.update_layout(
            height=800,
            width=800,
            title=f"Book Rank Comparison (Rank Range: {min_val} to {max_val})",
            xaxis=dict(title="Paperback Whole Rank", range=[-buffer, actual_max_val], scaleanchor="y", scaleratio=1),
            yaxis=dict(title="kindle Whole Rank", range=[-buffer, actual_max_val], scaleanchor="x", scaleratio=1),
            legend_title="Price Ratio Category",
            legend=dict(traceorder="normal"),
        )

        # 顯示圖表
        fig.write_image(f"0801 kindle-paperback/rank_price_ratio_{min_val}_to_{max_val}.svg")
        fig.show()


# 假設 data_2dep_top100 是你的數據
plot_rank_price_ratio(data_kindle_1dep)

In [26]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def plot_rank_price_ratio(data_con: pd.DataFrame) -> None:
    # 提取相關欄位
    paperback_rank = data_con["paperback-whole_rank"]
    kindle_rank = data_con["kindle-whole_rank"]
    hardcover_rank = data_con["hardcover-whole_rank"]
    paperback_price = data_con["paperback-selling_price"]
    kindle_price = data_con["kindle-selling_price"]
    hardcover_price = data_con["hardcover-selling_price"]
    title = data_con["title"]

    # 計算價格比例
    ratios = {
        "Kindle/Paperback": kindle_price / paperback_price,
        "Kindle/Hardcover": kindle_price / hardcover_price,
        "Hardcover/Paperback": hardcover_price / paperback_price,
    }

    ranks = {
        "Kindle/Paperback": (kindle_rank, paperback_rank),
        "Kindle/Hardcover": (kindle_rank, hardcover_rank),
        "Hardcover/Paperback": (hardcover_rank, paperback_rank),
    }

    ranges = [(0, 1_000_000), (0, 10_000)]
    layout_titles = ["0-1M", "0-10K"]
    combinations = ["Kindle/Paperback", "Kindle/Hardcover", "Hardcover/Paperback"]

    # 定義固定的 buffer
    fixed_buffer = 500

    # 建立子圖布局
    fig = make_subplots(rows=3, cols=2, subplot_titles=[f"{comb} ({layout_titles[i]})" for comb in combinations for i in range(2)])

    # 為每個組合和範圍創建圖表
    for row, comb in enumerate(combinations):
        rank1, rank2 = ranks[comb]
        price_ratio = ratios[comb]

        # 剔除缺失值
        valid = ~(rank1.isna() | rank2.isna() | price_ratio.isna())
        rank1 = rank1[valid]
        rank2 = rank2[valid]
        price_ratio = price_ratio[valid]
        title = title[valid]

        # 計算百分位數
        quantiles = price_ratio.quantile([0, 0.2, 0.4, 0.6, 0.8, 1.0])

        # 分類價格比例
        def get_price_category(ratio):
            if ratio <= quantiles[0.2]:
                return f"0.00~{quantiles[0.2]:.2f}"
            elif quantiles[0.2] < ratio <= quantiles[0.4]:
                return f"{quantiles[0.2]:.2f}~{quantiles[0.4]:.2f}"
            elif quantiles[0.4] < ratio <= quantiles[0.6]:
                return f"{quantiles[0.4]:.2f}~{quantiles[0.6]:.2f}"
            elif quantiles[0.6] < ratio <= quantiles[0.8]:
                return f"{quantiles[0.6]:.2f}~{quantiles[0.8]:.2f}"
            else:
                return f">{quantiles[0.8]:.2f}"

        price_category = price_ratio.apply(get_price_category)

        # 自定義色彩，按範圍大小排序
        color_map = {
            f"0.00~{quantiles[0.2]:.2f}": "purple",
            f"{quantiles[0.2]:.2f}~{quantiles[0.4]:.2f}": "blue",
            f"{quantiles[0.4]:.2f}~{quantiles[0.6]:.2f}": "green",
            f"{quantiles[0.6]:.2f}~{quantiles[0.8]:.2f}": "orange",
            f">{quantiles[0.8]:.2f}": "red",
        }

        for col, (min_val, max_val) in enumerate(ranges):
            buffer = fixed_buffer  # 使用固定的buffer
            actual_max_val = max_val + buffer

            # 添加主要散點圖
            for category in sorted(color_map.keys()):
                mask = price_category == category
                fig.add_trace(
                    go.Scatter(
                        x=rank2[mask],
                        y=rank1[mask],
                        mode="markers",
                        marker=dict(color=color_map[category]),
                        name=category,
                        hovertemplate="Rank 2: %{x}<br>Rank 1: %{y}<br>Price Ratio: %{customdata[0]:.2f}<br>Title: %{customdata[1]}<extra></extra>",
                        customdata=list(zip(price_ratio[mask], title[mask])),
                    ),
                    row=row + 1,
                    col=col + 1,
                )

            # 增加 x=y 的直線
            fig.add_trace(
                go.Scatter(x=[min_val, actual_max_val], y=[min_val, actual_max_val], mode="lines", line=go.scatter.Line(color="black", dash="dash"), showlegend=False), row=row + 1, col=col + 1
            )

            # 標示超出範圍的點
            out_of_range = (rank2 > max_val) | (rank1 > max_val)
            fig.add_trace(
                go.Scatter(
                    x=rank2[out_of_range].clip(upper=max_val),
                    y=rank1[out_of_range].clip(upper=max_val),
                    mode="markers",
                    marker=dict(symbol="x", color=price_category[out_of_range].map(color_map)),
                    showlegend=False,
                    hovertemplate="Rank 2: %{x}<br>Rank 1: %{y}<br>Price Ratio: %{customdata[0]:.2f}<br>Title: %{customdata[1]}<extra></extra>",
                    customdata=list(zip(price_ratio[out_of_range], title[out_of_range])),
                ),
                row=row + 1,
                col=col + 1,
            )

            # 更新圖表布局
            fig.update_xaxes(title_text=f"Rank 2 ({comb.split('/')[1]})", row=row + 1, col=col + 1, range=[-buffer, actual_max_val], scaleanchor="y", scaleratio=1)
            fig.update_yaxes(title_text=f"Rank 1 ({comb.split('/')[0]})", row=row + 1, col=col + 1, range=[-buffer, actual_max_val], scaleanchor="x", scaleratio=1)

    fig.update_layout(
        height=1200,
        width=1600,
        title="Book Rank Comparison for Different Combinations and Ranges",
        legend_title="Price Ratio Category",
        legend=dict(traceorder="normal"),
    )

    # 顯示圖表
    fig.write_image("combined_rank_price_ratio.svg")
    fig.show()


# 假設 data_con 是你的數據
plot_rank_price_ratio(data_kindle_1dep)