### 数据统计与分析

In [None]:
import os
import re

import jieba
import numpy as np
import pandas as pd
from wordcloud import WordCloud

In [None]:
# Load the book table that every later cell works on. The columns read by this
# notebook are 标题, 评论数量, 出版社, 类别1, 类别2, 价格 and 出版时间.
file_path = './data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

> 标题的词云

In [None]:
TITLE_FILE_NAME = './temp/titles.txt'


def _split_title(title):
    """Split a title at its first full-width opening bracket '（'.

    Args:
        title: the raw title string.

    Returns:
        A ``(main, suffix)`` tuple: the text before the bracket and the text
        from the bracket onwards. ``suffix`` is '' when there is no bracket.
    """
    cut = title.find('（')
    if cut == -1:
        return title, ''
    return title[:cut], title[cut:]


# Make sure the scratch directory exists so the cell also runs on a fresh checkout.
os.makedirs(os.path.dirname(TITLE_FILE_NAME), exist_ok=True)

# Write the bracketed suffix of every title to a text file (input of the title
# word cloud). Titles without a suffix contribute a single space.
# The context manager closes the file even if a write fails.
titles = data['标题']
with open(TITLE_FILE_NAME, 'w', encoding='utf-8') as fp_title:
    for title in titles:
        fp_title.write(_split_title(title)[1] or ' ')

# Normalise the titles: drop the bracketed suffix, then keep only the text
# before the first space.
# NOTE: this overwrites data['标题'], so re-running this cell without reloading
# `data` finds no suffixes any more and rewrites the titles file with blanks.
data['标题'] = data['标题'].apply(lambda x: _split_title(x)[0])
data['标题'] = data['标题'].apply(lambda x: x.split(' ', 1)[0])

In [None]:
# The collected title text still contains punctuation, digits and brackets that
# are useless for the word cloud, so it is cleaned with a regular expression
# before jieba segments it.
#
# Every character to strip must sit inside ONE character class. Previously the
# class was closed early, leaving "《》（）" as a literal tail that had to follow
# a class character, so the pattern matched almost nothing. "n-】" was also
# read as a character range; it is spelled out here as a newline and an
# escaped hyphen.
r = r'[，。%、；0-9\n\-】【“”《》（）]'

# Read the text inside a context manager so the handle is always closed.
with open(TITLE_FILE_NAME, "r", encoding='utf-8') as fp_title_text:
    file = fp_title_text.read()

# Strip the noise characters. They are replaced by a space rather than deleted:
# the title fragments were written back to back, and the brackets are the only
# thing separating neighbouring fragments.
file = re.sub(r, ' ', file)
# Drop Chinese characters that carry no meaning for this analysis.
chinese_char = '[的版]'
file = re.sub(chinese_char, '', file)

# Segment the text into words.
con = jieba.lcut(file)

# Join the tokens with spaces, the input format WordCloud.generate expects.
words = " ".join(con)

# Build the word cloud image.
# NOTE: `WordCloud` must still be the class from the `wordcloud` package here;
# the later pyecharts import rebinds the same name, so run this cell first.
wordcloud = WordCloud(font_path="./assets/SanJiXingShuJianTi-2.ttf",background_color="white",width=1300, height=800).generate(words)

wordcloud.to_file('./assets/title_cloud.png')

<!-- ![](./assets/title_cloud.png) -->

从词云图中可以看出，上榜图书标题的括号附注中“全册”、“推荐”、“代表作”等词出现得最为频繁，说明消费者更加倾向于购买标题中含有这类词的书籍。


> --全局参数配置--

In [None]:
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from pyecharts.components import Table
from pyecharts.options import ComponentTitleOpts
from pyecharts.charts import Pie
from pyecharts.charts import Bar
from pyecharts.charts import Page
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

In [None]:
# Theme shared by all pyecharts charts
GLOBAL_THEME = ThemeType.LIGHT
# Shape passed to every pyecharts word cloud (used as its `shape` argument)
GLOBAL_WORDCLOUD_THEME = SymbolType.RECT
# Width shared by all bar and pie charts
GLOBAL_WIDTH = '900px'
# Height shared by all bar and pie charts
GLOBAL_HEIGHT = '600px'
# Word cloud width
WORDCLOUD_WIDTH = "900px"
# Word cloud height
WORDCLOUD_HEIGHT = "600px"

> 评论数量统计分析

In [None]:
# Copy of the book table ordered by number of comments, most commented first.
data_sort_by_comment = data.sort_values(by='评论数量', ascending=False)

# Inspect the most commented books with, e.g.:
# data_sort_by_comment.head(10)

In [None]:
# Keep only title and comment count of the 10 most commented books.
most_commented = data_sort_by_comment.head(10)
title_and_comments_top = most_commented.loc[:, ['标题', '评论数量']].copy()

# Inspect the result with:
# title_and_comments_top

In [None]:
# Table of the 10 most commented books.
table_comment_top = Table()

headers = ["书名", "评论数量"]
# One [title, comment count] pair per book.
rows = np.array(title_and_comments_top).tolist()
table_comment_top.add(headers, rows)
table_comment_top.set_global_opts(
    title_opts=ComponentTitleOpts(title="评论数量最多的10本书")
)
table_comment_top.render("./temp/table_comment_number.html")

# Bar chart of the same data.
# The chart object is kept in `bar_comment_top` and rendered in a separate
# statement: render() returns the output path, and the dashboard page needs
# the chart object itself, not that string.
bar_comment_top = (
    Bar(init_opts=opts.InitOpts(theme=GLOBAL_THEME, width=GLOBAL_WIDTH, height=GLOBAL_HEIGHT))
    .add_xaxis([row[0] for row in rows])
    .add_yaxis("评论数量", [row[1] for row in rows], category_gap="50%")
    .set_global_opts(
        title_opts=opts.TitleOpts(title="前10评论数量"),
        datazoom_opts=opts.DataZoomOpts(range_start=0, range_end=80),
        # Tilt the long book titles so they do not overlap.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
    )
)
bar_comment_top.render("./temp/bar_comment_number.html")

> top10出版社以及上榜的书籍

In [None]:
# Group the books by publisher.
# The key is passed as a scalar, not a one-element list: with a list, pandas >= 2.0
# yields 1-tuples such as ('name',) as group keys, which would end up as chart labels.
data_groupby_publisher = data.groupby('出版社')

# Build an (n_publishers, 3) object array whose rows are
# [number of listed books, publisher name, DataFrame with that publisher's books].
publisher_groups = list(data_groupby_publisher)
publisher_with_rank = np.empty((len(publisher_groups), 3), dtype=object)
for group_idx, (publisher_name, publisher_books) in enumerate(publisher_groups):
    publisher_with_rank[group_idx, 0] = publisher_books.shape[0]
    publisher_with_rank[group_idx, 1] = publisher_name
    publisher_with_rank[group_idx, 2] = publisher_books

# Order the rows by number of listed books, largest first.
publisher_with_rank = publisher_with_rank[publisher_with_rank[:, 0].argsort()][::-1]

# Inspect all publishers, ranked, together with their books:
# publisher_with_rank


In [None]:
# Word cloud of publishers, each weighted by its number of listed books.
words = []
for publisher_row in publisher_with_rank:
    # Row layout is [book count, publisher name, ...]; the chart wants (name, weight).
    words.append((publisher_row[1], publisher_row[0]))

publisher_cloud_title = opts.TitleOpts(title="")
wordcloud_publisher = (
    WordCloud()
    .add(
        "",
        words,
        word_size_range=[2, 80],
        shape=GLOBAL_WORDCLOUD_THEME,
        width=WORDCLOUD_WIDTH,
        height=WORDCLOUD_HEIGHT,
    )
    .set_global_opts(title_opts=publisher_cloud_title)
)
# Not rendered on its own; the chart is added to the dashboard page instead.

In [None]:
# The 10 publishers with the most listed books.
publisher_top_10 = publisher_with_rank[:10]

publishers = [row[1] for row in publisher_top_10]
publisher_books_number = [row[0] for row in publisher_top_10]

# Lump every remaining book into one "其他" (other) slice. The total comes from
# the loaded table instead of a hard-coded 500, so the slice stays correct
# when the data set has a different number of rows.
publishers.append('其他')
publisher_books_number.append(int(len(data) - sum(publisher_books_number)))

In [None]:
# Table of the top-10 publishers (disabled; nothing in this cell is executed).
# table_publisher = Table()

# headers = ["出版社", "热销图书"]
# rows = [ list(z) for z in zip(publishers, publisher_books_number)]
# table_publisher.add(headers, rows)
# table_publisher.set_global_opts(
#     title_opts=ComponentTitleOpts(title="热销前10的出版社")
# )
# table_publisher.render("./temp/table_publisher_top.html")

In [None]:
# Ring chart: share of listed books per top-10 publisher plus the "其他" rest.
publisher_pie_size = opts.InitOpts(theme=GLOBAL_THEME, width=GLOBAL_WIDTH, height=GLOBAL_HEIGHT)

# Rich-text styles referenced by the label template below.
publisher_pie_rich = {
    "b": {"fontSize": 16, "lineHeight": 33},
    "per": {
        "color": "#eee",
        "backgroundColor": "#334455",
        "padding": [2, 4],
        "borderRadius": 2,
    },
}

# ECharts label template: {b} is the slice name, {c} its value, {d} its percentage.
publisher_pie_label = opts.LabelOpts(
    position="outside",
    formatter="{b|{b}: }{c}  {per|{d}%}  ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=publisher_pie_rich,
)

# One [name, count] pair per slice.
publisher_pie_pairs = [[name, count] for name, count in zip(publishers, publisher_books_number)]

pie_publisher = Pie(init_opts=publisher_pie_size).add(
    "",
    publisher_pie_pairs,
    radius=["40%", "55%"],
    label_opts=publisher_pie_label,
)
# Not rendered on its own; the chart is added to the dashboard page instead.

> 一级图书类别统计

In [None]:
# Group the books by first-level category.
# The key is passed as a scalar, not a one-element list: with a list, pandas >= 2.0
# yields 1-tuples as group keys, which would end up as chart labels.
data_groupby_type_1 = data.groupby('类别1')

# Build an (n_categories, 3) object array whose rows are
# [number of books, category name, DataFrame with the books of that category].
type_1_groups = list(data_groupby_type_1)
type_1_book_group = np.empty((len(type_1_groups), 3), dtype=object)
for group_idx, (category_name, category_books) in enumerate(type_1_groups):
    type_1_book_group[group_idx, 0] = category_books.shape[0]
    type_1_book_group[group_idx, 1] = category_name
    type_1_book_group[group_idx, 2] = category_books

# Order the rows by number of books, largest first.
type_1_book_group = type_1_book_group[type_1_book_group[:, 0].argsort()][::-1]

# Inspect all first-level categories together with their books:
# type_1_book_group

# Number of categories that keep their own slice in the pie chart.
TYPE_1_COUNT = 5

type_1_labels = [group_row[1] for group_row in type_1_book_group]
type_1_count = [group_row[0] for group_row in type_1_book_group]

In [None]:
# Word cloud of first-level categories, each weighted by its number of books.
words = list(zip(type_1_labels, type_1_count))

type1_cloud_title = opts.TitleOpts(title="")
wordcloud_type1 = (
    WordCloud()
    .add(
        "",
        words,
        word_size_range=[30, 80],
        shape=GLOBAL_WORDCLOUD_THEME,
        width=WORDCLOUD_WIDTH,
        height=WORDCLOUD_HEIGHT,
    )
    .set_global_opts(title_opts=type1_cloud_title)
)
# Not rendered on its own; the chart is added to the dashboard page instead.

In [None]:
# Keep the TYPE_1_COUNT largest first-level categories and lump the rest into
# one "其他" (other) entry. The total comes from the loaded table instead of a
# hard-coded 500, so the entry stays correct for any data set size.
type_1_labels = type_1_labels[:TYPE_1_COUNT]
type_1_labels.append('其他')
temp = int(len(data) - sum(type_1_count[:TYPE_1_COUNT]))
type_1_count = type_1_count[:TYPE_1_COUNT]
type_1_count.append(temp)

In [None]:
# Ring chart: share of books per first-level category plus the "其他" rest.
type1_pie_size = opts.InitOpts(theme=GLOBAL_THEME, width=GLOBAL_WIDTH, height=GLOBAL_HEIGHT)

# Rich-text styles referenced by the label template below.
type1_pie_rich = {
    "b": {"fontSize": 16, "lineHeight": 33},
    "per": {
        "color": "#eee",
        "backgroundColor": "#334455",
        "padding": [2, 4],
        "borderRadius": 2,
    },
}

# ECharts label template: {b} is the slice name, {c} its value, {d} its percentage.
type1_pie_label = opts.LabelOpts(
    position="outside",
    formatter="{b|{b}: }{c}  {per|{d}%}  ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=type1_pie_rich,
)

# One [label, count] pair per slice.
type1_pie_pairs = [[label, count] for label, count in zip(type_1_labels, type_1_count)]

pie_type_1 = (
    Pie(init_opts=type1_pie_size)
    .add("", type1_pie_pairs, radius=["40%", "55%"], label_opts=type1_pie_label)
    .set_global_opts(title_opts=opts.TitleOpts(title="一级类别"))
)
# Not rendered on its own; the chart is added to the dashboard page instead.

> 二级图书类别统计

In [None]:
# Group the books by second-level category.
# The key is passed as a scalar, not a one-element list: with a list, pandas >= 2.0
# yields 1-tuples as group keys, which would end up as chart labels.
data_groupby_type_2 = data.groupby('类别2')

# Build an (n_categories, 3) object array whose rows are
# [number of books, category name, DataFrame with the books of that category].
type_2_groups = list(data_groupby_type_2)
type_2_book_group = np.empty((len(type_2_groups), 3), dtype=object)
for group_idx, (category_name, category_books) in enumerate(type_2_groups):
    type_2_book_group[group_idx, 0] = category_books.shape[0]
    type_2_book_group[group_idx, 1] = category_name
    type_2_book_group[group_idx, 2] = category_books

# Order the rows by number of books, largest first.
type_2_book_group = type_2_book_group[type_2_book_group[:, 0].argsort()][::-1]

# Inspect all second-level categories together with their books:
# type_2_book_group

# Number of categories that keep their own slice in the pie chart.
TYPE_2_COUNT = 5

# Split the ranked rows into parallel label / count lists.
type_2_labels = [group_row[1] for group_row in type_2_book_group]
type_2_count = [group_row[0] for group_row in type_2_book_group]

In [None]:
# Word cloud of second-level categories, each weighted by its number of books.
words = list(zip(type_2_labels, type_2_count))

type2_cloud_title = opts.TitleOpts(title="")
wordcloud_type2 = (
    WordCloud()
    .add(
        "",
        words,
        word_size_range=[20, 80],
        shape=GLOBAL_WORDCLOUD_THEME,
        width=WORDCLOUD_WIDTH,
        height=WORDCLOUD_HEIGHT,
    )
    .set_global_opts(title_opts=type2_cloud_title)
)
# Not rendered on its own; the chart is added to the dashboard page instead.

In [None]:
# Keep the TYPE_2_COUNT largest second-level categories and lump the rest into
# one "其他" (other) entry. The total comes from the loaded table instead of a
# hard-coded 500, so the entry stays correct for any data set size.
type_2_labels = type_2_labels[:TYPE_2_COUNT]
type_2_labels.append('其他')
temp = int(len(data) - sum(type_2_count[:TYPE_2_COUNT]))
type_2_count = type_2_count[:TYPE_2_COUNT]
type_2_count.append(temp)

In [None]:
# Ring chart: share of books per second-level category plus the "其他" rest.
type2_pie_size = opts.InitOpts(theme=GLOBAL_THEME, width=GLOBAL_WIDTH, height=GLOBAL_HEIGHT)

# Rich-text styles referenced by the label template below.
type2_pie_rich = {
    "b": {"fontSize": 16, "lineHeight": 33},
    "per": {
        "color": "#eee",
        "backgroundColor": "#334455",
        "padding": [2, 4],
        "borderRadius": 2,
    },
}

# ECharts label template: {b} is the slice name, {c} its value, {d} its percentage.
type2_pie_label = opts.LabelOpts(
    position="outside",
    formatter="{b|{b}: }{c}  {per|{d}%}  ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=type2_pie_rich,
)

# One [label, count] pair per slice.
type2_pie_pairs = [[label, count] for label, count in zip(type_2_labels, type_2_count)]

pie_type_2 = (
    Pie(init_opts=type2_pie_size)
    .add("", type2_pie_pairs, radius=["40%", "55%"], label_opts=type2_pie_label)
    .set_global_opts(title_opts=opts.TitleOpts(title="二级类别"))
)
# Not rendered on its own; the chart is added to the dashboard page instead.

> 热销图书的定价区间

In [None]:
# Bucket the prices in steps of 10: bucket k covers [10k, 10k + 10).
# x[1:] drops the first character of the price string before it is parsed
# (presumably a currency symbol; confirm against data.csv).
data_modify_price = data['价格'].apply(lambda x: int(float(x[1:]) / 10))

# Put the bucket index into a NEW frame. Assigning through an alias of `data`
# would overwrite the raw price strings, and re-running this cell would then
# fail because x[1:] cannot be applied to the integers written by the first run.
data_with_price_df = data.assign(**{'价格': data_modify_price})

In [None]:
# Group the books by price bucket.
# The key is passed as a scalar, not a one-element list: with a list, pandas >= 2.0
# yields 1-tuples as group keys, and the later label arithmetic (bucket + 1)
# would fail on them.
data_groupby_price = data_with_price_df.groupby('价格')

# Build an (n_buckets, 3) object array whose rows are
# [number of books, bucket index, DataFrame with the books of that bucket].
price_groups = list(data_groupby_price)
price_book_group = np.empty((len(price_groups), 3), dtype=object)
for group_idx, (price_bucket, bucket_books) in enumerate(price_groups):
    price_book_group[group_idx, 0] = bucket_books.shape[0]
    price_book_group[group_idx, 1] = price_bucket
    price_book_group[group_idx, 2] = bucket_books

# Order the rows by number of books, largest first.
price_book_group = price_book_group[price_book_group[:, 0].argsort()][::-1]

# Inspect all price buckets with their book counts and books:
# price_book_group

In [None]:
# Number of price buckets that keep their own bar in the chart.
PRICE_COUNT = 10

# Counts and bucket indices of the PRICE_COUNT most populated buckets.
price_count = [group_row[0] for group_row in price_book_group[:PRICE_COUNT]]
price_labels = [group_row[1] for group_row in price_book_group[:PRICE_COUNT]]

# Turn bucket indices into readable x-axis labels such as "30-40".
# The bounds are computed numerically so that bucket 0 reads "0-10", not "00-10".
price_labels = ['{0}-{1}'.format(i * 10, (i + 1) * 10) for i in price_labels]

# Lump every remaining book into one "其他" (other) bar. The total comes from
# the loaded table instead of a hard-coded 500.
temp = int(len(data) - sum(price_count))
price_labels.append('其他')
price_count.append(temp)

In [None]:
# Bar chart: number of listed books per price bucket.
price_bar_size = opts.InitOpts(theme=GLOBAL_THEME, width=GLOBAL_WIDTH, height=GLOBAL_HEIGHT)
price_bar_title = opts.TitleOpts(title="￥")
price_bar_zoom = opts.DataZoomOpts(range_start=0, range_end=90)

bar_price = (
    Bar(init_opts=price_bar_size)
    .add_xaxis(price_labels)
    .add_yaxis("数量", price_count)
    .set_global_opts(title_opts=price_bar_title, datazoom_opts=price_bar_zoom)
)
# Not rendered on its own; the chart is added to the dashboard page instead.

> 出版年份统计与排序

In [None]:
# Reduce the publication date to its first four characters
# (presumably the year; confirm the date format in data.csv).
data_modify_date = data['出版时间'].apply(lambda x: x[:4])

# Put the shortened value into a NEW frame instead of writing through an alias
# of `data`, so the raw publication dates in `data` stay available.
data_with_date_df = data.assign(**{'出版时间': data_modify_date})

In [None]:
# Group the books by publication year.
# The key is passed as a scalar, not a one-element list: with a list, pandas >= 2.0
# yields 1-tuples as group keys, which would end up as chart labels.
data_groupby_date = data_with_date_df.groupby('出版时间')

# Build an (n_years, 3) object array whose rows are
# [number of books, year, DataFrame with the books published in that year].
date_groups = list(data_groupby_date)
date_book_group = np.empty((len(date_groups), 3), dtype=object)
for group_idx, (publish_year, year_books) in enumerate(date_groups):
    date_book_group[group_idx, 0] = year_books.shape[0]
    date_book_group[group_idx, 1] = publish_year
    date_book_group[group_idx, 2] = year_books

# Order the rows by number of books, largest first.
date_book_group = date_book_group[date_book_group[:, 0].argsort()][::-1]

# Inspect all publication years with their book counts and books:
# date_book_group

In [None]:
# Number of publication years that keep their own bar in the chart.
YEAR_COUNT = 10

# Counts and labels of the YEAR_COUNT years with the most listed books.
year_count = [group_row[0] for group_row in date_book_group[:YEAR_COUNT]]
year_labels = [group_row[1] for group_row in date_book_group[:YEAR_COUNT]]

# Lump every remaining book into one "其他" (other) bar. The total comes from
# the loaded table instead of a hard-coded 500.
temp = int(len(data) - sum(year_count))
year_labels.append('其他')
year_count.append(temp)

In [None]:
# Bar chart: number of listed books per publication year.
year_bar_size = opts.InitOpts(theme=GLOBAL_THEME, width=GLOBAL_WIDTH, height=GLOBAL_HEIGHT)
year_bar_title = opts.TitleOpts(title="出版时间")
year_bar_zoom = opts.DataZoomOpts(range_start=0, range_end=90)

bar_year = (
    Bar(init_opts=year_bar_size)
    .add_xaxis(year_labels)
    .add_yaxis("数量", year_count)
    .set_global_opts(title_opts=year_bar_title, datazoom_opts=year_bar_zoom)
)
# Not rendered on its own; the chart is added to the dashboard page instead.

> 创建大屏数据展板

In [None]:
# Collect every chart on one draggable dashboard page.
page = Page(layout=Page.DraggablePageLayout)
page.add(
    table_comment_top,
    bar_comment_top,
    pie_publisher,
    wordcloud_publisher,
    pie_type_1,
    wordcloud_type1,
    pie_type_2,
    wordcloud_type2,
    bar_price,
    bar_year
)

# save_resize_html post-processes an ALREADY rendered dashboard using a layout
# file that is exported by hand from that dashboard in the browser. On a fresh
# run neither file exists yet, so the step is skipped instead of failing
# before the dashboard has ever been written.
# NOTE(review): the saved layout refers to the charts of the render it was
# exported from; confirm it still applies after the dashboard is re-rendered.
if os.path.isfile('./temp/chart_config.json') and os.path.isfile('./temp/dashboard.html'):
    page.save_resize_html(cfg_file='./temp/chart_config.json', source='./temp/dashboard.html', dest='./temp/dashboard_simplified.html')

# Write the draggable dashboard that the layout is arranged on.
page.render("./temp/dashboard.html")