In [2]:
import pandas as pd
import re
import jieba.analyse
import numpy as np
from pyecharts.charts import Pie
from pyecharts.charts import Bar
from pyecharts.charts import Map
from pyecharts.charts import Line
from pyecharts.charts import WordCloud
from pyecharts import options as opts
from pyecharts.globals import SymbolType
from pyecharts.charts import Tab

一、读取数据

In [3]:
# Load the scraped shop listing into a DataFrame.
# NOTE: the absolute Windows path is machine-specific; adjust it when running elsewhere.
orginal_data = pd.read_excel(r'D:\BaiduNetdiskDownload\cda\螺狮粉\螺蛳粉店铺数据.xlsx')
print(orginal_data.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
orginal_data.info()
print(orginal_data.head())

(4404, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4404 entries, 0 to 4403
Data columns (total 5 columns):
goods_name      4404 non-null object
shop_name       4404 non-null object
price           4404 non-null float64
purchase_num    4404 non-null object
location        4404 non-null object
dtypes: float64(1), object(4)
memory usage: 172.1+ KB
None
                          goods_name shop_name  price  purchase_num location
0              商用意大利面通心粉螺旋面粉30斤意面螺蛳粉   尚膳食品专营店  100.0     ['27人付款']    山东 济南
1     预售 李子柒柳州螺蛳粉广西特产正宗螺丝粉方便面米线螺狮粉3袋    李子柒旗舰店   39.7   ['65万+人付款']    浙江 嘉兴
2  嘻螺会柳州正宗螺蛳粉300gx5包广西特产螺狮粉酸辣螺丝粉速食米线  嘻螺会食品旗舰店   49.9  ['9.5万+人付款']    广西 柳州
3  螺霸王螺蛳粉280g*10包装礼盒正宗广西柳州螺狮粉特产螺丝粉整箱   丝皇食品专营店  115.0   ['2059人付款']    广西 柳州
4    预售李子柒柳州螺蛳粉广西特产正宗螺丝粉方便面速食米线11袋礼盒    李子柒旗舰店  139.0  ['5500+人付款']    浙江 嘉兴


二、数据清洗
1.数据去重和去除null
2.去除购买人数为空的数据，防止后面类型转换出错
3.类型转换：purchase_num字段由字符串转换成数值型
4.新增字段：从location字段里提取省份和城市
5.新增字段：price*purchase_num=sales
6.新增字段：从goods_name提取重量信息
7.新增字段：增加price分箱数据
8.词云分析：对goods_name字段做词云分析
9.店铺类型：shop_name可以分为官方店和非官方店（非精准划分）

In [4]:
# Remove exact duplicate rows. The result is re-assigned instead of using
# inplace=True, whose return value is None (printing it only showed "None").
orginal_data = orginal_data.drop_duplicates()
print(orginal_data.shape)

# Keep only rows whose purchase text contains '人', i.e. rows that actually carry
# a buyer count. na=False treats missing values as non-matching, and .copy()
# yields an independent frame so that adding columns later does not trigger
# SettingWithCopyWarning.
orginal_data = orginal_data[orginal_data['purchase_num'].str.contains('人', na=False)].copy()
print(orginal_data.shape)

None
(4385, 5)
(4383, 5)


In [5]:
print(orginal_data['purchase_num'].head())
# Strip the list-literal wrapper: "['27人付款']" -> "27人付款".
# Raw strings avoid the invalid escape sequences (\[ , \d, \.) of the plain literals.
orginal_data['purchase_num'] = orginal_data['purchase_num'].str.replace(r"\['|'\]", '', regex=True)
print(orginal_data['purchase_num'].head())

# Numeric part of the buyer count, e.g. "9.5万+人付款" -> 9.5.
# Only the first number in the text is used; text without any digits yields NaN.
orginal_data['num'] = orginal_data['purchase_num'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
print(orginal_data['num'].head())

# Multiplier implied by the unit: 10000 when the text contains '万', otherwise 1.
orginal_data['unit'] = orginal_data['purchase_num'].str.contains('万').map({True: 10000, False: 1})
print(orginal_data['unit'].head())

0       ['27人付款']
1     ['65万+人付款']
2    ['9.5万+人付款']
3     ['2059人付款']
4    ['5500+人付款']
Name: purchase_num, dtype: object
0       27人付款
1     65万+人付款
2    9.5万+人付款
3     2059人付款
4    5500+人付款
Name: purchase_num, dtype: object
0      27.0
1      65.0
2       9.5
3    2059.0
4    5500.0
Name: num, dtype: float64
0        1
1    10000
2    10000
3        1
4        1
Name: unit, dtype: int64


In [6]:
# Buyer count = numeric part * unit multiplier; revenue = buyer count * unit price.
purchase_totals = orginal_data['num'] * orginal_data['unit']
orginal_data['purchase_num_new'] = purchase_totals
orginal_data['sales'] = purchase_totals * orginal_data['price']

# Preview both derived columns.
for derived_column in ('purchase_num_new', 'sales'):
    print(orginal_data[derived_column].head())

0        27.0
1    650000.0
2     95000.0
3      2059.0
4      5500.0
Name: purchase_num_new, dtype: float64
0        2700.0
1    25805000.0
2     4740500.0
3      236785.0
4      764500.0
Name: sales, dtype: float64


In [16]:
# Split "province city" once and reuse the pieces for both new columns.
location_parts = orginal_data['location'].str.split(' ')
orginal_data['province'] = [parts[0] for parts in location_parts]
# When the location has no second token, the first token is used as the city too.
orginal_data['city'] = [parts[1] if len(parts) > 1 else parts[0] for parts in location_parts]

for derived_column in ('province', 'city'):
    print(orginal_data[derived_column].head())

0    山东
1    浙江
2    广西
3    广西
4    浙江
Name: province, dtype: object
0    济南
1    嘉兴
2    柳州
3    柳州
4    嘉兴
Name: city, dtype: object


In [7]:
# Flag a shop as "official" when its name contains any of the three keywords.
# This is only an approximate classification based on the shop name.
shop_names = orginal_data['shop_name']
is_official = shop_names.str.contains('旗舰')
for keyword in ('专营', '天猫'):
    is_official = is_official | shop_names.str.contains(keyword)
orginal_data['shop_type'] = is_official
print(orginal_data['shop_type'].head())

0    True
1    True
2    True
3    True
4    True
Name: shop_type, dtype: bool


In [8]:
print(orginal_data.goods_name.head(10))
# Gram weights mentioned in the title, e.g. "300g", "280G" or "1.5g".
# - raw string: the plain literal relied on invalid escape sequences (\d, \.)
# - optional fractional part: the old pattern reduced "1.5g" to "5g"
# - IGNORECASE replaces the duplicated upper-case alternative
weight_pattern = re.compile(r'\d+(?:\.\d*)?g', re.IGNORECASE)
# All matches in a title are concatenated and lower-cased; titles with no match give ''.
orginal_data['weight'] = [
    ''.join(weight_pattern.findall(title)).lower()
    for title in orginal_data['goods_name']
]
print(orginal_data['weight'].head(10))

0                商用意大利面通心粉螺旋面粉30斤意面螺蛳粉
1       预售 李子柒柳州螺蛳粉广西特产正宗螺丝粉方便面米线螺狮粉3袋
2    嘻螺会柳州正宗螺蛳粉300gx5包广西特产螺狮粉酸辣螺丝粉速食米线
3    螺霸王螺蛳粉280g*10包装礼盒正宗广西柳州螺狮粉特产螺丝粉整箱
4      预售李子柒柳州螺蛳粉广西特产正宗螺丝粉方便面速食米线11袋礼盒
5    【礼盒款】好欢螺螺蛳粉柳州螺狮粉速食方便面米线300g10袋酸辣粉
6    【礼盒装】好欢螺螺蛳粉柳州速食螺狮粉酸辣粉方便面米线400g10袋
7    螺霸王螺蛳粉280G*6包礼盒装整箱 正宗广西柳州螺狮粉酸辣螺丝粉
8      柳江人家柳州螺蛳粉350g*10袋礼盒装广西柳州特产特色螺丝粉
9     柳江人家螺蛳粉350g*5袋螺丝粉广西柳州正宗特产速食酸辣螺狮粉
Name: goods_name, dtype: object
0        
1        
2    300g
3    280g
4        
5    300g
6    400g
7    280g
8    350g
9    350g
Name: weight, dtype: object


In [9]:
# Drop the helper columns that were only needed to build purchase_num_new.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError.
orginal_data = orginal_data.drop(['num', 'unit'], axis=1, errors='ignore')
print(orginal_data.head())


                          goods_name shop_name  price purchase_num location  \
0              商用意大利面通心粉螺旋面粉30斤意面螺蛳粉   尚膳食品专营店  100.0        27人付款    山东 济南   
1     预售 李子柒柳州螺蛳粉广西特产正宗螺丝粉方便面米线螺狮粉3袋    李子柒旗舰店   39.7      65万+人付款    浙江 嘉兴   
2  嘻螺会柳州正宗螺蛳粉300gx5包广西特产螺狮粉酸辣螺丝粉速食米线  嘻螺会食品旗舰店   49.9     9.5万+人付款    广西 柳州   
3  螺霸王螺蛳粉280g*10包装礼盒正宗广西柳州螺狮粉特产螺丝粉整箱   丝皇食品专营店  115.0      2059人付款    广西 柳州   
4    预售李子柒柳州螺蛳粉广西特产正宗螺丝粉方便面速食米线11袋礼盒    李子柒旗舰店  139.0     5500+人付款    浙江 嘉兴   

   purchase_num_new       sales  shop_type weight  
0              27.0      2700.0       True         
1          650000.0  25805000.0       True         
2           95000.0   4740500.0       True   300g  
3            2059.0    236785.0       True   280g  
4            5500.0    764500.0       True         


In [None]:

三、数据可视化

分箱
1）店铺省份分布
2）销量城市分布√
3）官方和非官方店铺占比以及销售收入
4）词云√
5）价格分箱√


In [10]:
# Five most frequent extracted weight strings; the empty string stands for
# titles in which no weight was found.
weight_counts = orginal_data['weight'].value_counts()
print(weight_counts.head())

        2019
300g     628
320g     293
280g     252
400g     240
Name: weight, dtype: int64


In [11]:
def tranform_price(x):
    """Return the label of the price band that ``x`` falls into.

    Bands are closed on the right: x <= 20 -> '0~20', x <= 50 -> '20~50',
    x <= 100 -> '50~100', x <= 200 -> '100~200'. Any value that satisfies
    none of these comparisons gets '200~2500'.
    """
    # (inclusive upper bound, label), checked in ascending order.
    bands = (
        (20, '0~20'),
        (50, '20~50'),
        (100, '50~100'),
        (200, '100~200'),
    )
    for upper_bound, label in bands:
        if x <= upper_bound:
            return label
    return '200~2500'
    
# Label every listing with its price band.
orginal_data['price_cut'] = orginal_data.price.apply(tranform_price)

# Share of listings per band: band counts divided by the number of labelled rows.
band_counts = orginal_data.price_cut.value_counts()
price_num = band_counts / orginal_data.price_cut.count()
price_num

20~50       0.445357
50~100      0.308921
100~200     0.178873
0~20        0.061373
200~2500    0.005476
Name: price_cut, dtype: float64

In [12]:
import jieba 
import jieba.analyse


# Register the product name with jieba's dictionary.
jieba.add_word('螺蛳粉')
# Join all product titles into one text, separated by '。'.
data = orginal_data['goods_name'].str.cat(sep='。')

# Read the stop-word file, one stripped entry per line.
with open(r'D:\BaiduNetdiskDownload\cda\完美关系\chineseStopWords-me.txt', 'r', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]

# Additional stop words
stop_words += ['hellip',   '哈哈哈']

# Extract the top 100 keywords of the joined titles together with their weights.
word_num = jieba.analyse.extract_tags(
    data,
    topK=100,
    withWeight=True,
    allowPOS=(),
)

# Discard keywords that appear in the stop-word list.
word_num_selected = [pair for pair in word_num if pair[0] not in stop_words]

key_words = pd.DataFrame(word_num_selected, columns=['words', 'num'])
key_words.head()




Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\pr\AppData\Local\Temp\jieba.cache
Loading model cost 0.798 seconds.
Prefix dict has been built successfully.


Unnamed: 0,words,num
0,螺蛳粉,1.136864
1,柳州,0.670196
2,正宗,0.491532
3,螺丝,0.480384
4,速食,0.460393


In [13]:


from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

# Word cloud of the title keywords, sized by keyword weight.
# NOTE: render() is called without a file name; the recorded output shows it
# writing render.html in the current working directory.
cloud_pairs = list(zip(key_words.words, key_words.num))

word1 = WordCloud(init_opts=opts.InitOpts(width='1350px', height='750px'))
word1.add(
    "",
    cloud_pairs,
    word_size_range=[20, 200],
    shape=SymbolType.DIAMOND,
)
word1.set_global_opts(
    title_opts=opts.TitleOpts('商品标题分布词云图'),
    toolbox_opts=opts.ToolboxOpts(),
)
word1.render()


'C:\\WINDOWS\\system32\\render.html'

In [14]:
# 导入包
from pyecharts.charts import Bar
from pyecharts import options as opts 

# Ten shops with the highest summed purchase count.
purchases_by_shop = orginal_data.groupby('shop_name')['purchase_num_new'].sum()
shop_top10 = purchases_by_shop.sort_values(ascending=False).head(10)

shop_labels = shop_top10.index.tolist()
shop_totals = shop_top10.values.tolist()

# Bar chart; the visual-map scale tops out at the largest shop total.
bar1 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
bar1.add_xaxis(shop_labels)
bar1.add_yaxis('销量', shop_totals)
bar1.set_global_opts(
    title_opts=opts.TitleOpts(title='螺蛳粉商品销量店铺排名Top10'),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-10)),
    visualmap_opts=opts.VisualMapOpts(max_=shop_top10.values.max()),
)
bar1.render()

'C:\\WINDOWS\\system32\\render.html'

In [17]:
# Ten provinces with the highest summed purchase count.
purchases_by_province = orginal_data.groupby('province')['purchase_num_new'].sum()
province_top10 = purchases_by_province.sort_values(ascending=False).head(10)

province_labels = province_top10.index.tolist()
province_totals = province_top10.values.tolist()

# Bar chart; the visual-map scale tops out at the largest province total.
bar2 = Bar(init_opts=opts.InitOpts(width='1350px', height='750px'))
bar2.add_xaxis(province_labels)
bar2.add_yaxis('销量', province_totals)
bar2.set_global_opts(
    title_opts=opts.TitleOpts(title='螺蛳粉商品销量省份排名Top10'),
    visualmap_opts=opts.VisualMapOpts(max_=province_top10.values.max()),
)
bar2.render()

'C:\\WINDOWS\\system32\\render.html'

In [19]:
from pyecharts.charts import Map 

# Summed 'sales' column (price * purchase count) per province, rounded.
# NOTE(review): the chart title speaks of sales volume while the plotted column
# is revenue -- confirm which one is intended.
province_num = orginal_data.groupby('province')['sales'].sum().round()
province_pairs = [
    [province, total]
    for province, total in zip(province_num.index.tolist(), province_num.values.tolist())
]

# China map coloured by the per-province total.
map1 = Map(init_opts=opts.InitOpts(width='1350px', height='750px'))
map1.add("", province_pairs, maptype='china')
map1.set_global_opts(
    title_opts=opts.TitleOpts(title='国内各省份螺蛳粉销量分布'),
    visualmap_opts=opts.VisualMapOpts(max_=200000000),
    toolbox_opts=opts.ToolboxOpts(),
)
map1.render()

'C:\\WINDOWS\\system32\\render.html'

In [23]:
#未去重
from pyecharts.charts import Map 

# Number of listing rows per province, highest first. Shops are NOT
# de-duplicated here: a shop with several listings is counted once per listing.
rows_by_province = orginal_data.groupby('province')['shop_name'].count()
province_num = rows_by_province.sort_values(ascending=False)
province_pairs = [
    [province, row_count]
    for province, row_count in zip(province_num.index.tolist(), province_num.values.tolist())
]

# China map coloured by the per-province row count.
map1 = Map(init_opts=opts.InitOpts(width='1350px', height='750px'))
map1.add("", province_pairs, maptype='china')
map1.set_global_opts(
    title_opts=opts.TitleOpts(title='国内各省份螺狮粉店铺分布'),
    visualmap_opts=opts.VisualMapOpts(max_=2000),
    toolbox_opts=opts.ToolboxOpts(),
)
map1.render()

'C:\\WINDOWS\\system32\\render.html'

In [21]:
def countShop(dataframe):
    """Count the distinct shops per province.

    Args:
        dataframe: DataFrame with a 'province' and a 'shop_name' column.

    Returns:
        dict mapping each province to the number of different shop names
        seen for it, with keys in order of first appearance.
    """
    # Walk the two columns directly rather than via iterrows(), which builds a
    # Series for every row. Only the number of distinct names is returned, so a
    # set per province suffices; per-shop occurrence counters are not needed.
    shops_by_province = {}
    for province, shop_name in zip(dataframe['province'], dataframe['shop_name']):
        shops_by_province.setdefault(province, set()).add(shop_name)
    return {province: len(shops) for province, shops in shops_by_province.items()}

In [22]:
#去重
from pyecharts.charts import Map 

# Distinct shops per province (each shop counted once, however many listings it has).
province_count = countShop(orginal_data)
shop_pairs = [*province_count.items()]

# China map coloured by the per-province shop count.
map1 = Map(init_opts=opts.InitOpts(width='1350px', height='750px'))
map1.add("", shop_pairs, maptype='china')
map1.set_global_opts(
    title_opts=opts.TitleOpts(title='国内各省份螺狮粉店铺分布'),
    visualmap_opts=opts.VisualMapOpts(max_=2000),
    toolbox_opts=opts.ToolboxOpts(),
)
map1.render()

'C:\\WINDOWS\\system32\\render.html'

In [24]:
from pyecharts.charts import Map 

# Summed purchase count per city within Guangxi, highest first.
guangxi_rows = orginal_data[orginal_data['province'] == '广西']
city_num = guangxi_rows.groupby('city')['purchase_num_new'].sum().sort_values(ascending=False)

# Pair every city with ITS OWN total. The previous code zipped the city names
# with province_num.values (per-province figures left over from an earlier
# cell), so the map showed unrelated numbers.
# The '市' suffix is presumably needed to match the region names of the '广西'
# map -- verify against the map's region list.
city_pairs = [
    list(pair)
    for pair in zip((city_num.index + '市').tolist(), city_num.values.tolist())
]

# Guangxi map; the colour scale now follows the data (same approach as the bar
# charts) instead of the fixed 4000 that only suited the wrong values.
map1 = Map(init_opts=opts.InitOpts(width='1350px', height='750px'))
map1.add("", city_pairs, maptype='广西')
map1.set_global_opts(
    title_opts=opts.TitleOpts(title='广西省各市螺狮粉销量分布'),
    visualmap_opts=opts.VisualMapOpts(max_=city_num.values.max()),
    toolbox_opts=opts.ToolboxOpts(),
)
map1.render()

'C:\\WINDOWS\\system32\\render.html'

In [30]:
from pyecharts.charts import Pie

# Summed purchase count per price band.
price_cut_num = orginal_data.groupby('price_cut')['purchase_num_new'].sum()
data_pair = [[band, total] for band, total in zip(price_cut_num.index, price_cut_num.values)]

# Rich-text styles referenced by name ({a|...}, {abg|}, {hr|}, {b|...}, {per|...})
# in the label formatter below.
label_rich_styles = {
    "a": {"color": "#999", "lineHeight": 22, "align": "center"},
    "abg": {
        "backgroundColor": "#e3e3e3",
        "width": "100%",
        "align": "right",
        "height": 22,
        "borderRadius": [4, 4, 0, 0],
    },
    "hr": {
        "borderColor": "#aaa",
        "width": "100%",
        "borderWidth": 0.5,
        "height": 0,
    },
    "b": {"fontSize": 16, "lineHeight": 33},
    "per": {
        "color": "#eee",
        "backgroundColor": "#334455",
        "padding": [2, 4],
        "borderRadius": 2,
    },
}
slice_label_opts = opts.LabelOpts(
    position="outside",
    formatter="{a|{a}}{abg|}\n{hr|}\n {b|{b}: }{c}  {per|{d}%}  ",
    background_color="#eee",
    border_color="#aaa",
    border_width=1,
    border_radius=4,
    rich=label_rich_styles,
)

# Ring-shaped pie chart with the rich-text labels placed outside the ring.
pie1 = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))
pie1.add(
    series_name="sales",
    radius=["35%", "55%"],
    data_pair=data_pair,
    label_opts=slice_label_opts,
)
pie1.set_global_opts(
    legend_opts=opts.LegendOpts(pos_left="left", pos_top='30%', orient="vertical"),
    toolbox_opts=opts.ToolboxOpts(),
    title_opts=opts.TitleOpts(title='螺狮粉不同价格销量占比'),
)
pie1.set_series_opts(
    tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"),
)
pie1.render()

'C:\\WINDOWS\\system32\\render.html'