## Import Libraries

In [1]:
import os

import numpy as np
import pandas as pd

## Load the Data

In [2]:
# Folder that holds the data files.
# The location can be overridden with the BOOK_DATA_DIR environment variable so the
# notebook also runs on other machines; when it is not set, the original local path is used.
path = os.environ.get('BOOK_DATA_DIR', 'C:/Users/Kim/Desktop/Book/python-main/data/')
# File names are built with `path + '...'`, so the folder must end with a separator
if not path.endswith(('/', '\\')):
    path += '/'

In [3]:
# Read the sales CSV from the data folder into a DataFrame
data_file = path + 'Sales data/Data.csv'
df = pd.read_csv(data_file)

## Create Date Variables and Sort the Data

In [4]:
# Derive year / month text columns from OrderDate
# (characters 0-3 and 5-6; presumably OrderDate looks like 'YYYY-MM-DD' - confirm against the CSV)
order_date = df['OrderDate']
df['year'] = order_date.str[0:4]
df['month'] = order_date.str[5:7]
# Sort the rows by the grouping columns
sort_keys = ['Region', 'Channel', 'Category', 'Item Type', 'year', 'month', 'Gender']
df = df.sort_values(by=sort_keys)

### Set the Decimal Display Format

In [5]:
# Display floats with exactly two decimal places in pandas output
# (a fixed-point format also keeps values out of exponential notation)
two_decimals = '{:.2f}'.format
pd.set_option('display.float_format', two_decimals)

## Process the Data for Visualization

In [6]:
# Keep only the 2020 rows and the four columns the Sankey diagram needs.
# Columns are picked by name instead of by position, so the cell does not silently
# select the wrong data if the CSV's column order ever changes.
# 'year' holds text, which is why it is compared with the string '2020'.
sankey_cols = ['Region', 'Channel', 'Category', 'Revenue']
df_g = df.loc[df['year'] == '2020', sankey_cols].copy()
# Order the rows by region, channel and category (not by revenue)
df_g = df_g.sort_values(by = ['Region','Channel','Category'])
df_g.head(2)

Unnamed: 0,Region,Channel,Category,Revenue
2009,Africa,Offline,Beauty & Health,30171.1
10034,Africa,Offline,Beauty & Health,38133.8


In [7]:
# Total revenue for every Region -> Channel pair (first layer of Sankey links).
# 'Revenue' is selected explicitly because df_g also contains the text column 'Category':
# a bare .sum() relies on pandas < 2.0 silently dropping it, while pandas >= 2.0
# concatenates its strings instead.
# groupby sorts the keys by default, so rows come out ordered by Region, then Channel.
value1 = df_g.groupby(by = ['Region','Channel'], as_index = False)['Revenue'].sum()
value1

Unnamed: 0,Region,Channel,Revenue
0,Africa,Offline,4015718.1
1,Africa,Online,12342417.5
2,America,Offline,13206995.2
3,America,Online,39311590.6
4,Asia,Offline,27794901.85
5,Asia,Online,80291457.54
6,Europe,Offline,27142873.33
7,Europe,Online,75434600.1
8,Oceania,Offline,5444565.4
9,Oceania,Online,16521398.5


In [8]:
# Total revenue for every Channel -> Category pair (second layer of Sankey links).
# 'Revenue' is selected explicitly because df_g also contains the text column 'Region':
# a bare .sum() relies on pandas < 2.0 silently dropping it, while pandas >= 2.0
# concatenates its strings instead.
# groupby sorts the keys by default, so rows come out ordered by Channel, then Category.
value2 = df_g.groupby(by = ['Channel','Category'], as_index = False)['Revenue'].sum()
value2

Unnamed: 0,Channel,Category,Revenue
0,Offline,Beauty & Health,12873974.4
1,Offline,Clothes,2482423.2
2,Offline,Foods,57785592.18
3,Offline,Home,2770573.3
4,Offline,Office,1692490.8
5,Online,Beauty & Health,37548449.2
6,Online,Clothes,6740321.6
7,Online,Foods,158206931.14
8,Online,Home,14696612.5
9,Online,Office,6709149.8


In [9]:
# Reference: Plotly Sankey diagram guide - https://plotly.com/python/sankey-diagram/

In [10]:
import plotly.graph_objects as go

In [11]:
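# label  : list of node names; a node's position in this list is its index
# source : index of the node where each link starts
# target : index of the node where each link ends
# value  : size (flow amount) of each link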

In [13]:
# Minimal Sankey: one region (Africa) flowing into the two sales channels.
# The link values are Africa's Offline / Online revenue totals from value1.
node_spec = {'label': ['Africa', 'Offline', 'Online']}
link_spec = {
    'source': [0, 0],                     # both links start at node 0 (Africa)
    'target': [1, 2],                     # and end at node 1 (Offline) / node 2 (Online)
    'value': [4015718.1, 12342417.5],
}
trace = go.Sankey(node=node_spec, link=link_spec)
data = [trace]
layout = go.Layout(title='Chapter 2.4 - Sankey Diagram', font_size=15)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [15]:
# Same Africa -> channel diagram, now with explicit node coordinates.
node_spec = {
    'label': ['Africa', 'Offline', 'Online'],
    'x': [0, 1, 1],        # x node location
    'y': [0, 0.1, 0.7],    # y node location
}
link_spec = {
    'source': [0, 0],
    'target': [1, 2],
    'value': [4015718.1, 12342417.5],
}
trace = go.Sankey(node=node_spec, link=link_spec)
data = [trace]
layout = go.Layout(title='Chapter 2.4 - Sankey Diagram (Adjust Node Location)', font_size=15)
fig = go.Figure(data=data, layout=layout)
fig.show()

In [16]:
# Node names, written out by hand: regions (indices 0-4), channels (5-6), categories (7-11)
region_names = ['Africa', 'America', 'Asia', 'Europe', 'Oceania']
channel_names = ['Offline', 'Online']
category_names = ['Beauty & Health', 'Clothes', 'Foods', 'Home', 'Office']
labels = region_names + channel_names + category_names

# Link endpoints, listed in the same row order as value1 followed by value2
sources = [
    0, 0, 1, 1, 2, 2, 3, 3, 4, 4,        # Region -> Channel links start at a region
    5, 5, 5, 5, 5, 6, 6, 6, 6, 6,        # Channel -> Category links start at a channel
]
targets = [
    5, 6, 5, 6, 5, 6, 5, 6, 5, 6,        # ... and end at a channel
    7, 8, 9, 10, 11, 7, 8, 9, 10, 11,    # ... and end at a category
]

# Link sizes: Region/Channel revenue totals followed by Channel/Category totals
values = [*value1['Revenue'], *value2['Revenue']]

In [19]:
# Full Region -> Channel -> Category Sankey built from the hand-written index lists.
# Nodes get spacing (pad), bar thickness, a thin black outline and a uniform blue fill.
node_spec = dict(label = labels,
                 pad = 15,
                 thickness = 20,
                 line = dict(color = 'black', width = 0.5),
                 color = 'blue')
link_spec = dict(source = sources,
                 target = targets,
                 value = values)
trace = go.Sankey(node = node_spec, link = link_spec)
data = [trace]
# No node x/y positions are set in this figure, so the title no longer carries the
# "(Adjust Node Location)" suffix that was copied from the previous example.
layout = go.Layout(title = 'Chapter 2.4 - Sankey Diagram', font_size = 15)
fig = go.Figure(data, layout)
fig.show()

## Automating source & target generation from the number of labels

In [21]:
# label
# The names are sorted (and missing values dropped) so that their order always matches the
# sorted group keys produced by groupby() in value1 / value2. Plain unique() returns names in
# order of first appearance, which lines up with the revenue rows only by coincidence.
l_c1 = sorted(df_g['Region'].dropna().unique())    # 5 values (order = 0 1 2 3 4)
l_c2 = sorted(df_g['Channel'].dropna().unique())   # 2 values (order = 5 6)
l_c3 = sorted(df_g['Category'].dropna().unique())  # 5 values (order = 7 8 9 10 11)
labels = l_c1 + l_c2 + l_c3 # 12 values (order = 0 ~ 11)

In [22]:
# Show the combined label list; each name's position is the node index used by source/target
print(labels)

['Africa', 'America', 'Asia', 'Europe', 'Oceania', 'Offline', 'Online', 'Beauty & Health', 'Clothes', 'Foods', 'Home', 'Office']


In [23]:
# source
# source
n_region, n_channel, n_category = len(l_c1), len(l_c2), len(l_c3)
# Region -> Channel links: every region index is repeated once per channel
source1 = list(np.repeat(range(n_region), n_channel))
# Channel -> Category links: every channel index (numbered after the regions) is repeated once per category
source2 = list(np.repeat(range(n_region, n_region + n_channel), n_category))
sources = [*source1, *source2]

In [25]:
# Show the generated start-node index of every link
print(sources)

[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6]


In [26]:
# target
# target
first_channel = len(l_c1)                       # channel indices start right after the regions
first_category = first_channel + len(l_c2)      # category indices start right after the channels
end_category = first_category + len(l_c3)
# Region -> Channel links: the full run of channel indices, once per region
target1 = [node for _ in l_c1 for node in range(first_channel, first_category)]
# Channel -> Category links: the full run of category indices, once per channel
target2 = [node for _ in l_c2 for node in range(first_category, end_category)]
targets = target1 + target2

In [27]:
# Show the generated end-node index of every link
print(targets)

[5, 6, 5, 6, 5, 6, 5, 6, 5, 6, 7, 8, 9, 10, 11, 7, 8, 9, 10, 11]


In [28]:
# value
# value
# Link sizes: Region/Channel revenue totals followed by Channel/Category revenue totals
values = [*value1['Revenue'], *value2['Revenue']]

In [29]:
# Full Region -> Channel -> Category Sankey built from the generated labels and index lists.
node_spec = {
    'label': labels,
    'pad': 15,                                   # gap between nodes
    'thickness': 20,                             # node bar thickness
    'line': {'color': 'black', 'width': 0.5},    # node outline
    'color': 'blue',
}
link_spec = {
    'source': sources,
    'target': targets,
    'value': values,
}
trace = go.Sankey(node=node_spec, link=link_spec)
data = [trace]
layout = go.Layout(title='Chapter 2.4 - Sankey Diagram', font_size=15)
fig = go.Figure(data=data, layout=layout)
fig.show()