### Waffle Plots

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

# Load the pokemon dataset into a dataframe; the path is relative to the
# current working directory.
pokemon = pd.read_csv('data/pokemon.csv')

One alternative univariate plot type that you might see for categorical data is the waffle plot, also known as the square pie chart. While the standard pie chart uses a circle to represent the whole, a waffle plot is plotted onto a square divided into a 10x10 grid. Each small square in the grid represents one percent of the data, and a number of squares are colored by category to indicate total proportions. Compared to a pie chart, it is much easier to make precise assessments of relative frequencies.

In [1]:
def percentage_blocks(df, var):
    """
    Take as input a dataframe and variable, and return a Pandas series with
    approximate percentage values for filling out a waffle plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column to summarise.
    var : column label
        Name of the column in ``df`` whose categories are counted.

    Returns
    -------
    pandas.Series
        Integer number of squares per category, indexed by category in
        ``value_counts`` order (most frequent first). The values sum to 100
        whenever the column holds at least one non-missing value; an empty
        series is returned otherwise.
    """
    freqs = df[var].value_counts()
    if freqs.empty:
        # no non-missing observations: nothing to apportion
        return freqs.astype(int)

    # compute base quotas
    # value_counts() drops missing values, so normalise by the number of
    # non-missing observations rather than df.shape[0]; otherwise the quotas
    # fall short of 100 and the remainder step below over-allocates.
    percentages = 100 * freqs / freqs.sum()
    counts = np.floor(percentages).astype(int) # integer part = minimum quota
    decimal = (percentages - counts).sort_values(ascending = False)

    # add in additional counts to reach 100: the categories with the largest
    # fractional parts each receive one extra square
    rem = 100 - counts.sum()
    counts.loc[decimal.index[:rem]] += 1

    return counts

In [9]:
# Raw frequency of each distinct height value, most common first.
pokemon['height'].value_counts()

0.6     73
0.4     62
1.0     61
0.5     61
0.3     54
1.2     50
0.8     44
1.5     44
0.7     40
1.1     37
0.9     34
1.3     29
1.4     29
1.6     25
1.7     22
1.8     22
2.0     22
0.2     18
1.9     13
2.1      8
2.2      6
2.5      5
0.1      5
2.4      3
3.0      3
2.3      3
9.2      2
4.0      2
4.5      2
5.5      2
3.2      2
3.5      2
3.8      2
2.7      2
14.5     1
7.0      1
6.5      1
5.0      1
6.2      1
3.7      1
5.8      1
4.2      1
8.8      1
2.9      1
2.6      1
5.4      1
3.4      1
5.2      1
3.6      1
2.8      1
3.9      1
3.3      1
Name: height, dtype: int64

In [10]:
# The same counts expressed as a fraction of all rows in the dataframe.
pokemon['height'].value_counts() / pokemon.shape[0]

0.6     0.090458
0.4     0.076828
1.0     0.075589
0.5     0.075589
0.3     0.066914
1.2     0.061958
0.8     0.054523
1.5     0.054523
0.7     0.049566
1.1     0.045849
0.9     0.042131
1.3     0.035936
1.4     0.035936
1.6     0.030979
1.7     0.027261
1.8     0.027261
2.0     0.027261
0.2     0.022305
1.9     0.016109
2.1     0.009913
2.2     0.007435
2.5     0.006196
0.1     0.006196
2.4     0.003717
3.0     0.003717
2.3     0.003717
9.2     0.002478
4.0     0.002478
4.5     0.002478
5.5     0.002478
3.2     0.002478
3.5     0.002478
3.8     0.002478
2.7     0.002478
14.5    0.001239
7.0     0.001239
6.5     0.001239
5.0     0.001239
6.2     0.001239
3.7     0.001239
5.8     0.001239
4.2     0.001239
8.8     0.001239
2.9     0.001239
2.6     0.001239
5.4     0.001239
3.4     0.001239
5.2     0.001239
3.6     0.001239
2.8     0.001239
3.9     0.001239
3.3     0.001239
Name: height, dtype: float64

In [11]:
# Turn the proportions into whole-number square counts for the waffle grid.
percentage_blocks(pokemon, 'height')

0.6     9
0.4     8
1.0     8
0.5     8
0.3     7
1.2     6
0.8     6
1.5     6
0.7     5
1.1     5
0.9     4
1.3     4
1.4     4
1.6     3
1.7     3
1.8     3
2.0     3
0.2     2
1.9     2
2.1     1
2.2     1
2.5     1
0.1     1
2.4     0
3.0     0
2.3     0
9.2     0
4.0     0
4.5     0
5.5     0
3.2     0
3.5     0
3.8     0
2.7     0
14.5    0
7.0     0
6.5     0
5.0     0
6.2     0
3.7     0
5.8     0
4.2     0
8.8     0
2.9     0
2.6     0
5.4     0
3.4     0
5.2     0
3.6     0
2.8     0
3.9     0
3.3     0
Name: height, dtype: int64

To **plot** those counts as boxes in waffle-plot form, use Matplotlib's __bar__ function.

In [16]:
# Show what a plain range() loop over the series length yields: integer
# positions 0..n-1, not the index labels (heights) of waffle_counts.
waffle_counts = percentage_blocks(pokemon, 'height')
n_categories = len(waffle_counts)
for cat in range(n_categories):
    print(cat)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51


In [12]:
# Draw the percentage waffle: one run of squares per category on a 10-wide
# grid, filled left to right and bottom to top.
waffle_counts = percentage_blocks(pokemon, 'height')

prev_count = 0
# for each category,
# (iterate over the counts themselves: the index holds height labels, so
# waffle_counts[0] is a label lookup and raises KeyError: 0.0)
for n_blocks in waffle_counts:
    if n_blocks == 0:
        continue # category rounds to zero squares; nothing to draw
    # get the block indices
    blocks = np.arange(prev_count, prev_count + n_blocks)
    # and put a block at each index's location
    x = blocks % 10 # use mod operation to get ones digit
    y = blocks // 10 # use floor division to get tens digit
    plt.bar(x = x, height = 0.8, width = 0.8, bottom = y)
    prev_count += n_blocks

In [17]:
# Percentage waffle with a category legend: one run of squares per category
# on a 10-wide grid, filled left to right and bottom to top.
waffle_counts = percentage_blocks(pokemon, 'height')

prev_count = 0
# for each category (index label, number of squares),
# (iterating label/count pairs avoids waffle_counts[0], which is a label
# lookup on the height index and raises KeyError: 0.0)
for cat, n_blocks in waffle_counts.items():
    if n_blocks == 0:
        continue # category rounds to zero squares; no bar, no legend entry
    # get the block indices
    blocks = np.arange(prev_count, prev_count + n_blocks)
    # and put a block at each index's location
    x = blocks % 10 # use mod operation to get ones digit
    y = blocks // 10 # use floor division to get tens digit
    # label each bar group directly so the legend cannot pair a category
    # with the wrong group
    plt.bar(x = x, height = 0.8, width = 0.8, bottom = y, label = str(cat))
    prev_count += n_blocks

# aesthetic wrangling
plt.legend(bbox_to_anchor = (1, 0.5), loc = 6)
plt.axis('off')
plt.axis('square');

In [18]:
# each box represents five full counts
waffle_counts = (df['cat_var'].value_counts() / 5).astype(int)

prev_count = 0
# for each category,
for cat in range(waffle_counts.shape[0]):
    # get the block indices
    blocks = np.arange(prev_count, prev_count + waffle_counts[cat])
    # and put a block at each index's location
    x = blocks % 10
    y = blocks // 10
    plt.bar(y, 0.8, 0.8, x)
    prev_count += waffle_counts[cat]

# box size legend
plt.bar(7.5, 0.8, 0.8, 2, color = 'white', edgecolor = 'black', lw = 2)
plt.text(8.1, 2.4,'= 5 data points', va = 'center')

# aesthetic wrangling
plt.legend(waffle_counts.index, bbox_to_anchor = (0.8, 0.5), loc = 6)
plt.axis('off')
plt.axis('square')

NameError: name 'df' is not defined