### Data visualization
#### (Slide) Effective Data Presentation (1)

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Relationship - two variables
#### Scatter Plot Chart
matplotlib.pyplot.scatter(x, y, s=None, c=None, marker=None, cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, verts=None, edgecolors=None, *, data=None, **kwargs)

A scatter plot of y vs x with varying marker size and/or color.

In [None]:
# Scatter plot of the first two iris feature columns (labelled sepal length
# and sepal width below), with one marker and colour per species.
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)

fig = plt.figure(dpi=150, figsize=(4, 3))

# (target value, marker, colour, legend label, extra scatter kwargs).
# Only the first species gets a black marker outline.
species_styles = [
    (0, 'o', 'red', 'setosa', {'edgecolor': 'black'}),
    (1, 'D', 'green', 'versicolor', {}),
    (2, 's', 'blue', 'virginica', {}),
]
for target, marker, color, label, extra in species_styles:
    # A boolean mask selects the rows of one species and yields plain 1-D
    # coordinate arrays; indexing with the tuple returned by np.where()
    # would produce (1, n) 2-D arrays instead.
    mask = (y == target)
    plt.scatter(X[mask, 0], X[mask, 1], marker=marker, s=10, c=color,
                label=label, alpha=0.5, **extra)

# Uncomment to fix the axis ranges so that both axes start at zero:
# plt.xlim(0, 8)
# plt.ylim(0, 4.5)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.xlabel('Sepal Length (cm)', fontsize=8)
plt.ylabel('Sepal Width (cm)', fontsize=8)
plt.title('Scatter Plot Chart', fontsize=8)
# A tuple `loc` places the legend in axes coordinates, here right of the plot.
plt.legend(loc=(1.05, 0.5), fontsize=8, ncol=1)

### Relationship - three variables
#### Bubble Plot

In [None]:
# Bubble plot: 40 random points whose marker size encodes a third variable.
# A single (3, 40) draw consumes the random stream in the same order as three
# consecutive 40-element draws.
x, y, z = np.random.rand(3, 40)
plt.scatter(x, y, s=1000 * z, alpha=0.5)

#### Line Graph
matplotlib.pyplot.plot(*args, scalex=True, scaley=True, data=None, **kwargs)

Plot y versus x as lines and/or markers.

In [None]:
# Line graph: four margin series plotted over the years 2013-2017.
# The years are strings, so the x-axis is categorical.
years = list(map(str, range(2013, 2018)))
gross_margin = [43.96, 48.75, 43.23, 35.64, 35.63]
operating_margin = [18.55, 22.17, 12.15, 8.38, 4.12]
net_bf_tax = [21.72, 24.57, 13.77, 9.88, 11.43]
net_af_tax = [20.20, 21.78, 12.08, 8.72, 10.10]

# (series, line colour, legend label), drawn in this order.
margin_series = (
    (gross_margin, 'orange', 'gross margin'),
    (operating_margin, 'blue', 'operating margin'),
    (net_bf_tax, 'red', 'net margin b/f tax'),
    (net_af_tax, 'green', 'net margin a/f tax'),
)
for series, line_color, legend_text in margin_series:
    plt.plot(years, series, color=line_color, label=legend_text)
plt.ylabel('Margins (%)')
plt.legend()

In [None]:
# Reload iris and reduce it to a two-feature, two-class problem.
X, y = load_iris(return_X_y=True)
X = X[:, :2]  # keep only the first two feature columns
y = (y != 0).astype(int)  # 0 where the original target is 0, otherwise 1

from sklearn.linear_model import LinearRegression
# NOTE(review): a regression model is used as a classifier here by
# thresholding its continuous prediction at 0.5 (see the last line).
clf = LinearRegression()
clf.fit(X, y)
print(clf.coef_, clf.intercept_)
# Threshold the predictions at 0.5 to get 0/1 labels; as the final expression
# of the cell, the resulting array is what the notebook displays.
(clf.predict(X) >= 0.5).astype(int)

#### matplotlib.pyplot.contourf(*args, data=None, **kwargs)
Plot filled contours.

#### matplotlib.pyplot.contour(*args, data=None, **kwargs)
Plot contour lines.

In [None]:
# Decision regions of the thresholded model `clf` over the two feature
# columns of X, with the data points drawn on top.
iris = load_iris()

# 101 x 101 grid spanning the observed range of both columns.  The ndarray
# .min()/.max() methods are used instead of the Python built-ins, which
# iterate over the array element by element.
x_lo, x_hi = X[:, 0].min(), X[:, 0].max()
y_lo, y_hi = X[:, 1].min(), X[:, 1].max()
xx, yy = np.meshgrid(np.linspace(x_lo, x_hi, 101), np.linspace(y_lo, y_hi, 101))

# Predict at every grid point, threshold at 0.5, then restore the grid shape
# once so both contour calls share the same array.
zz = clf.predict(np.c_[xx.ravel(), yy.ravel()])
zz = (zz >= 0.5).astype(int)
zz_grid = zz.reshape(xx.shape)
plt.contourf(xx, yy, zz_grid, cmap=plt.cm.coolwarm, alpha=0.3)
plt.contour(xx, yy, zz_grid)

# Boolean masks give plain 1-D coordinate arrays; indexing with the tuple
# returned by np.where() would produce (1, n) 2-D arrays instead.
is_class0 = (y == 0)
is_class1 = (y == 1)
plt.scatter(X[is_class0, 0], X[is_class0, 1], marker='o', s=10, c='blue', label='setosa')
plt.scatter(X[is_class1, 0], X[is_class1, 1], marker='^', s=10, c='red', label='not setosa')
plt.xlim(x_lo, x_hi)
plt.ylim(y_lo, y_hi)
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.legend(loc='lower right')

#### subplots
matplotlib.pyplot.subplots(nrows=1, ncols=1, sharex=False, sharey=False, squeeze=True, subplot_kw=None, gridspec_kw=None, **fig_kw)

Create a figure and a set of subplots.

In [None]:
# One panel per iris feature: feature value (x) against species name (y).
iris = load_iris()
X, y = iris.data, iris.target
# Translate every numeric target into its species name.
y_named = [iris.target_names[target] for target in y]

fig, axes = plt.subplots(2, 2, sharey=True, figsize=(8, 8))
plt.subplots_adjust(wspace=0.1, hspace=0.3)

for i, ax in enumerate(axes.ravel()):
    ax.scatter(X[:, i], y_named)
    ax.set_xlabel(f'{iris.feature_names[i]}')
    # Hide the top and right frame lines of each panel.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    

#### Column Chart
matplotlib.pyplot.bar(x, height, width=0.8, bottom=None, *, align='center', data=None, **kwargs)

Make a bar plot.

In [None]:
# Column chart of quarterly EPS, labelled 1Q14 through 4Q18.
EPS = np.array([6.82, 8.03, 8.51, 6.65, 4.62, 4.06, 5.09, 2.83, 2.79, 4.16, 4.98, 3.21, 4.29, 1.51, 3.26, 6.46, 1.69, 4.75, 4.39, 0])
# Quarter labels such as '1Q14', years outermost.  A comprehension keeps its
# loop variables local, so they cannot clobber notebook-level names such as
# `y` (the iris targets used by other cells).
qtr = [q + str(yr) for yr in range(14, 19) for q in ['1Q', '2Q', '3Q', '4Q']]

plt.bar(qtr, EPS)
plt.xticks(rotation=45)
plt.ylabel('EPS')
plt.title('MediaTek')

In [None]:
# Grouped column chart: 2016 and 2017 EPS side by side for each quarter.
labels = ['Q1', 'Q2', 'Q3', 'Q4']
eps_2016 = [2.79, 4.16, 4.98, 3.21]
eps_2017 = [4.29, 1.51, 3.26, 6.46]

# With align='edge', a negative width aligns a bar on its right edge and a
# positive width on its left edge, so the two years sit on either side of
# the same tick.
p1 = plt.bar(labels, eps_2016, color='blue', width=-0.4, align='edge', label='2016')
p2 = plt.bar(labels, eps_2017, color='orange', width=0.4, align='edge', label='2017')
plt.ylabel('EPS')
plt.legend()

# Write each value as text at a height 0.5 below the value itself.
# NOTE(review): the x offsets (-0.3 / +0.1 from get_x()) look hand-tuned to
# land inside the 0.4-wide bars; re-check them if the widths change.
for i, (r1, r2) in enumerate(zip(p1, p2)):
    plt.text(r1.get_x()-0.3, eps_2016[i] - 0.5, eps_2016[i], color='white')
    plt.text(r2.get_x()+0.1, eps_2017[i] - 0.5, eps_2017[i], color='black')

#### Bar Graph
matplotlib.pyplot.barh(y, width, height=0.8, left=None, *, align='center', **kwargs)

Make a horizontal bar plot.

In [None]:
# Horizontal bar chart of unit shipments per maker.
makers = ['Samsung', 'Apple', 'Huawei', 'Oppo', 'Xiaomi']
sales = [318, 216, 154, 112, 93]

# Flip both lists together so they stay aligned; the first maker listed above
# is then passed to barh last.
makers = makers[::-1]
sales = sales[::-1]
plt.barh(makers, sales)
plt.xlabel('Unit Shipments (million)')

In [None]:
# Grouped horizontal bars: 2016 and 2017 shipments for each maker.
makers = ['Samsung', 'Apple', 'Huawei', 'Oppo', 'Xiaomi']
sales_2016 = [311, 215, 139, 100, 53]
sales_2017 = [318, 216, 154, 112, 93]
# Reverse all three lists together so labels and values stay aligned.
makers.reverse()
sales_2016.reverse()
sales_2017.reverse()

# With align='edge', heights of opposite sign put the two years on opposite
# sides of the same tick.
p1 = plt.barh(makers, sales_2016, color='blue', height=0.4, align='edge', label='2016')
p2 = plt.barh(makers, sales_2017, color='orange', height=-0.4, align='edge', label='2017')
plt.xlabel('Unit Shipments (million)')
plt.legend()

# Write each value as text starting at the bar's get_x() position.
# NOTE(review): the vertical offsets (+0.1 / -0.3 from get_y()) look
# hand-tuned to land inside the 0.4-high bars; re-check if heights change.
for i, (r1, r2) in enumerate(zip(p1, p2)):
    plt.text(r1.get_x(), r1.get_y()+0.1, sales_2016[i], color='white')
    plt.text(r2.get_x(), r2.get_y()-0.3, sales_2017[i], color='black')

#### Pie Chart
matplotlib.pyplot.pie(x, explode=None, labels=None, colors=None, autopct=None, pctdistance=0.6, shadow=False, labeldistance=1.1, startangle=None, radius=None, counterclock=True, wedgeprops=None, textprops=None, center=(0, 0), frame=False, rotatelabels=False, *, data=None)

Plot a pie chart.

In [None]:
# Pie chart of 2017 shipments by maker, with the 'Apple' slice pulled out.
makers = ['Samsung', 'Apple', 'Huawei', 'Oppo', 'Xiaomi', 'Others']
sales_2017 = [318, 216, 154, 112, 93, 591]
# Offset of 0.1 for 'Apple', 0 for every other slice.
slice_offsets = tuple(0.1 if maker == 'Apple' else 0 for maker in makers)
plt.pie(
    sales_2017,
    labels=makers,
    autopct='%.1f%%',
    shadow=True,
    startangle=90,
    explode=slice_offsets,
)
# An equal aspect ratio keeps the pie circular.
plt.axis('equal')

#### Histogram
matplotlib.pyplot.hist(x, bins=None, range=None, density=None, weights=None, cumulative=False, bottom=None, histtype='bar', align='mid', orientation='vertical', rwidth=None, log=False, color=None, label=None, stacked=False, normed=None, *, data=None, **kwargs)

Plot a histogram.

In [None]:
# Histogram of the first column of X with an arrow marking the tallest bin.
hist_data = plt.hist(X[:, 0], bins=15, color='yellowgreen', edgecolor='black')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Frequency')

# hist_data[0] holds the per-bin counts and hist_data[1] the bin edges, which
# has one more entry than the counts.
counts, edges = hist_data[0], hist_data[1]
peak = int(np.argmax(counts))  # first bin holding the largest count
peak_count = counts[peak]
# Aim the arrow at the middle of the tallest bar.  edges[peak] alone is only
# the bar's left edge, so the midpoint of its two edges is used instead.
peak_center = (edges[peak] + edges[peak + 1]) / 2
plt.annotate('max', xy=(peak_center, peak_count), xytext=(peak_center+0.5, peak_count+3),
             arrowprops=dict(facecolor='red', width=3, headwidth=8, headlength=8))
plt.xlim(4, 8)
plt.ylim(0, 25)

In [None]:
# Show the first two items returned by plt.hist: bin counts, then bin edges.
counts, bin_edges = hist_data[0], hist_data[1]
print(counts)
print(bin_edges)

In [None]:
# Per-feature histograms of the breast-cancer data, one panel per feature,
# with the two target groups overlaid.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

fig, axes = plt.subplots(15, 2, figsize=(10, 20))
ax = axes.ravel()
# Split the samples by target value (0 -> malignant, 1 -> benign).
malignant = cancer.data[cancer.target == 0]
benign = cancer.data[cancer.target == 1]

for i in range(30):
    panel = ax[i]
    # Bin edges come from the whole dataset so both groups share the same bins.
    bin_edges = np.histogram(cancer.data[:, i], bins=50)[1]
    for group, shade in ((malignant, 'r'), (benign, 'b')):
        panel.hist(group[:, i], bins=bin_edges, color=shade, alpha=.5)
    panel.set_title(cancer.feature_names[i])
    panel.set_yticks(())

# Axis labels and the legend are attached to the first panel only.
first_panel = ax[0]
first_panel.set_xlabel("Feature magnitude")
first_panel.set_ylabel("Frequency")
first_panel.legend(["malignant", "benign"], loc="best")
fig.tight_layout()

#### matplotlib.pyplot.matshow(A, fignum=None, **kwargs)
Display an array as a matrix in a new figure window.

In [None]:
# Principal Component Analysis (PCA)
# Hard-coded table of two rows with 30 values each, shown as a heat map with
# one column per entry of cancer.feature_names.
# NOTE(review): presumably the first two PCA components fitted on the
# breast-cancer data; the fitting code is not part of this cell.
pca_components = [
    [0.21890244, 0.10372458, 0.22753729, 0.22099499, 0.14258969, 0.23928535,
    0.25840048, 0.26085376, 0.13816696, 0.06436335, 0.20597878, 0.01742803,
    0.21132592, 0.20286964, 0.01453145, 0.17039345, 0.15358979, 0.1834174,
    0.04249842, 0.10256832, 0.22799663, 0.10446933, 0.23663968, 0.22487053,
    0.12795256, 0.21009588, 0.22876753, 0.25088597, 0.12290456, 0.13178394],
    [-0.23385713, -0.05970609, -0.21518136, -0.23107671, 0.18611302, 0.15189161,
    0.06016536, -0.0347675, 0.19034877, 0.36657547, -0.10555215, 0.08997968,
    -0.08945723, -0.15229263, 0.20443045, 0.2327159, 0.19720728, 0.13032156,
    0.183848, 0.28009203, -0.21986638, -0.0454673, -0.19987843, -0.21935186,
    0.17230435,  0.14359317, 0.09796411, -0.00825724, 0.14188335,  0.27533947]]

# matshow opens its own figure; the calls below all act on that figure.
plt.matshow(pca_components, cmap='coolwarm')

# One y tick per row, named after the component it holds.
plt.yticks([0, 1], ["First component", "Second component"])
plt.colorbar()
# One x tick per column, labelled with the feature name; requires `cancer`
# from the breast-cancer cell to be loaded.
plt.xticks(range(30), cancer.feature_names, rotation=60, ha='left')
plt.xlabel("Feature")
plt.ylabel("Principal components")

#### matplotlib.pyplot.imshow(X, cmap=None, norm=None, aspect=None, interpolation=None, alpha=None, vmin=None, vmax=None, origin=None, extent=None, shape=None, filternorm=1, filterrad=4.0, imlim=None, resample=None, url=None, *, data=None, **kwargs)
Display an image, i.e. data on a 2D regular raster.

In [None]:
# Show the first eight digit images in a 2 x 4 grid, titled with their labels.
from sklearn.datasets import load_digits
digits = load_digits()
print(f'Image dimension = {digits.images.shape}')
print(f'Data dimension = {digits.data.shape}')

# Pair every image with its target value.
images_and_labels = list(zip(digits.images, digits.target))
# Subplot positions are 1-based, so the enumeration starts at 1.
for position, (image, label) in enumerate(images_and_labels[:8], start=1):
    plt.subplot(2, 4, position)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.binary, interpolation='nearest')
    plt.title('Digit: %i' % label)

In [None]:
# First five digits in one row, each rebuilt from its flat data row into an
# 8 x 8 image and titled with its target value.
from sklearn.datasets import load_digits
digits = load_digits()
print(digits.data.shape)

fig, axes = plt.subplots(1, 5, figsize=(8, 3))

# zip stops after the five axes, so only the first five samples are drawn.
for ax, flat_pixels, digit in zip(axes.ravel(), digits.data, digits.target):
    ax.imshow(flat_pixels.reshape(8, 8), cmap=plt.cm.binary, interpolation='nearest')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.axis('off')
    ax.set_title(digit)


#### Bonus: Stacked Bar Chart

In [None]:
def get_cumulated_array(data, **kwargs):
    """Return, for every row, the running total of the clipped rows above it.

    Helper for stacked bar charts. ``kwargs`` are forwarded to
    ``data.clip`` (e.g. ``min=0`` keeps only the positive part, ``max=0`` only
    the negative part). The clipped values are summed cumulatively along
    axis 0 and shifted down by one row, so row 0 of the result is all zeros.

    Args:
        data: Array whose rows are stacked on top of each other.
        **kwargs: Keyword arguments passed to ``data.clip``.

    Returns:
        A new zero-initialised array with the shape of ``data`` holding the
        shifted cumulative sums.
    """
    clipped = data.clip(**kwargs)
    running_total = clipped.cumsum(axis=0)
    offsets = np.zeros(np.shape(data))
    # Row k starts where rows 0..k-1 end; the first row starts at zero.
    offsets[1:] = running_total[:-1]
    return offsets


def get_cumulated_data_stack(data):
    """Return the starting offset of every entry for a stacked bar chart.

    Positive and negative values are accumulated separately: entries of
    ``data`` that are below zero take their offset from the cumulated
    negative part, all other entries from the cumulated positive part.

    Args:
        data: Array whose rows are stacked on top of each other.

    Returns:
        An array with the shape of ``data`` holding the per-entry offsets.
    """
    positive_offsets = get_cumulated_array(data, min=0)
    negative_offsets = get_cumulated_array(data, max=0)
    # Pick the negative offset wherever the value itself is negative.
    return np.where(data < 0, negative_offsets, positive_offsets)


In [None]:
# Stacked bar chart: one bar per year, one coloured segment per quarter.
year = [2014, 2015, 2016, 2017, 2018]
EPS = np.array([6.82, 8.03, 8.51, 6.65, 4.62, 4.06, 5.09, 2.83, 2.79, 4.16, 4.98, 3.21, 4.29, 1.51, 3.26, 6.46, 1.69, 4.75, 4.39, 0])
# Four values per row, i.e. one row per entry of `year`.
EPS = EPS.reshape(-1, 4)

# Prepare to draw stacked bar chart: after transposing, each row is one
# stacked series and each column one bar.
data = EPS.T
data_shape = np.shape(data)

# Take negative and positive data apart and cumulate
data_stack = get_cumulated_data_stack(data)

# Plot stacked bar chart
fig = plt.figure(dpi=150, figsize=(4, 3))
ax = plt.subplot(111)
x_interval = np.arange(data_shape[1])

colors = ["brown", "blue", "green", "orange"]
# One bar container per data row.  Building the list from the data (and
# cycling through the colours) avoids a placeholder list sized for exactly
# four series.
p = [
    ax.bar(
        x_interval, data[i], bottom=data_stack[i], width=0.6,
        align='center', color=colors[i % len(colors)])
    for i in range(data_shape[0])
]

# Define x-axis and y-axis
plt.xticks(x_interval, year, fontsize=9)
plt.yticks(fontsize=9)
plt.ylabel('EPS', fontsize=10)

# Add data labels for each data point: every segment gets its value written
# at the segment's base, centred horizontally.
for container in p:
    for rect in container:
        base = rect.get_y()
        height = rect.get_height()
        # NOTE(review): the sign is flipped when the segment's base is below
        # zero; presumably this targets matplotlib versions that report a
        # positive height for negative bars -- confirm before relying on it
        # with negative data.
        value = -height if base < 0 else height
        plt.text(
            rect.get_x() + rect.get_width() / 2, base, "%.2f" % value,
            ha='center', va='bottom', color='white', fontsize=9)

plt.grid(True, axis='y', linestyle=':')
plt.title('MediaTek (2454)', loc='right', fontsize=10)
plt.tight_layout()
#plt.savefig(filename)
#plt.show()
#plt.close()

#### Bonus: Radar Chart

In [None]:
# Radar chart: five labelled axes joined into a closed, filled polygon.
labels = ('A', 'B', 'C', 'D', 'E')
values = (5, 4, 3, 3, 4)
# One angle per value, evenly spaced over a full turn starting at 0.3*pi.
angles = np.linspace(np.pi*0.3, np.pi*2.3, len(values), endpoint=False)
# Repeat the first point at the end so the outline closes on itself.
values = np.concatenate((values, [values[0]]))
angles = np.concatenate((angles, [angles[0]]))
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, polar=True)
ax.plot(angles, values, 'o-', linewidth=2)
ax.fill(angles, values, alpha=0.25)
# Label only the distinct angles.  The closing point duplicates the first
# one, and including it would give one more tick position than there are
# labels, which matplotlib rejects with a ValueError.
ax.set_thetagrids(angles[:-1] * 180 / np.pi, labels, fontsize=16)
ax.set_ylim(0, 5)
ax.set_yticks(np.arange(0, 5, 1))
ax.yaxis.set_tick_params(labelsize=10)
ax.set_title('Radar Chart', fontsize=16)