In [1]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## Intro To Visualization In Python: Static Plots - 1 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [2]:
# =================================================-
#### Slide 3/32: Loading packages  ####

import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locate the dataset relative to the notebook's working directory and fail
# with an actionable message when it is missing (this cell previously stopped
# with a bare "No such file or directory: 'diabetes.csv'").
csv_path = Path("diabetes.csv")
if not csv_path.is_file():
    raise FileNotFoundError(
        f"Could not find {csv_path.name!r} in {Path.cwd()}; "
        "copy the file next to this notebook or start Jupyter from the folder that contains it."
    )

df = pd.read_csv(csv_path)
# Index the rows by the "id" column and preview the result.
df_subset = df.set_index("id")
df_subset.head()

FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

In [None]:
# =================================================-
#### Slide 4/32: Directory settings  ####

# This is no longer needed after flattening the directory structure
data_dir, plot_dir = (Path.cwd() / folder_name for folder_name in ("data", "plots"))
# Create both folders; a folder that already exists is left as it is.
for folder in (data_dir, plot_dir):
    folder.mkdir(exist_ok=True)

In [None]:
# =================================================-
#### Slide 5/32: Importing matplotlib  ####

import matplotlib.pyplot as plt

In [None]:
# =================================================-
#### Slide 6/32: Dataset for visualization  ####

# Read the CSV into a pandas DataFrame and keep it in a variable so it can be
# manipulated freely.
df = pd.read_csv("diabetes.csv")
# First line printed: the object's type (a Pandas DataFrame!).
# Second line printed: the number of rows.
print(type(df), len(df), sep="\n")

In [None]:
# =================================================-
#### Slide 7/32: Subsetting data  ####

# Move the "id" column into the row index, then preview the first rows.
df_subset = df.set_index("id")
df_subset.head()

In [None]:
# =================================================-
#### Slide 11/32: Prepare data: group and summarize (cont'd)  ####

# Group by the column that has the fewest distinct values.
grouping_col = df_subset.nunique().idxmin()
# Per-group mean of every remaining column ("Pregnancies" is left out).
df_grouped_mean = (
    df_subset
    .drop(columns="Pregnancies")
    .groupby(grouping_col)
    .mean()
)
df_grouped_mean

In [None]:
# =================================================-
#### Slide 12/32: Prepare data: group and summarize (cont'd)  ####

# Reset index of the dataset so the grouping column becomes a regular column.
# The guard keeps the cell idempotent: resetting a second time would insert a
# stray "index" column that later ends up in the melted data.
if df_grouped_mean.index.name == grouping_col:
    df_grouped_mean = df_grouped_mean.reset_index()
df_grouped_mean

In [None]:
# Display the current contents of the grouped-means table.
df_grouped_mean

In [None]:
# =================================================-
#### Slide 15/32: Wide to long format: melt (cont'd)  ####

# Reshape the wide table of group means into long format: one row per
# (group, metric) pair.
df_grouped_mean_long = df_grouped_mean.melt(
    id_vars=grouping_col,  # <- identifying variable
    var_name="metric",  # <- contains col names of wide data
    value_name="mean",  # <- contains values from above columns
)
df_grouped_mean_long

In [None]:
# =================================================-
#### Slide 17/32: Long to wide format: pivot (cont'd)  ####

# Pivot the long data back into wide: one row per group, one column per metric.
df_grouped_mean_wide = pd.pivot(
    df_grouped_mean_long,
    index=grouping_col,  # <- identifying variable
    columns="metric",  # <- col names of wide data
    values="mean",  # <- values from above columns
)
df_grouped_mean_wide

In [None]:
# =================================================-
#### Slide 20/32: Univariate plots: histogram  ####

# Raise the default font size for the plots that follow, then draw a histogram
# of the column with matplotlib's default binning.
plt.rcParams["font.size"] = 15
plt.hist(x=df["DiabetesPedigreeFunction"]);

In [None]:
# =================================================-
#### Slide 21/32: Univariate plots: histogram (cont'd)  ####

# Same histogram through the pandas plotting interface, with 20 bins and
# explicit title and axis labels.
df["DiabetesPedigreeFunction"].plot(
    kind="hist",
    bins=20,
    title="DiabetesPedigreeFunction distribution",
    xlabel="DiabetesPedigreeFunction",
    ylabel="Frequency",
);

In [None]:
# =================================================-
#### Slide 22/32: Univariate plots: boxplot  ####

# Box plot of the column; vert=False lays the box out horizontally.
plt.boxplot(df["DiabetesPedigreeFunction"], vert=False);

In [None]:
# =================================================-
#### Slide 23/32: Univariate plots: boxplot (cont'd)  ####

# Box plot of the same column through the pandas plotting interface.
df["DiabetesPedigreeFunction"].plot(
    kind="box",
    title="DiabetesPedigreeFunction distribution",
);

In [None]:
# =================================================-
#### Slide 25/32: Univariate plots: bar chart - cont'd  ####

# Preview the long-format table of group means that feeds the bar chart.
df_grouped_mean_long.head()

In [None]:
# =================================================-
#### Slide 26/32: Univariate plots: bar chart - cont'd  ####

# Keep the rows where Outcome is 0 and only the metric name / mean value columns.
# NOTE(review): the name says "true" but the filter selects Outcome == 0 - confirm the intended naming.
df_true_means = df_grouped_mean_long.query("Outcome==0").loc[:, ["metric", "mean"]]
df_true_means

In [None]:
# Alternative to .query(): select the Outcome == 0 rows with a boolean mask,
# then discard the Outcome column.
mask = df_grouped_mean_long["Outcome"].eq(0)
df_true_means = df_grouped_mean_long.loc[mask].drop(columns="Outcome")
df_true_means

In [None]:
# =================================================-
#### Slide 27/32: Univariate plots: bar chart - cont'd  ####

# Ingredients of the bar chart: (1) labels, (2) bar lengths, (3) bar positions.
bar_labels, bar_heights = df_true_means["metric"], df_true_means["mean"]  # <- 1, 2
num_bars = bar_heights.shape[0]
bar_positions = np.arange(num_bars)  # <- 3

In [None]:
# =================================================-
#### Slide 28/32: Univariate plots: bar chart - cont'd  ####

# Horizontal bar chart: one bar per metric, bar length = mean value.
plt.figure(figsize=(12, 9))
plt.barh(bar_positions, bar_heights)
plt.yticks(bar_positions, bar_labels)  # <- label each bar with its metric name
# The trailing ";" keeps the Text(...) repr out of the cell output, matching
# the other plotting cells in this notebook.
plt.xlabel("Mean values");


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################

In [None]:
import pandas as pd

# Reload the dataset without the "id" column. This rebinds `df`.
df = pd.read_csv("diabetes.csv").drop(columns="id")

In [None]:
# Average every numeric column and draw the means as sorted horizontal bars.
# numeric_only=True keeps the cell working even if df has gained a
# non-numeric column (for example a column of color names).
ax = df.mean(numeric_only=True).sort_values().plot.barh(
    figsize=(9, 6),
    xlabel="Mean values",
    title="Columns and their means",
)
# Write each bar's value next to it.
for container in ax.containers:
    ax.bar_label(container, fmt=" %.1f")
# Hide the top and right frame lines.
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

In [None]:
# Mean of every remaining numeric column, smallest first.
means = (
    df.drop(["Outcome", "DiabetesPedigreeFunction"], axis=1)
    .mean(numeric_only=True)
    .sort_values()
)
# Units are looked up by column name instead of by position, so a label can no
# longer end up on the wrong bar when the sort order of the means changes.
# NOTE(review): column names other than Pregnancies / SkinThickness / Glucose
# are assumed from the usual layout of this dataset - confirm against the CSV.
units = {
    "Pregnancies": " pregnancies",
    "SkinThickness": " mm",
    "BMI": r" kg/m$^2$",
    "Age": " years",
    "BloodPressure": " mm Hg",
    "Insulin": " pM",
    "Glucose": " mM",
}
# A column without a known unit is labelled with its bare value.
labs = [f" {value:.2f}" + units.get(column, "") for column, value in means.items()]
ax = means.plot.barh(
    title="Average Diabetes Patient Measurements", xlabel="Mean values"
)
for container in ax.containers:
    ax.bar_label(container, labels=labs)
# Hide the top and right frame lines.
for side in ("top", "right"):
    ax.spines[side].set_visible(False)

In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## Intro To Visualization In Python: Static Plots - 2 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 13/32: Bivariate plots: scatterplot  ####

# Scatterplot of Glucose (y) against DiabetesPedigreeFunction (x) with default styling.
plt.scatter(df["DiabetesPedigreeFunction"], df["Glucose"]);

In [None]:
# Same scatterplot through the pandas plotting interface: diamond markers,
# a single orange color, and a title.
df.plot(
    kind="scatter",
    x="DiabetesPedigreeFunction",
    y="Glucose",
    marker="D",
    title="Glucose versus DiabetesPedigreeFunction",
    c="darkorange",
);

In [None]:
# =================================================-
#### Slide 14/32: Bivariate plots: scatterplot - cont'd  ####

# Scatterplot with diamond markers, then axis labels and title set in one call.
plt.scatter(
    x=df_subset["DiabetesPedigreeFunction"],
    y=df_subset["Glucose"],
    marker="D",  # <- set marker type to diamond
)
plt.gca().set(
    xlabel="DiabetesPedigreeFunction",
    ylabel="Glucose",
    title="DiabetesPedigreeFunction Glucose distribution",
);

In [None]:
# =================================================-
#### Slide 17/32: Customize colors - cont'd  ####

# Histogram with a custom fill color, fully labelled.
plt.hist(
    x=df_subset["DiabetesPedigreeFunction"],
    facecolor="goldenrod",  # <- set color
)
plt.title("DiabetesPedigreeFunction distribution")
plt.ylabel("Frequency")
plt.xlabel("DiabetesPedigreeFunction")
plt.show()

In [None]:
# =================================================-
#### Slide 18/32: Customize colors - cont'd  ####

# Horizontal bar chart of the metric means with a custom bar color.
plt.barh(y=bar_positions, width=bar_heights, color="orchid")
plt.yticks(bar_positions, bar_labels)  # <- one tick label per bar
plt.title("Column Means")
plt.xlabel("Mean values");

In [None]:
# =================================================-
#### Slide 19/32: Customize color: scatterplot  ####

# Scatterplot with every marker drawn in one custom color.
plt.scatter(
    x=df_subset["DiabetesPedigreeFunction"],
    y=df_subset["Glucose"],
    c="darkorange",  # <- set marker color
)
plt.title("DiabetesPedigreeFunction Glucose distribution")
plt.ylabel("Glucose")
plt.xlabel("DiabetesPedigreeFunction")
plt.show()

In [None]:
# Draw one scatter layer per Outcome value so each group gets its own color
# and its own legend entry. Nothing is written back to df: a string-valued
# helper column would break numeric summaries such as df.mean(), and a single
# legend label would be attached to every point regardless of its Outcome.
outcome_colors = {0: "darkseagreen", 1: "palevioletred"}
ax = None  # <- the first layer creates the axes, later layers reuse them
for outcome, shade in outcome_colors.items():
    ax = df[df["Outcome"] == outcome].plot.scatter(
        x="DiabetesPedigreeFunction",
        y="Glucose",
        marker="D",
        title="Glucose versus DiabetesPedigreeFunction",
        color=shade,
        alpha=.6,
        label=f"Outcome = {outcome}",
        ax=ax,
    )

In [None]:
# =================================================-
#### Slide 21/32: Customize color: map colors - cont'd  ####

# Translate each Outcome value into a color name (one entry per row), then
# pass that per-point color sequence to the scatterplot.
outcome_to_color = {0: "darkseagreen", 1: "palevioletred"}
color = df_subset["Outcome"].map(outcome_to_color)
plt.scatter(
    x=df_subset["DiabetesPedigreeFunction"],
    y=df_subset["Glucose"],
    c=color,
)
plt.title("DiabetesPedigreeFunction Glucose distribution")
plt.ylabel("Glucose")
plt.xlabel("DiabetesPedigreeFunction")
plt.show()

In [None]:
# =================================================-
#### Slide 23/32: Customize color: opacity (cont'd)  ####

# Same per-point colors, drawn semi-transparently so overlapping points stay visible.
plt.scatter(
    x=df_subset["DiabetesPedigreeFunction"],
    y=df_subset["Glucose"],
    c=color,
    alpha=0.3,  # <- opacity of each marker
)
plt.ylabel("Glucose")
plt.xlabel("DiabetesPedigreeFunction")
plt.show()

In [None]:
# =================================================-
#### Slide 24/32: Customize plot settings: available styles  ####

# Print the names of the plotting styles this matplotlib installation offers.
print(plt.style.available)
# The style switch below is left disabled; un-commenting it would change the
# global style for later plots as well.
# plt.style.use("ggplot")

In [None]:
# =================================================-
#### Slide 25/32: Customize plot settings: test ggplot style  ####

# Draw the scatterplot with the "ggplot" style actually applied. The style is
# scoped to this block with a context manager, so the demonstration works
# without changing the look of the plots in later cells.
with plt.style.context("ggplot"):
    plt.scatter(
        df_subset["DiabetesPedigreeFunction"], df_subset["Glucose"], c=color, alpha=0.3
    )
    plt.xlabel("DiabetesPedigreeFunction")
    plt.ylabel("Glucose")
    plt.title("DiabetesPedigreeFunction Glucose distribution")
    plt.show()

In [None]:
# =================================================  -
#### Slide 27/32: Customize plot settings: labels  ####

# Change the global defaults for axis-label size, axis-label color and title
# size before creating the plot.
plt.rcParams.update(
    {
        "axes.labelsize": 20,
        "axes.labelcolor": "red",
        "axes.titlesize": 25,
    }
)
plt.scatter(
    x=df_subset["DiabetesPedigreeFunction"],
    y=df_subset["Glucose"],
    c=color,
    alpha=0.3,
)
plt.xlabel("DiabetesPedigreeFunction")
plt.ylabel("Glucose")
plt.title("DiabetesPedigreeFunction Glucose distribution")
plt.show()

In [None]:
# ================================================= -
#### Slide 28/32: Customize plot settings: reset defaults  ####

# Restore matplotlib's default settings, then redraw the same scatterplot to
# show the effect.
plt.rcdefaults()
plt.scatter(
    x=df_subset["DiabetesPedigreeFunction"],
    y=df_subset["Glucose"],
    c=color,
    alpha=0.3,
)
plt.title("DiabetesPedigreeFunction Glucose distribution")
plt.ylabel("Glucose")
plt.xlabel("DiabetesPedigreeFunction")
plt.show()


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################

In [None]:
#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## Intro To Visualization In Python: Static Plots - 3 ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs

In [None]:
# =================================================-
#### Slide 4/29: Complex univariate plots: violin plots  ####

# Violin plot of the column, marking the median but not the mean.
plt.violinplot(df_subset["DiabetesPedigreeFunction"], showmeans=False, showmedians=True)
plt.show()

In [None]:
# =================================================-
#### Slide 6/29: Univariate plots: violin plot (cont'd)  ####

# Horizontal violin plot (vert=False) with the median marked, plus an axis
# label and a title.
plt.violinplot(
    dataset=df_subset["DiabetesPedigreeFunction"],
    vert=False,
    showmeans=False,
    showmedians=True,
)
plt.title("DiabetesPedigreeFunction distribution")
plt.xlabel("DiabetesPedigreeFunction")
plt.show()

In [None]:
# =================================================-
#### Slide 8/29: Compound visualizations: grids  ####

# Create a 2 x 2 figure and axes grid.
# `fig` is the figure; `axes` holds the four panels, addressed as axes[row, col].
fig, axes = plt.subplots(2, 2)
plt.show()

In [None]:
# =================================================-
#### Slide 9/29: Compound visualizations: axes  ####

# Show what `axes` contains.
print(axes)

In [None]:
# =================================================-
#### Slide 10/29: Compound visualizations: axes (cont'd)  ####

# Set the font size first so it applies to the axes created below.
plt.rcParams.update({"font.size": 14})
# Create the 2 x 2 grid directly at the intended size. The previous
# plt.clf() / plt.figure(figsize=...) pair only produced an extra, empty
# figure, and the grid made by plt.subplots() never received that figsize.
fig, axes = plt.subplots(2, 2, figsize=(8, 8))

# Top-left panel: histogram.
axes[0, 0].hist(
    df_subset["DiabetesPedigreeFunction"], facecolor="goldenrod"
)  # <- set color

In [None]:
# =================================================-
#### Slide 11/29: Compound visualizations: axes (cont'd)  ####

# Build the 2 x 2 grid at the intended size in one call. A separate
# plt.figure(figsize=...) creates its own empty figure and has no effect on
# the figure returned by plt.subplots().
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
# Marker color for each Outcome value.
color_dict = {0: "darkseagreen", 1: "palevioletred"}
color = df_subset["Outcome"].map(color_dict)
# Top-left: histogram.
axes[0, 0].hist(
    df_subset["DiabetesPedigreeFunction"], facecolor="goldenrod"
)  # <- set color
# Top-right: boxplot.
axes[0, 1].boxplot(df_subset["DiabetesPedigreeFunction"])
# Bottom-left: scatterplot colored by Outcome.
axes[1, 0].scatter(
    df_subset["DiabetesPedigreeFunction"], df_subset["Glucose"], c=color, alpha=0.3
)
# Bottom-right: bar chart of the metric means.
axes[1, 1].bar(bar_positions, bar_heights, color="salmon")
plt.tight_layout()
plt.show()

In [None]:
# =================================================-
#### Slide 12/29: Compound visualizations: labeling axes  ####

# Label the panels of the 2 x 2 grid currently held in `axes`.

# Histogram.
axes[0, 0].set_ylabel("DiabetesPedigreeFunction distribution")
axes[0, 0].set_xlabel("DiabetesPedigreeFunction")

# Boxplot.
axes[0, 1].set_ylabel("DiabetesPedigreeFunction")

# Scatterplot.
axes[1, 0].set_xlabel("DiabetesPedigreeFunction")
axes[1, 0].set_ylabel("Glucose")

# Bar chart of the mean value of each metric.
axes[1, 1].set_ylabel("Mean values")

In [None]:
# =================================================-
#### Slide 13/29: Compound visualizations: labeling ticks   ####

# Adjust the x-axis ticks of two panels of the grid currently held in `axes`.

# No labels for ticks for boxplot.
axes[0, 1].xaxis.set_ticklabels([""])
# Tick positions set to bar positions in bar chart.
axes[1, 1].xaxis.set_ticks(bar_positions)

# Tick labels set to bar categories in bar chart (rotated so they do not overlap).
axes[1, 1].xaxis.set_ticklabels(bar_labels, rotation=18)

In [None]:
# =================================================-
#### Slide 14/29: Compound visualizations: figure adjustments  ####

# Figure-level adjustments for the grid held in `fig`.
# NOTE(review): "axes.labelsize" is set after this figure's axis labels were
# created - confirm whether it affects them or only figures created later.
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["figure.titlesize"] = 25
# Resize the existing figure and give it an overall title.
fig.set_size_inches(18, 7.5)
fig.suptitle("Data Summary")

In [None]:
# =================================================-
#### Slide 15/29: Compound visualizations: putting it all together  ####

# Text-size settings come first: axis-label sizes are taken from rcParams
# when the axes are created, so setting them after plt.subplots() would not
# reach this figure's labels.
plt.rcParams.update({"font.size": 14})
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["figure.titlesize"] = 25

# Create the grid at its final size in one call. The previous
# plt.clf() / plt.figure(figsize=(8, 8)) pair only produced an extra empty
# figure; the real figure was resized to 18 x 7.5 inches afterwards.
fig, axes = plt.subplots(2, 2, figsize=(18, 7.5))

# Marker color for each Outcome value.
color_dict = {0: "darkseagreen", 1: "palevioletred"}
color = df_subset["Outcome"].map(color_dict)
axes[0, 0].hist(df_subset["SkinThickness"], facecolor="goldenrod")  # <- set color
axes[0, 1].boxplot(df_subset["SkinThickness"])
axes[1, 0].scatter(df_subset["SkinThickness"], df_subset["Glucose"], c=color, alpha=0.3)
axes[1, 1].bar(bar_positions, bar_heights, color="salmon")

# Histogram.
axes[0, 0].set_ylabel("SkinThickness distribution")
axes[0, 0].set_xlabel("SkinThickness")

# Boxplot.
axes[0, 1].set_ylabel("SkinThickness")

# Scatterplot.
axes[1, 0].set_xlabel("SkinThickness")
axes[1, 0].set_ylabel("Glucose")

# Bar chart of the mean value of each metric.
axes[1, 1].set_ylabel("Mean values")

# No labels for ticks for boxplot.
axes[0, 1].xaxis.set_ticklabels([""])

# Tick positions set to bar positions in bar chart.
axes[1, 1].xaxis.set_ticks(bar_positions)

# Tick labels set to bar categories in bar chart.
axes[1, 1].xaxis.set_ticklabels(bar_labels, rotation=18)

fig.suptitle("Data Summary")
plt.tight_layout()
plt.show()

In [None]:
# Text-size settings come first so they apply to the axes created below
# (axis-label sizes are taken from rcParams when the axes are created).
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["figure.titlesize"] = 25

# Create the 2 x 2 grid directly at its final size.
fig, axes = plt.subplots(2, 2, figsize=(18, 7.5))

# Per-row marker color derived from Outcome. Every panel reads from `df`, so
# the colors and the plotted points share one index.
color = df["Outcome"].map(
    {
        0: "darkseagreen",
        1: "palevioletred",
    }
)
axes[0, 0].hist(df["SkinThickness"], color="goldenrod")
axes[0, 1].boxplot(df["SkinThickness"], vert=False)
axes[1, 0].scatter(df["SkinThickness"], df["Glucose"], c=color, alpha=0.3)
axes[1, 1].barh(bar_positions, bar_heights, color="salmon")

# Histogram.
axes[0, 0].set_title("SkinThickness distribution")
axes[0, 0].set_ylabel("Frequency")
axes[0, 0].set_xlabel("SkinThickness")

# Boxplot.
axes[0, 1].set_title("SkinThickness distribution")
axes[0, 1].set_xlabel("SkinThickness")

# Scatterplot.
axes[1, 0].set_xlabel("SkinThickness")
axes[1, 0].set_ylabel("Glucose")
axes[1, 0].set_title("Glucose versus Skin Thickness")

# Bar chart of the mean value of each metric.
axes[1, 1].set_xlabel("Mean values")
axes[1, 1].set_title("Mean values for all variables")

# No labels for ticks for boxplot.
axes[0, 1].yaxis.set_ticklabels([""])

# Tick positions set to bar positions in bar chart.
axes[1, 1].yaxis.set_ticks(bar_positions)

# Tick labels set to bar categories in bar chart.
axes[1, 1].yaxis.set_ticklabels(bar_labels)

fig.suptitle("Data Summary")
plt.tight_layout()

In [None]:
# =================================================-
#### Slide 18/29: Compound visualizations: layered plots (cont'd)  ####

# Create a new figure and axes objects for plotting. No plt.clf() beforehand:
# with no current figure it would just create an extra, empty one.
fig, axes = plt.subplots()

# The distinct values of the grouping column, in order of appearance.
grouping_col_levels = list(df_grouped_mean_long[grouping_col].unique())
grouping_category_1 = grouping_col_levels[0]
grouping_category_2 = grouping_col_levels[1]

# One scatter layer per Outcome value, each with its own color and legend label.
for key, value in color_dict.items():
    query = f"Outcome=={key}"
    group_rows = df_subset.query(query)  # <- filter once, reuse for both axes
    sc_col_1 = group_rows["DiabetesPedigreeFunction"]
    sc_col_2 = group_rows["Glucose"]

    Flag = "Category 1" if key == int(grouping_category_1) else "Category 2"

    axes.scatter(sc_col_1, sc_col_2, c=value, label=Flag, alpha=0.3)

# Name the axes so the figure can be read on its own.
axes.set_xlabel("DiabetesPedigreeFunction")
axes.set_ylabel("Glucose")
axes.legend()  # <- add a legend that would automatically get labels and colors from layers!

plt.show()

In [None]:
# Map each Outcome value to a short hex color string; this rebinds `color`.
color = df["Outcome"].map({0: "#ff08", 1: "#f0f8"})
# Scatterplot through pandas with per-point colors, a fixed marker size and
# an outline around each marker.
# NOTE(review): the single label "Diabetic" is attached to the whole scatter,
# i.e. to every point regardless of its Outcome - confirm this is intended.
df.plot.scatter(
    x="DiabetesPedigreeFunction",
    y="Glucose",
    title="Glucose versus DiabetesPedigreeFunction",
    c=color,
    s=32,
    edgecolors="#0008",
    label="Diabetic",
);

In [None]:
# =================================================-
#### Slide 20/29: Compound visualizations: layered plots (cont'd)  ####

# The mean data for `'Outcome'` = `'0'` already exists; show its first rows.
print(df_true_means.head())
# Select the `'Outcome'` = `'1'` rows of the long table, keeping only the
# metric name and mean value columns.
query = "Outcome==1"
df_false_means = df_grouped_mean_long.query(query).loc[:, ["metric", "mean"]]
print(df_false_means)

In [None]:
# =================================================-
#### Slide 21/29: Compound visualizations: layered plots (cont'd)  ####

# Bar heights for the two groups: the `'Outcome'` = `'0'` means and the
# `'Outcome'` = `'1'` means, in that order.
category_1_bar_heights, category_2_bar_heights = (
    df_true_means["mean"],
    df_false_means["mean"],
)
# Labels of bars, their positions, and their width are shared by both groups.
bar_labels = df_false_means["metric"]
num_bars = bar_labels.shape[0]
bar_positions = np.arange(num_bars)
width = 0.35

In [None]:
# =================================================-
#### Slide 22/29: Compound visualizations: layered plots (cont'd)  ####

# Create the figure and axes objects. No plt.clf() beforehand: with no
# current figure it would just create an extra, empty one.
fig, axes = plt.subplots()
# First group of bars, drawn at the base positions.
category_1_bar_chart = axes.bar(
    bar_positions,  # <- set bar positions
    category_1_bar_heights,  # <- set bar heights
    width,  # <- set width of the bars
    color=color_dict[0],  # <- set color corresponding to '0' in dictionary
)
# Second group of bars, shifted right by one bar width so the pairs sit side by side.
category_2_bar_chart = axes.bar(
    bar_positions + width,  # <- set bar positions
    category_2_bar_heights,  # <- set bar heights
    width,  # <- set width of the bars
    color=color_dict[1],  # <- set color corresponding to '1' in dictionary
)

In [None]:
# =================================================-
#### Slide 23/29: Compound visualizations: layered plots (cont'd)  ####

# Add text for labels, title and axes ticks to the axes currently held in `axes`.
axes.set_ylabel("Mean values")
axes.set_title("Data metrics summary")
# Put each tick midway between the two bars of a pair.
axes.set_xticks(bar_positions + width / 2)
axes.set_xticklabels(bar_labels)

In [None]:
# =================================================-
#### Slide 24/29: Compound visualizations: layered plots (cont'd)  ####

# Create the figure and axes objects directly at the final size. No
# plt.clf() beforehand: with no current figure it would just create an
# extra, empty one.
fig, axes = plt.subplots(figsize=(12, 4))

# First group of bars, drawn at the base positions.
category_1_bar_chart = axes.bar(
    bar_positions,  # <- set bar positions
    category_1_bar_heights,  # <- set bar heights
    width,  # <- set width of the bars
    color=color_dict[0],  # <- set color corresponding to '0' in dictionary
)
# Second group of bars, shifted right by one bar width so the pairs sit side by side.
category_2_bar_chart = axes.bar(
    bar_positions + width,  # <- set bar positions
    category_2_bar_heights,  # <- set bar heights
    width,  # <- set width of the bars
    color=color_dict[1],  # <- set color corresponding to '1' in dictionary
)

# Add text for labels, title and axes ticks.
axes.set_ylabel("Mean values")
axes.set_title("Data metrics summary")
axes.set_xticks(bar_positions + width / 2)  # <- tick midway between the paired bars
axes.set_xticklabels(bar_labels)

# Add a legend for each chart and corresponding labels.
axes.legend(
    (category_1_bar_chart, category_2_bar_chart),
    (f"{grouping_category_1}", f"{grouping_category_2}"),
)
plt.show()

In [None]:
# The grouped bar chart as a single pandas chain: mean of every column per
# Outcome group, transposed so each column becomes a pair of bars, ordered by
# the Outcome 0 mean.
(
    pd.read_csv("diabetes.csv")
    .drop(columns="id")
    .groupby("Outcome")
    .mean()
    .T
    .sort_values(0)
    .plot.barh()
);

In [None]:
# =================================================-
#### Slide 28/29: Exercise  ####


#######################################################
####  CONGRATULATIONS ON COMPLETING THIS MODULE!   ####
#######################################################