# Visualization

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import IFrame

from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [None]:
IFrame("https://observablehq.com/embed/@jrus/munsell-spin?cells=spinning_picture", width="75%", height="597")

In [None]:
IFrame("https://observablehq.com/embed/@pierreleripoll/how-to-visualize-periodic-signals?cells=teaserPeriodicViz", width="100%", height="584")

In [None]:
sns.get_dataset_names()

In [None]:
penguins = sns.load_dataset('penguins')
penguins.head()

In [None]:
anscombe = sns.load_dataset('anscombe')
anscombe.head()

In [None]:
# Load seaborn's 'exercise' example dataset and use its 'Unnamed: 0' column as
# the index (presumably a leftover row-number column in the CSV -- confirm).
exercise = sns.load_dataset('exercise').set_index('Unnamed: 0')
# Preview the first rows.
exercise.head()

In [None]:
diamonds = sns.load_dataset('diamonds')
diamonds.head()

In [None]:
mpg = sns.load_dataset('mpg')
mpg.head()

## Relational

In [None]:
sns.relplot(data=mpg, x='model_year', y='mpg')

In [None]:
sns.relplot(data=mpg, x='model_year', y='mpg', kind='line')

In [None]:
sns.relplot(data=mpg, x='model_year', y='mpg', hue='cylinders')

In [None]:
sns.relplot(data=mpg, x='model_year', y='mpg')
sns.relplot(data=mpg, x='model_year', y='mpg', kind='line')

In [None]:
# Overlay a scatter layer and a line layer of mpg vs. model_year on one axes.
fig, ax = plt.subplots(figsize=(12, 8))

# Pass the axes explicitly so both layers are drawn on this figure instead of
# on whatever matplotlib currently treats as the active axes.
sns.scatterplot(data=mpg, x='model_year', y='mpg', ax=ax)
sns.lineplot(data=mpg, x='model_year', y='mpg', ax=ax)

plt.show()

In [None]:
# Two side-by-side panels against model_year: mpg on the left, weight on the
# right (the right-hand line uses entry 9 of the current seaborn palette).
fig, axes = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(12, 8)
left_ax, right_ax = axes

accent = sns.color_palette()[9]
sns.lineplot(data=mpg, x='model_year', y='mpg', ax=left_ax)
sns.lineplot(data=mpg, x='model_year', y='weight', ax=right_ax, color=accent)

plt.show()

## Distribution

In [None]:
sns.pairplot(diamonds)

In [None]:
sns.pairplot(penguins, hue='species')

In [None]:
sns.pairplot(penguins, hue='species', kind='kde')

In [None]:
sns.displot(data=diamonds, x='price', col='cut', col_wrap=3)

In [None]:
sns.displot(data=diamonds, x='price', col='cut', col_wrap=3, hue='clarity', kind='kde')

## Categorical

In [None]:
exercise.head()

In [None]:
sns.catplot(data=exercise, x='time', y='pulse')

In [None]:
sns.catplot(data=exercise, x='time', y='pulse', hue='kind')

In [None]:
sns.catplot(data=exercise, x='time', y='pulse', hue='kind', dodge=True)

In [None]:
sns.relplot(data=exercise, x='time', y='pulse')

In [None]:
sns.relplot(data=exercise, x='time', y='pulse', kind='line', hue='kind')

In [None]:
mpg.head()

In [None]:
sns.catplot(data=mpg, x='cylinders', y='mpg')

In [None]:
sns.catplot(data=mpg, x='cylinders', y='mpg', col='origin')

In [None]:
sns.catplot(data=mpg, x='cylinders', y='mpg', col='origin', kind='swarm')

In [None]:
sns.catplot(data=mpg, x='cylinders', y='mpg', col='origin', kind='swarm', s=2)

In [None]:
def filter_cylinders(cyl_count):
    """Return True when ``cyl_count`` is 4, 6 or 8, and False otherwise."""
    wanted = (4, 6, 8)
    return cyl_count in wanted

In [None]:
# Keep only the rows whose cylinder count passes filter_cylinders, then draw
# one swarm panel per origin.
plot_df = mpg.loc[mpg['cylinders'].apply(filter_cylinders)]

sns.catplot(
    data=plot_df,
    x='cylinders',
    y='mpg',
    col='origin',
    kind='swarm',
    s=4,
)

In [None]:
# Same filtered rows as the swarm cell, shown as box plots per origin.
plot_df = mpg.loc[mpg['cylinders'].apply(filter_cylinders)]

sns.catplot(
    data=plot_df,
    x='cylinders',
    y='mpg',
    col='origin',
    kind='box',
)

In [None]:
# Same filtered rows, shown as boxen plots per origin.
plot_df = mpg.loc[mpg['cylinders'].apply(filter_cylinders)]

sns.catplot(
    data=plot_df,
    x='cylinders',
    y='mpg',
    col='origin',
    kind='boxen',
)

In [None]:
# Same filtered rows, shown as violin plots per origin.
plot_df = mpg.loc[mpg['cylinders'].apply(filter_cylinders)]

sns.catplot(
    data=plot_df,
    x='cylinders',
    y='mpg',
    col='origin',
    kind='violin',
)

In [None]:
# Hand-built equivalent of a faceted plot: one panel per origin, each with a
# strip plot and a translucent box plot drawn on the same axes.
plot_df = mpg.loc[mpg['cylinders'].apply(filter_cylinders)]

fig, axes = plt.subplots(1, 3, sharex=True, sharey=True)

# Panels follow the reverse of the order in which origins first appear.
origins = plot_df['origin'].unique()[::-1]
faded_box = {'alpha': 0.3}

for i, origin in enumerate(origins):
    ax = axes[i]
    ax.set_title(origin)
    local_df = plot_df.loc[plot_df['origin'] == origin]
    sns.stripplot(data=local_df, x='cylinders', y='mpg', ax=ax)
    sns.boxplot(data=local_df, x='cylinders', y='mpg', ax=ax, boxprops=faded_box)

fig.set_size_inches(14, 8)
plt.show()
    

## Regression

In [None]:
sns.lmplot(data=penguins, x='bill_length_mm', y='bill_depth_mm', hue='species')

In [None]:
# One-hot encode species into indicator columns and drop incomplete rows.
# dtype=float keeps the indicators numeric: recent pandas releases emit boolean
# dummies by default, and a float/bool mix is handed to statsmodels as an
# object array, which it refuses to fit.
species_dummies = pd.get_dummies(penguins['species'], dtype=float)
penguins_1h = penguins.join(species_dummies).dropna()
penguins_1h.head()

In [None]:
# Ordinary least squares: bill depth on bill length plus the Chinstrap and
# Gentoo indicator columns, with an added constant term.
predictors = penguins_1h[['bill_length_mm', 'Chinstrap', 'Gentoo']]
design = sm.add_constant(predictors)
md = sm.OLS(penguins_1h['bill_depth_mm'], design)
mdf = md.fit()
mdf.summary()

In [None]:
# Mixed linear model of bill depth on bill length, with species as the
# grouping variable, fitted on the rows that have no missing values.
fit_df = penguins.dropna()
species_groups = fit_df['species']

md2 = smf.mixedlm("bill_depth_mm ~ bill_length_mm", fit_df, groups=species_groups)
mdf2 = md2.fit()
mdf2.summary()

In [None]:
# Fit a separate simple regression of bill depth on bill length for each
# species and print its slope together with the slope's standard error.
species_names = penguins['species'].unique()
for species in species_names:
    is_species = penguins['species'] == species
    fit_df = penguins[is_species].dropna()
    ols = stats.linregress(fit_df['bill_length_mm'], fit_df['bill_depth_mm'])
    print("Slope for", species, ":", ols.slope, "(standard error:", ols.stderr, ")")

### Doing it wrong: Islands as a facet

In [None]:
sns.lmplot(data=penguins, x='bill_length_mm', y='bill_depth_mm', hue='island')

In [None]:
sns.lmplot(data=penguins, x='bill_length_mm', y='bill_depth_mm', col='island', hue='island')

In [None]:
sns.lmplot(data=penguins, x='bill_length_mm', y='bill_depth_mm', col='island', hue='species')

In [None]:
# Scatter bill length against bill depth in a grid with one column per species
# and one row per sex, using only the penguins whose sex is recorded.
plot_df = penguins.dropna(subset=['sex'], how='any')

colors = sns.color_palette()[:3]
columns = plot_df['species'].unique()
rows = plot_df['sex'].unique()

# Build the grid from plot_df -- the same filtered frame the facet orders were
# derived from -- rather than from the unfiltered penguins frame.
g = sns.FacetGrid(
    plot_df,
    col="species",
    row="sex",
    hue='species',
    palette=colors,
    col_order=columns,
    row_order=rows,
    hue_order=columns,
)

g.map_dataframe(sns.scatterplot, x='bill_length_mm', y='bill_depth_mm')

g.fig.set_size_inches(12,8)

plt.show()

In [None]:
anscombe.head()

In [None]:
sns.lmplot(data=anscombe, x='x', y='y', col='dataset', col_wrap=2, ci=None)

In [None]:
sns.lmplot(data=anscombe, x='x', y='y', col='dataset', col_wrap=2)

In [None]:
sns.lmplot(data=anscombe, x='x', y='y', col='dataset', col_wrap=2, robust=True)

In [None]:
sns.lmplot(data=anscombe, x='x', y='y', col='dataset', col_wrap=2, lowess=True)

## Custom

In [None]:
# Download three CSV files from the nytimes/covid-19-data GitHub repository
# (us, us-states, us-counties-recent); `?raw=true` requests the raw file and
# the 'date' column is parsed into datetimes. Requires network access.
us = pd.read_csv("https://github.com/nytimes/covid-19-data/blob/master/us.csv?raw=true", parse_dates=['date'])
us_states = pd.read_csv("https://github.com/nytimes/covid-19-data/blob/master/us-states.csv?raw=true", parse_dates=['date'])
us_counties = pd.read_csv("https://github.com/nytimes/covid-19-data/blob/master/us-counties-recent.csv?raw=true", parse_dates=['date'])

In [None]:
# Day-over-day change of every column in `us`, indexed by date
# (the source series is presumably cumulative -- confirm).
cumulative = us.set_index('date')
us_diff = cumulative.diff()
# diff() leaves the first row as NaN because there is no previous row; seed it
# with the first day's own values so the series starts from them.
us_diff.iloc[0] = cumulative.iloc[0]
us_diff

In [None]:
# Move 'date' out of the index into a regular column and plot from that frame,
# so x='date' refers to an actual column of the data being plotted.
plot_df = us_diff.reset_index()
sns.lineplot(data=plot_df, x='date', y='cases')

In [None]:
# 14-day rolling mean of the daily 'cases' column.
# Only 'cases' is smoothed: other cells add helper columns (including a string
# one) to us_diff in place, and averaging those can fail if this cell is re-run.
smoothed = us_diff[['cases']].rolling('14d').mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=smoothed, x='date', y='cases', ax=ax)
plt.show()

In [None]:
# Tag each row with its weekday number (Monday=0 ... Sunday=6), read directly
# from the date index instead of calling a Python lambda once per row.
us_diff['weekday'] = us_diff.index.weekday

# One line per weekday number, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(data=us_diff.reset_index(), x='date', y='cases', hue='weekday', ax=ax)
plt.show()

In [None]:
def convert_weekday(number):
    """Return the English weekday name at position ``number`` (0 is Monday, 6 is Sunday)."""
    names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
    return names[number]

In [None]:
# Translate the weekday numbers into names and colour the daily-cases lines
# by weekday name.
weekday_names = us_diff['weekday'].apply(convert_weekday)
us_diff['weekday_str'] = weekday_names

fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

sns.lineplot(data=us_diff, x='date', y='cases', hue='weekday_str')

plt.show()

In [None]:
# Same plot as before, with the hue levels pinned to Monday-to-Sunday order.
weekday_order = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

sns.lineplot(
    data=us_diff,
    x='date',
    y='cases',
    hue='weekday_str',
    hue_order=weekday_order,
)

plt.show()

### Extras

In [None]:
# from sklearn.decomposition import PCA

In [None]:
# penguins

In [None]:
# augmented = penguins.dropna().copy()

# pengs_dim_reduced = PCA(n_components=2).fit_transform(augmented.iloc[:, 2:6])
# augmented['x'] = pengs_dim_reduced[:, 0]
# augmented['y'] = pengs_dim_reduced[:, 1]
# sns.relplot(data=augmented, x='x', y='y', hue='species')