# Plotting in Pandas

Sometimes there's no substitute for a good plot

In [None]:
!pip install scikit-learn matplotlib

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
from sklearn import datasets

## Scatter plots

In [None]:
# Load the iris dataset that ships with scikit-learn.
d = datasets.load_iris()

In [None]:
# Show which fields the loaded dataset object exposes.
d.keys()

In [None]:
# Names of the target classes.
d.target_names

In [None]:
# Names of the feature columns.
d.feature_names

In [None]:
# Build a DataFrame of the measurements and append the integer class code as 'Species'.
iris = pd.DataFrame(d.data, columns=d.feature_names).assign(Species=d.target)
iris.head()

In [None]:
# Colour each point by its class code; the trailing ';' hides the matplotlib repr.
iris.plot.scatter(
    x='sepal length (cm)',
    y='sepal width (cm)',
    c='Species',
    cmap='viridis',
);

The colorbar is ugly. Maybe we can fix this by doing multiple plots...

In [None]:
# One separate figure per class: each plot.scatter call without ax= creates its own axes.
for species_id, group in iris.groupby('Species'):
    group.plot.scatter(
        x='sepal length (cm)',
        y='sepal width (cm)',
        label=f'Species {species_id}',
    )


To plot all these on the *same* axis, we need to create an axis object and pass it to the plot function:

In [None]:
# Keep the class names in their own variable so plots can label each group by name.
target_names = d.target_names
target_names

In [None]:
import matplotlib.pyplot as plt

# A single shared Axes: every group is drawn onto it via ax=...
ax = plt.axes()
colors = 'rgb'

for species_id, group in iris.groupby('Species'):
    group.plot.scatter(
        x='sepal length (cm)',
        y='sepal width (cm)',
        s=20 * group['petal length (cm)'],  # marker size scales with petal length
        c=colors[species_id],               # one colour letter per class code
        label=target_names[species_id],
        alpha=0.5,
        ax=ax,
    )

## Box plots

To get a feel for the distribution of a set of features, we can do box plots.

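These show the median, Q1 and Q3 (the box), and whiskers that reach to the most extreme data points within (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR); points beyond the whiskers are drawn individually as outliers.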

In [None]:
# Box plot of the first four columns (the measurements, i.e. everything except 'Species').
iris.iloc[:, :4].plot.box();

## Line plots

Line plots are nice for showing time series data

In [None]:
# Load the sales data, parsing the 'date' column as datetimes.
# NOTE(review): if this is the raw Kaggle export with dd.mm.yyyy dates, parsing
# needs dayfirst=True (or an explicit format) — confirm against the file.
sales = pd.read_csv('./data/kaggle-sales/sales_train.csv.gz', parse_dates=['date'])
sales.head()

In [None]:
# Revenue per row is item_cnt_day × item_price; then total it per date.
sales['revenue'] = sales['item_cnt_day'] * sales['item_price']
daily_sales = sales.groupby('date')['revenue'].sum()
daily_sales.head()

In [None]:
# Total revenue per date as a line plot.
daily_sales.plot();

In [None]:
# Total revenue grouped by day of week (pandas numbers weekdays 0 = Monday … 6 = Sunday).
weekday = daily_sales.index.weekday
daily_sales.groupby(weekday).sum().plot.bar();

In [None]:
# Mean daily revenue per calendar quarter.
# NOTE(review): pandas >= 2.2 deprecates the 'Q' alias in favour of 'QE'; switch
# once older pandas versions no longer need to be supported.
daily_sales.resample('1Q').mean().plot();

In [None]:
# Last few rows of the daily revenue series.
daily_sales.tail()

In [None]:
# Load closing prices; the first column is parsed as dates and used as the index.
stocks = pd.read_csv('./data/closing-prices.csv', parse_dates=[0], index_col=0)
stocks.head()


In [None]:
# One line per column, all on a shared linear y-axis.
stocks.plot();

In [None]:
# Same plot with a logarithmic y-axis.
stocks.plot(logy=True);

## Histograms 

We can use a histogram to count how many days fell into a particular range:

In [None]:
# Reminder of what the daily revenue series looks like before binning it.
daily_sales.head()

In [None]:
# Histogram of the daily revenue totals (default binning).
daily_sales.plot.hist();

In [None]:
# Mean price of each item across all of its sales rows.
item_price = sales.groupby('item_id')['item_price'].mean()
item_price.head()

In [None]:
# Histogram of the per-item mean prices.
item_price.hist();

In [None]:
# Keep only items whose mean price is below 10,000, use finer bins, and
# normalise the bar heights to a density.
price_below_10k = item_price.loc[item_price < 10_000]
price_below_10k.hist(bins=50, density=True);

## Using Seaborn to get nicer statistical plots

Seaborn provides two functions in particular that I like to use:

- `distplot` is a "nicer histogram" (deprecated since seaborn 0.11; `histplot` and `displot` are its replacements)
- jointplot lets us look at how two features vary together

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns

# sns.distplot is deprecated (seaborn >= 0.11). histplot with a KDE overlay on a
# density scale draws the equivalent "histogram + smooth curve" figure.
sns.histplot(item_price[item_price < 10_000], kde=True, stat='density');

In [None]:
# x, y and data are passed by keyword: positional use raises TypeError on seaborn >= 0.12.
# random_state makes the 10,000-row sample reproducible from run to run.
sns.jointplot(
    x='item_price',
    y='item_cnt_day',
    data=sales.sample(10_000, random_state=0),
    alpha=0.01,
);

In [None]:
# x, y and data are passed by keyword: positional use raises TypeError on seaborn >= 0.12.
# random_state makes the 10,000-row sample reproducible from run to run.
sns.jointplot(
    x='item_price',
    y='item_cnt_day',
    data=sales.sample(10_000, random_state=0),
    kind='hex',
);

In [None]:
# Keep only rows with a price below 10,000 and a daily count strictly between 0 and 10,
# so the dense region of the joint distribution is visible.
small_sales = sales[
    (sales.item_price < 10_000)
    & (sales.item_cnt_day < 10)
    & (sales.item_cnt_day > 0)
]

# x, y and data are passed by keyword: positional use raises TypeError on seaborn >= 0.12.
sns.jointplot(x='item_price', y='item_cnt_day', data=small_sales, kind='hex');

In [None]:
# Load the crime records (parsing the 'Dates' column) and scatter the X/Y columns.
crime = pd.read_csv(
    './data/sf_crime_truncated.csv',
    parse_dates=['Dates'],
)
crime.plot.scatter(x='X', y='Y', alpha=0.1);

In [None]:
# Drop rows whose Y value is 90 or more (presumably bad coordinates — confirm), then re-plot.
crime = crime.loc[crime['Y'] < 90]
crime.plot.scatter(x='X', y='Y', alpha=0.1);

In [None]:
# x, y and data are passed by keyword: positional use raises TypeError on seaborn >= 0.12.
sns.jointplot(x='X', y='Y', data=crime, alpha=0.1);

In [None]:
# x, y and data are passed by keyword: positional use raises TypeError on seaborn >= 0.12.
sns.jointplot(x='X', y='Y', data=crime, kind='kde');

In [None]:
# x, y and data are passed by keyword: positional use raises TypeError on seaborn >= 0.12.
sns.jointplot(x='X', y='Y', data=crime, kind='hex');

## Scatterplot matrices

Sometimes we'd like to look at a number of different variables and see how they vary together

In [None]:
# Pairwise scatter plots of every column except the last one ('Species'),
# with KDE curves on the diagonal and points coloured by class code.
features = iris.iloc[:, :-1]
pd.plotting.scatter_matrix(
    features,
    c=iris['Species'],
    diagonal='kde',
    figsize=(12, 12),
);

We can also do correlation heatmaps:

In [None]:
# Use a dedicated name so `d` (the iris dataset loaded earlier) is not overwritten;
# otherwise re-running any iris cell after this one would pick up the housing data.
housing_data = datasets.fetch_california_housing()
housing = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)
housing['MedVal'] = housing_data.target

In [None]:
# Pairwise correlation between all columns of the housing frame.
housing.corr()

In [None]:
# Pin the colour scale to [-1, 1] so the neutral midpoint of the diverging colormap
# sits at zero correlation; the trailing ';' hides the Axes repr.
sns.heatmap(housing.corr(), cmap='coolwarm', vmin=-1, vmax=1);

In [None]:
# x, y and data are passed by keyword: positional use raises TypeError on seaborn >= 0.12.
sns.jointplot(x='MedInc', y='MedVal', data=housing, kind='kde');

Open the [Pandas plotting lab][pandas-plotting-lab]

[pandas-plotting-lab]: ./pandas-plotting-lab.ipynb

In [None]:
import matplotlib.pyplot as plt

# Create the figure explicitly so its handle (fig) stays available after plotting.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
item_price.hist(ax=ax);

In [None]:
# Write the figure to mychart.png in the current working directory.
fig.savefig('mychart.png')

<img src="mychart.png">

Also see https://altair-viz.github.io