# Data visualization

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
from os.path import join
import sys

# Locate the shared data directory, two levels above the working directory
cwd = os.getcwd()
data_path = join(cwd, '..', '..', 'data')

# rc overrides handed to seaborn: thicker axes lines, grid on,
# and no spines on the top/right sides
rc_overrides = {
    'axes.linewidth': 1.5,
    'axes.grid': True,
    'axes.spines.right': False,
    'axes.spines.top': False,
}

# Apply the plotting theme used for the rest of the notebook
sns.set(
    style='white',
    palette='tab10',
    context='notebook',
    font_scale=1.25,
    rc=rc_overrides,
)

In [None]:
# Load IPython's "autoreload" extension so edited project modules
# can be picked up without restarting the kernel
%load_ext autoreload

# Mode 1: always reload modules marked with "%aimport"
%autoreload 1

# Add the 'src' directory as one where we can import modules.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
src_dir = join(os.getcwd(), '..', '..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
# Import functions to load/clean the data files
%aimport data.filter_outliers
from data.filter_outliers import filter_outliers

## Load data

In [None]:
# Build the path to the processed CSV, then read it into the main dataframe
path = join(
    data_path,
    'processed',
    'facility_gen_emissions.csv',
)
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

In [None]:
# Summary statistics for the loaded dataframe
df.describe()

In [None]:
# Number of non-null entries in each of three key columns
tuple(len(df[col].dropna()) for col in ('state', 'facility_name', 'year'))

In [None]:
# Distinct fuel categories present in the full dataframe
df['fuel_category'].unique()

## Create a second dataframe with only records that have both *net_gen_mwh* and EPA emissions data
This is what we would have gotten with an inner join. From the information above it looks like there are rows missing only some of the EPA values. It seems like a safe assumption that every EPA record has a value for `year`.

EIA net generation values can be negative. I'm going to use this data to calculate emission rates and don't want to deal with negative values for now so I'll only include records with positive (and non-zero) net generation. This will make plotting easier, but it isn't necessarily the best way to deal with this data for the purpose of analysis.

In [None]:
# Keep only the rows that carry a `year` value. The explicit .copy() is
# here because pandas was raising a SettingWithCopyWarning later on when
# working with the returned dataframe.
df2 = df.loc[df['year'].notna()].copy()

In [None]:
# Row count after dropping records without a year
df2.shape[0]

In [None]:
# Distinct fuel categories remaining in the filtered dataframe
df2['fuel_category'].unique()

In [None]:
# Keep only records with strictly positive net generation.
# Take an explicit copy again: this filter produces a new slice of df2, and
# columns are assigned to df2 further down, which is the situation that
# triggers pandas' SettingWithCopyWarning.
df2 = df2.loc[df2['net_gen_mwh'] > 0].copy()
len(df2)

## Matplotlib
[Matplotlib](https://matplotlib.org/) is probably the most popular Python plotting package and it improved significantly with version 2.0. If you can imagine a figure, and have several hours/days of spare time, you can probably make it with matplotlib. On the plus side it allows you to customize nearly every pixel on a figure.

In [None]:
# Total generation by month.
# numeric_only=True keeps non-numeric columns (e.g. facility_name) out of
# the aggregation; summing them is meaningless and newer pandas versions
# no longer drop them automatically.
total_monthly = df.groupby('month', as_index=False).sum(numeric_only=True)

plt.plot(total_monthly['month'], total_monthly['net_gen_mwh'])
plt.xlabel('Month')
plt.ylabel('Total generation (MWh)')

In [None]:
# Total generation by month and fuel category.
# numeric_only=True keeps non-numeric columns out of the aggregation
# (newer pandas versions no longer drop them automatically).
total_monthly_fuel = (
    df.groupby(['month', 'fuel_category'], as_index=False)
    .sum(numeric_only=True)
)

# Draw one line per fuel category
for fuel in total_monthly_fuel['fuel_category'].unique():
    # Select this fuel's rows once and reuse the mask for both axes
    fuel_rows = total_monthly_fuel['fuel_category'] == fuel
    x = total_monthly_fuel.loc[fuel_rows, 'month']
    y = total_monthly_fuel.loc[fuel_rows, 'net_gen_mwh']
    plt.plot(x, y, label=fuel)

plt.xlabel('Month')
plt.ylabel('Total generation (MWh)')
# Place the legend just outside the right edge of the axes
plt.legend(loc=(1.02, 0.3))

### Plot with Pandas
Pandas has several matplotlib wrapper methods that let you plot the data in a dataframe. It will use the information (e.g. column names) to provide some labels on the figure.

## Plot with Seaborn
Seaborn also wraps functions around matplotlib and provides much more powerful tools for exploring data. It was designed to work closely with Pandas dataframes. I highly recommend going through the [Seaborn tutorial](https://seaborn.pydata.org/tutorial.html). 

### Distributions

### Categorical

Now I'll use the month as an **ordinal** variable on the x axis.

First I want to calculate the monthly emissions rate for each facility and emissions type.

$$rate = \frac{mass}{generation}$$

In [None]:
# Emission rate = emitted mass (kg) / net generation (MWh), stored in a
# new '<pollutant>_rate' column for each pollutant.
for col in ['nox_kg', 'so2_kg', 'co2_kg']:
    rate_col = col.replace('kg', 'rate')
    rate = df2[col] / df2['net_gen_mwh']
    # Replace infinite rates with 0 and assign the result back explicitly.
    # Calling .replace(..., inplace=True) on df2[rate_col] acts on a
    # temporary Series, which pandas warns about and which does not update
    # df2 when Copy-on-Write is enabled.
    df2[rate_col] = rate.replace(np.inf, 0)

In [None]:
# Some emission-rate values are extreme and may need filtering, so the
# summary includes the 95th percentile alongside the quartiles.
df2.describe(percentiles=[0.25, 0.5, 0.75, 0.95])

In [None]:
# Run the project's outlier filter over the three emission-rate columns.
# NOTE(review): presumably removes rows above the 95th percentile of each
# listed column — confirm against data.filter_outliers.
df2 = filter_outliers(df2, columns=['co2_rate', 'so2_rate', 'nox_rate'], percentile=0.95)

#### FacetGrid
FacetGrid is a lower-level but incredibly powerful plotting tool. It lets you create a grid with parameters splitting data by row, column, and hue. Then you can plot any function on that grid.

What's the difference when we change hue to `month` rather than `fuel_category`?

Seaborn assumes that the `hue` parameter represents something categorical. We can use it for a quantitative variable but it takes some work. 

In [None]:
# This code is based on a stackoverflow example
# https://stackoverflow.com/questions/44641669/

def facet_scatter(x, y, c, **kwargs):
    """Scatter *x* against *y*, coloring the points by the values in *c*.

    Written to be passed to ``FacetGrid.map``. Any ``color`` keyword is
    discarded so that it does not compete with the per-point colors given
    by *c*; the remaining keyword arguments go straight to ``plt.scatter``.
    """
    # Use a default so the function also works when no `color` keyword
    # is supplied (e.g. when it is called directly).
    kwargs.pop("color", None)
    plt.scatter(x, y, c=c, **kwargs)

# Exclude nuclear records, then build one facet per remaining fuel category,
# wrapping to a new row after every three columns
data = df2.loc[df2.fuel_category != 'Nuclear']
g = sns.FacetGrid(data=data, col='fuel_category', col_wrap=3, aspect=1.3)

# Create a normalized colormap spanning the full range of net generation,
# so every facet shares the same color scale
vmin, vmax = data.net_gen_mwh.min(), data.net_gen_mwh.max()
cmap = plt.cm.viridis_r
norm=plt.Normalize(vmin=vmin, vmax=vmax)


# In each facet, plot NOx rate against CO2 rate with points colored by
# net generation
g.map(facet_scatter, 'co2_rate', 'nox_rate', 'net_gen_mwh', norm=norm, cmap=cmap, alpha=0.3)

# Make space for the colorbar
g.fig.subplots_adjust(right=.92)

# Define a new Axes where the colorbar will go
cax = g.fig.add_axes([.94, .25, .02, .6])

# Get a mappable object with the same colormap and value range as the data.
# NOTE(review): this empty scatter is drawn on whatever axes is current,
# presumably `cax` right after add_axes — keep the statement order as is.
points = plt.scatter([], [], c=[], vmin=vmin, vmax=vmax, cmap=cmap)

# Draw the colorbar
g.fig.colorbar(points, cax=cax)

## Regressions