# Crosstabs Example

In [18]:
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
import bokeh

In [3]:
from bokeh.plotting import figure, show
from bokeh.palettes import Set1
from bokeh.io import save
from bokeh.layouts import gridplot
from bokeh.resources import CDN

In [2]:
def plot_feature_histograms(df_list, names=None, nbins=200, colors=None):
    """
    Contrast the distribution of every shared column across dataframes.

    For each column that exists in all dataframes of ``df_list``, builds one
    bokeh figure holding an overlaid, density-normalised histogram per
    dataframe. All histograms within a figure share the same bin edges, which
    are computed from the pooled values of that column.

    The input dataframes are not modified.

    Args:
        df_list: non-empty list of dataframes
        names: list of dataframe names used as legend entries; defaults to
            'dataframe_00', 'dataframe_01', ...
        nbins: the number of bins in the histogram.
        colors: list of colors to use, one per dataframe; defaults to the
            leading entries of the bokeh ``Set1[9]`` palette
    Returns:
        Tuple ``(figures, columns)``: a list of bokeh figures and the list of
        common column names (as strings) in the same order as the figures.
        Columns are ordered by their string representation.
    Raises:
        ValueError: if ``df_list`` is empty, if ``names`` or ``colors`` is
            given with a length different from ``df_list``, if no colors are
            given for more dataframes than the default palette holds, or if
            the dataframes share no column.
    """

    # set.intersection(*[]) would otherwise fail with an unhelpful TypeError
    if len(df_list) == 0:
        raise ValueError("At least one dataframe is required.")

    if names and len(names) != len(df_list):
        raise ValueError("The number of names and dataframes must match.")

    if colors and len(colors) != len(df_list):
        raise ValueError("The number of colors and dataframes must match.")

    if not names:
        names = ['dataframe_%.02d' % idx for idx in range(len(df_list))]

    if not colors:
        if len(df_list) > len(Set1[9]):
            raise ValueError("You must specify colors for this many dataframes.")
        colors = Set1[9][:len(df_list)]

    shared = set.intersection(*[set(df.columns) for df in df_list])

    if not shared:
        raise ValueError("The dataframes have no columns in common.")

    # Keep the original labels for indexing and use their string form only for
    # display. This leaves the callers' dataframes untouched, and building real
    # lists (not one-shot `map` iterators) keeps the returned names usable
    # after the plotting loop. Sorting makes the figure order reproducible.
    ordered_cols = sorted(shared, key=str)
    col_labels = [str(col) for col in ordered_cols]

    # now we can plot each column, grouped by the dataframe name
    res = []
    for col, label in zip(ordered_cols, col_labels):

        # get the bin edges from the pooled values so every dataframe's
        # histogram is directly comparable
        all_values = pd.concat([df[col] for df in df_list], ignore_index=True)
        _, all_edges = np.histogram(all_values, bins=nbins)

        # make a figure
        p = figure(title="Distribution of %s" % label,
                   tools=["save", "zoom_in", "zoom_out", "xwheel_zoom",
                          "ywheel_zoom", "box_zoom", "reset", "pan"],
                   background_fill_color="#ffffff")

        # add a histogram plot per dataframe
        # NOTE(review): recent bokeh releases expect `legend_label` instead of
        # `legend`; confirm against the installed bokeh version.
        for idx, df in enumerate(df_list):
            hist, edges = np.histogram(df[col], density=True, bins=all_edges)
            p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
                   fill_color=colors[idx], line_color=colors[idx],
                   alpha=0.3, legend=names[idx])

        p.legend.location = 'top_right'
        p.xaxis.axis_label = label
        p.yaxis.axis_label = 'density'
        res.append(p)

    return res, col_labels

In [20]:
# Load the example data set; a 'score' column is used for ranking below.
df = pd.read_csv('test_data.csv')

In [24]:
# Rank rows from highest to lowest score so the top of the frame holds the
# highest-scoring rows.
df = df.sort_values(by='score', ascending=False)

In [27]:
# Number of rows in the top 10% of the frame, used as the split position below.
cutoff = round(len(df) * 0.1)

In [28]:
# Split the score-sorted frame positionally: the first `cutoff` rows form the
# high-risk group, everything after them the low-risk group.
df_highrisk, df_lowrisk = df.iloc[:cutoff], df.iloc[cutoff:]

In [29]:
# Sanity check: (rows, columns) of the low-risk remainder.
df_lowrisk.shape

(450, 92)

In [30]:
# Sanity check: (rows, columns) of the high-risk group; row count equals `cutoff`.
df_highrisk.shape

(50, 92)

In [19]:
# Build one figure per shared column comparing the two groups. No names are
# passed, so the legend uses the defaults: 'dataframe_00' is the low-risk
# frame and 'dataframe_01' the high-risk frame.
res, ccols = plot_feature_histograms([df_lowrisk, df_highrisk])

