In [10]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure drawn afterwards.
sns.set()
# Keep the first three colors of the active seaborn palette under short names;
# `blue` is reused by the plotting helpers defined below.
# NOTE(review): the names assume the palette starts with blue, green, red; the
# default palette order differs between seaborn releases - confirm against the
# installed version.
blue, green, red = sns.color_palette()[:3]

# Remove Outliers Functions

In [4]:
# Remove outliers greater than (or, alternatively, lower than or equal to) a threshold (supports variable transformation on the fly)
def eda_remove_outliers_threshold(var, data, threshold, greaterThan=True, transform=None):
    """Drop, in place, the rows of ``data`` whose ``var`` value lies beyond ``threshold``.

    Parameters
    ----------
    var : column label of ``data`` to test.
    data : pandas.DataFrame; the matching rows are removed from it in place.
    threshold : cut-off compared against the (optionally transformed) column.
    greaterThan : when truthy, rows with a value ``> threshold`` are dropped;
        otherwise rows with a value ``<= threshold`` are dropped.
    transform : optional callable applied to the column before the comparison
        only; the values stored in ``data`` are not transformed.

    Prints the number of removed index labels and returns ``None``.
    """
    # Compare on the transformed values when a transformation is supplied.
    values = transform(data[var]) if transform else data[var]
    is_outlier = (values > threshold) if greaterThan else (values <= threshold)

    outliers = data[is_outlier].index
    data.drop(outliers, inplace=True)
    print('Number of {} outliers removed: {}'.format(var, outliers.shape[0]))
    
# Remove outliers from a filter (vector of true/false values)
def eda_remove_outliers(data, filter):
    """Drop, in place, the rows of ``data`` selected by ``filter``.

    Parameters
    ----------
    data : pandas.DataFrame; the selected rows are removed from it in place.
    filter : boolean selector (vector of true/false values) marking the rows
        to remove.

    Prints the number of removed index labels and returns ``None``.
    """
    flagged = data[filter].index
    removed_count = flagged.shape[0]
    data.drop(flagged, inplace=True)
    print('Number of outliers removed: {}'.format(removed_count))

# Visualization functions

In [7]:
# Bivariate scatter plot (support x/y transformation on the fly)
def eda_bivariate_scatter(x_var, y_var, data, transformx=None, transformy=None,
                          xlabel='', ylabel='', title='', xlim=None, ylim=None):
    """Draw a scatter plot of two columns of ``data`` and show it.

    Parameters
    ----------
    x_var, y_var : column labels plotted on the x and y axes.
    data : pandas.DataFrame holding both columns.
    transformx, transformy : optional callables applied to the x / y values
        before plotting; ``data`` itself is not modified.
    xlabel, ylabel, title : axis labels and figure title.
    xlim, ylim : axis limits forwarded to ``plt.xlim`` / ``plt.ylim``.
    """
    x, y = data[x_var], data[y_var]
    x = transformx(x) if transformx else x
    y = transformy(y) if transformy else y

    plt.figure(figsize=(12, 6))
    plt.scatter(x, y, color=blue, s=5)

    # Axis decoration.
    plt.xlim(xlim)
    plt.ylim(ylim)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

In [None]:
# wrapper around the value_counts function
def eda_value_counts(var, data):
    """Return the ``value_counts()`` of column ``var`` of ``data``.

    The result is a pandas Series indexed by the distinct values of the
    column, holding the number of occurrences of each.
    """
    column = data[var]
    return column.value_counts()

# Return the n most frequent values of a variable (keeping only every step-th entry of the frequency ranking)
def eda_top_occurences_list(var, data, n=5, step=1):
    """Return up to ``n`` of the most frequent values of column ``var``.

    The value counts of the column are sliced with ``[::step]`` first, so only
    every ``step``-th entry of the frequency ranking is kept; the labels of the
    first ``n`` remaining entries are returned as a list.
    """
    ranked = eda_value_counts(var, data)
    sampled = ranked[::step]
    top_labels = sampled.head(n).index.values
    return list(top_labels)

# Multivariate analysis (colors the data points according to a split variable, for the given split values only)
# (some variables have too many categories to plot them all)
def eda_multivariate_scatter(x_var, y_var, data, split_var, split_values, transformx=None, transformy=None,
                             xlabel='', ylabel='', title='', xlim=None, ylim=None, colors=[], figsize=(12, 6)):
    """Scatter plot of two columns, drawn as one colored series per split value.

    Parameters
    ----------
    x_var, y_var : column labels plotted on the x and y axes.
    data : pandas.DataFrame holding both columns.
    split_var : either a column label of ``data`` (a ``str``) or a
        pre-constructed Series that is compared against each split value.
    split_values : iterable of values; one scatter series is drawn per value,
        from the rows where the split variable equals it.
    transformx, transformy : optional callables applied to the x / y values of
        each series before plotting.
    xlabel, ylabel, title : axis labels and figure title.
    xlim, ylim : axis limits forwarded to ``plt.xlim`` / ``plt.ylim``.
    colors : optional sequence of colors matched by position to
        ``split_values``. Values without a matching color fall back to
        matplotlib's default color cycle. The default list is never mutated.
    figsize : figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)

    # Resolve the split variable once: a column label is looked up in the
    # dataframe, anything else is taken as an already-built Series.
    # isinstance also accepts str subclasses, unlike an exact type comparison.
    split_series = data[split_var] if isinstance(split_var, str) else split_var

    palette = colors if colors else ()

    for position, value in enumerate(split_values):
        mask = (split_series == value)

        x = data.loc[mask, x_var]
        y = data.loc[mask, y_var]

        if transformx:
            x = transformx(x)

        if transformy:
            y = transformy(y)

        # Positional color lookup; running out of colors no longer aborts the
        # plot half-way with an IndexError.
        color = palette[position] if position < len(palette) else None

        plt.scatter(x, y, label=value, s=5, color=color)

    plt.xlim(xlim)
    plt.ylim(ylim)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()

In [8]:
# horizontal barplot with its exact value printed
def eda_barh(var, data, xlabel='', ylabel='', title='', figsize=(12, 6), text_offset=10, height=0.5):
    """Horizontal bar plot of the value counts of a column, annotated with the counts.

    Parameters
    ----------
    var : column label of ``data`` whose values are counted.
    data : pandas.DataFrame holding the column.
    xlabel, ylabel, title : axis labels and figure title.
    figsize : figure size forwarded to ``plt.figure``.
    text_offset : horizontal gap, in data units, between the end of a bar and
        its printed count.
    height : bar thickness forwarded to ``plt.barh``.
    """
    # Ascending counts: barh draws the first entry at the bottom, so the most
    # frequent value ends up at the top.
    counts = data[var].value_counts().sort_values(ascending=True)
    positions = np.arange(counts.shape[0])

    plt.figure(figsize=figsize)
    plt.barh(positions, counts, tick_label=counts.index, align='center', height=height)

    # Print each exact count just past the end of its bar.
    for row, count in enumerate(counts):
        plt.text(x=count + text_offset, y=row, s=count, va='center', color=blue, fontweight='bold')

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()

In [9]:
# plot variable distribution with its transformed distribution (transformation on the fly)
def eda_histograms(var, data, xlabels, suptitle, transform=None, bins=None):
    """Plot side-by-side histograms of a column, its transform and its z-scores.

    With ``transform`` the figure has three panels: raw values, transformed
    values, z-scores of the transformed values. Without it there are two
    panels: raw values and their z-scores.

    Parameters
    ----------
    var : column label of ``data`` to plot.
    data : pandas.DataFrame holding the column; it is left unmodified.
    xlabels : sequence of x-axis labels, one per panel, in panel order
        (three entries with ``transform``, two without).
    suptitle : figure title.
    transform : optional callable applied to the values (e.g. ``np.log10``).
    bins : bin specification forwarded to ``sns.distplot``.
    """
    # Imported locally: the notebook's import cell does not bring zscore into scope.
    from scipy.stats import zscore

    x = data[var]
    if transform:
        # log10(0) workaround: shift every value by one. The shift is applied
        # to a new Series (not ``x += 1``) so the caller's dataframe column is
        # not altered as a side effect of plotting. As before, the shifted
        # values are what every panel displays.
        if transform == np.log10 and (x == 0).any():
            x = x + 1
        x_transform = transform(x)
        panels = [x, x_transform, zscore(x_transform)]
    else:
        panels = [x, zscore(x)]

    ncols = len(panels)
    fig, ax = plt.subplots(nrows=1, ncols=ncols, sharey=True, figsize=(24, 8))

    for position, values in enumerate(panels):
        sns.distplot(values, ax=ax[position], bins=bins, kde=False, rug=False)
        ax[position].set_xlabel(xlabels[position])

    ax[0].set_ylabel('# occurences')
    plt.suptitle(suptitle, y=0.95, size='xx-large')
    plt.show()

In [None]:
# Box plot sorted by decreasing median values that excludes categories whose sample size does not exceed the threshold.
# Prints the sample size in each box (very useful)
# (can be useful when a nominal variable has too many categories, used in another notebook I decided to leave)
def eda_boxplot_with_sample_size(x, y, data, sample_size_min, xlabel, ylabel, figsize=(12, 8), text_size='small', text_offset=1.5):
    """Box plot of ``y`` per category of ``x``, ordered by decreasing median.

    Only categories with strictly more than ``sample_size_min`` rows are
    drawn, and each box is annotated with its sample size ("n = ...").

    Parameters
    ----------
    x : label of the categorical column (one box per category).
    y : label of the column whose distribution is plotted.
    data : pandas.DataFrame holding both columns.
    sample_size_min : categories whose row count does not exceed this value
        are left out.
    xlabel, ylabel : axis labels.
    figsize : figure size forwarded to ``plt.figure``.
    text_size : font size of the sample-size annotations.
    text_offset : vertical distance, in data units, between a box median and
        its annotation.
    """
    # Rows per category, restricted to the categories that are large enough.
    counts = data[x].value_counts()
    keep = counts > sample_size_min
    counts = counts[keep]

    # Median of y per category, for the kept categories, largest first.
    medians = data.groupby([x])[y].median()[keep].sort_values(ascending=False)

    # The box order follows the sorted medians; the counts are aligned to it.
    order = medians.index.values
    labels = ["n = " + str(count) for count in counts.reindex(order).tolist()]

    plt.figure(figsize=figsize)
    ax = sns.boxplot(x=x, y=y, data=data, order=order)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    # Write the sample size next to the median line of each box.
    for position, (median, label) in enumerate(zip(medians, labels)):
        ax.text(position,
                median + text_offset,
                label,
                horizontalalignment='center',
                size=text_size,
                color='w',
                weight='semibold')

    plt.show()

# Boxplot of Log(SalePrice) VS Other variable 
def eda_saleprice_boxplot(x, figsize=(20, 8), text_offset=0.025):
    """Box plot of the ``SalePriceLog`` column against the column ``x``.

    Thin wrapper around ``eda_boxplot_with_sample_size`` with a minimum sample
    size of 0 and the x-axis labelled with the column name.

    Relies on a notebook-level DataFrame named ``data`` (it is not a
    parameter), which must hold both ``x`` and ``SalePriceLog``.

    Parameters
    ----------
    x : label of the categorical column (one box per category).
    figsize : figure size forwarded to the underlying plot.
    text_offset : vertical distance between a box median and its sample-size
        annotation.
    """
    eda_boxplot_with_sample_size(x=x,
                                 y='SalePriceLog',
                                 data=data,  # notebook-level dataframe, see docstring
                                 sample_size_min=0,
                                 xlabel=x,
                                 # Raw string: "\l" and "\," are not valid escape
                                 # sequences in a regular string literal.
                                 ylabel=r'$\log_{10}(sale\,price)$',
                                 figsize=figsize,
                                 text_size='medium',
                                 text_offset=text_offset)