In [None]:
# Plot target var
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.style as style

style.use('fivethirtyeight')

def plot_target(df, target):
    """
    Draw a histogram, a Q-Q plot and a box plot of one DataFrame column.

    :param df: pandas DataFrame holding the data
    :param target: name of the column in ``df`` to visualise
    :return: None; the panels are drawn on a newly created matplotlib figure
    """
    # Plot the column the caller asked for (previously hard-coded to 'target',
    # which silently ignored the ``target`` argument).
    y = df[target]

    fig = plt.figure(constrained_layout=True, figsize=(15, 8))
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)

    # Histogram: first row, first two columns. The third column is reserved
    # for the box plot, so the two axes no longer overlap.
    ax1 = fig.add_subplot(grid[0, :2])
    ax1.set_title('Histogram')
    sns.distplot(y, norm_hist=True, ax=ax1)

    # Q-Q plot: second row, first two columns. probplot writes its own title
    # onto the axes it is given, so ours is applied afterwards.
    ax2 = fig.add_subplot(grid[1, :2])
    scipy.stats.probplot(y, plot=ax2)
    ax2.set_title('QQ Plot')

    # Box plot: full height of the third column. The data is passed by
    # keyword so the call does not depend on seaborn's positional order.
    ax3 = fig.add_subplot(grid[:, 2])
    ax3.set_title('Box Plot')
    sns.boxplot(y=y, orient='v', ax=ax3)

In [None]:
import keras
import matplotlib.pyplot as plt

# plot keras history
def plot_history(history: keras.callbacks.History):
    """
    Plot the curves recorded in a Keras ``History`` object.

    One subplot is drawn per entry of ``history.history`` whose key does not
    start with ``'val_'``. When a matching ``'val_<metric>'`` entry exists it is
    drawn on the same subplot. Both series are sub-sampled with the same stride
    (``len(history.epoch) // 20``, but never less than 1).

    :param history: object exposing ``epoch`` (sequence of epoch numbers) and
        ``history`` (mapping from metric name to a per-epoch sequence)
    :return: None
    """
    metrics = [name for name in history.history if not name.startswith('val_')]
    if not metrics:
        # Nothing to draw; plt.subplots(0) would raise.
        return

    # Integer division yields 0 for runs shorter than 20 epochs, and a slice
    # step of 0 raises ValueError, so the stride is clamped to at least 1.
    stride = max(1, len(history.epoch) // 20)
    plotted_epochs = history.epoch[::stride]

    fig, subplots = plt.subplots(len(metrics), figsize=(8, 4 * len(metrics)))
    # plt.subplots returns a bare Axes (not a sequence) when asked for one plot.
    subplots = subplots if len(metrics) != 1 else (subplots,)
    fig.tight_layout(h_pad=3, rect=[0, 0, 1, 0.95])
    fig.suptitle('Model training history', fontsize=18)

    for metric, subplot in zip(metrics, subplots):
        subplot.plot(plotted_epochs, history.history[metric][::stride], marker='.')
        validation = history.history.get(f'val_{metric}')
        if validation is not None:
            # Same stride as the training curve so x and y lengths match.
            subplot.plot(plotted_epochs, validation[::stride], marker='.')
        subplot.set_xticks(plotted_epochs)
        subplot.set_ylabel(metric)
        subplot.set_xlabel('epoch')

    # Extra keys beyond the training metrics mean validation curves were recorded.
    if len(metrics) != len(history.history):
        fig.legend(['training', 'validation'])

In [None]:
from scipy.stats import chi2_contingency
import pandas as pd
import numpy as np
import seaborn as sns
    
# PLOT CORRELATIONS
def plot_chi2_heatmap(df, columns_to_compare):
    """
    Draw a heatmap of pairwise chi-squared statistics between columns.

    For every ordered pair of distinct columns the statistic comes from
    ``chi2_contingency`` applied to their cross-tabulation; the diagonal
    (a column paired with itself) is filled with 0.

    :param df: pandas DataFrame containing the columns
    :param columns_to_compare: column names to compare, e.g. ``df.columns.values``
    :return: None; the heatmap is drawn with seaborn
    """
    size = len(columns_to_compare)

    statistics = []
    for row_name in columns_to_compare:
        for col_name in columns_to_compare:
            if row_name == col_name:
                statistics.append(0)
                continue
            contingency = pd.crosstab(df[row_name], df[col_name])
            statistics.append(chi2_contingency(contingency)[0])

    # Row-major flat list -> square matrix labelled with the column names.
    matrix = pd.DataFrame(
        np.array(statistics).reshape((size, size)),
        index=columns_to_compare,
        columns=columns_to_compare,
    )
    sns.heatmap(matrix)

In [None]:
# downcast dataframe to save memory usage
def downcast_dtypes(df):
    """
    Shrink numeric columns of ``df`` in place to reduce memory usage.

    * ``float64`` columns are converted to ``float32`` (this reduces precision).
    * ``int64`` / ``int32`` columns are converted to the narrowest of
      ``int16`` / ``int32`` that can hold the column's minimum and maximum.
      A column whose values fit neither is left untouched, so values are never
      wrapped around by the cast.

    :param df: pandas DataFrame; it is modified in place
    :return: the same DataFrame object that was passed in
    """
    for name in list(df.columns):
        column = df[name]
        if column.dtype == "float64":
            df[name] = column.astype(np.float32)
        elif column.dtype in ["int64", "int32"]:
            if column.empty:
                # No values to overflow; keep the historical int16 target.
                target = np.int16
            else:
                low, high = column.min(), column.max()
                target = None
                for candidate in (np.int16, np.int32):
                    limits = np.iinfo(candidate)
                    if limits.min <= low and high <= limits.max:
                        target = candidate
                        break
            if target is not None and column.dtype != target:
                df[name] = column.astype(target)
    return df

In [None]:
# perform TSNE
import pandas as pd
from sklearn.manifold import TSNE

def plot_tsne(X, y, perplexity=100, learning_rate=200, n_components=2, random_state=None):
    """
    Embed ``X`` with t-SNE and scatter the first two components coloured by ``y``.

    :param X: feature matrix passed to ``TSNE.fit_transform``
    :param y: labels, one per row of ``X``; they are matched to the embedding
        by position, not by index. The fixed palette only defines colours for
        the labels 0 and 1.
    :param perplexity: forwarded to ``TSNE``
    :param learning_rate: forwarded to ``TSNE``
    :param n_components: forwarded to ``TSNE``; at least 2 are needed because
        components ``x0`` and ``x1`` are plotted
    :param random_state: forwarded to ``TSNE``; pass an int for a reproducible
        embedding (default ``None`` keeps the previous behaviour)
    :return: None
    """
    tsne = TSNE(n_components=n_components, init='random',
                random_state=random_state, perplexity=perplexity,
                learning_rate=learning_rate, verbose=1)
    result = pd.DataFrame(tsne.fit_transform(X))
    # Name however many components were requested: x0, x1, ...
    result.columns = [f'x{i}' for i in range(result.shape[1])]
    # Attach labels positionally; an index-based join misaligns them whenever
    # y carries anything other than the default 0..n-1 index.
    result['y'] = np.asarray(y).ravel()

    # Keyword arguments: newer seaborn releases accept only ``data`` positionally.
    sns.lmplot(x='x0', y='x1', data=result, fit_reg=False, hue='y',
               palette={0: "#2662c1", 1: "#c9001e"},
               scatter_kws={'alpha': .5})
    plt.title('t-SNE plot')
    plt.show()

In [None]:
def auto_dummies(df):
    """
    One-hot encode every ``category``-dtype column of a DataFrame.

    The first level of each encoded column is dropped (``drop_first=True``).

    :param df: Pandas DataFrame
    :return: Pandas DataFrame produced by ``pd.get_dummies``
    """
    categorical_columns = df.select_dtypes(include='category').columns
    return pd.get_dummies(
        df,
        columns=list(categorical_columns),
        drop_first=True,
    )

In [None]:
def auto_remove_novariance(df):
    """
    Drop every column that holds exactly one distinct value.

    Distinct values are counted with ``nunique``.

    :param df: Pandas DataFrame
    :return: Pandas DataFrame without the single-valued columns
    """
    distinct_counts = df.nunique()
    constant_columns = [
        name for name, count in distinct_counts.items() if count == 1
    ]
    return df.drop(columns=constant_columns)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# get feature importance
def get_rf_feat_importances(X, y):
    """
    Rank the columns of ``X`` by random-forest feature importance.

    A ``RandomForestClassifier`` with 20 trees and ``random_state=42`` is fitted
    on ``(X, y)``.

    :param X: feature table exposing ``.columns`` (e.g. a pandas DataFrame)
    :param y: target values passed to ``fit``
    :return: DataFrame with columns ``feature`` and ``importance``, sorted by
        ``importance`` in descending order
    """
    forest = RandomForestClassifier(n_estimators=20, random_state=42)
    forest.fit(X, y)

    ranking = pd.DataFrame({
        'feature': X.columns,
        'importance': forest.feature_importances_,
    })
    return ranking.sort_values(by=['importance'], ascending=False)

def plot_feature_importance_gbc(clf, feature_names, topk = 25, figsize = (50,70) ):
    """
    Draw a horizontal bar chart of the ``topk`` largest feature importances.

    :param clf: fitted estimator exposing ``feature_importances_``
    :param feature_names: feature names in the same order as the importances;
        any sequence is accepted (list, numpy array, pandas Index, ...)
    :param topk: number of highest-importance features to show
    :param figsize: size passed to ``plt.figure``
    :return: the figure created for the chart
    """
    fig = plt.figure(figsize=figsize)
    importances = np.asarray(clf.feature_importances_)
    # Coerce to an array so that plain lists can be indexed with sorted_idx.
    names = np.asarray(feature_names)

    # argsort is ascending, so the last ``topk`` positions are the largest.
    sorted_idx = np.argsort(importances)[-topk:]
    padding = np.arange(len(sorted_idx)) + 0.5

    plt.barh(padding, importances[sorted_idx],
             color="b", alpha=0.5, align="center")
    plt.tick_params(axis='y', which='major', labelsize=10)
    plt.yticks(padding, names[sorted_idx])
    return fig

def plot_feature_importance(rf, feature_names, topk = 25, errorbar=False, figsize = (50,70) ):
    """
    Draw and show a horizontal bar chart of the ``topk`` largest importances.

    :param rf: fitted estimator exposing ``feature_importances_``; when
        ``errorbar`` is true it must also expose ``estimators_``, an iterable of
        sub-estimators that each have ``feature_importances_``
    :param feature_names: feature names in the same order as the importances;
        any sequence is accepted (list, numpy array, pandas Index, ...)
    :param topk: number of highest-importance features to show
    :param errorbar: if true, add error bars equal to the standard deviation of
        the importances across ``rf.estimators_``
    :param figsize: size passed to ``plt.figure``
    :return: the figure created for the chart
    """
    fig = plt.figure(figsize=figsize)
    importances = np.asarray(rf.feature_importances_)
    # Coerce to an array so that plain lists can be indexed with sorted_idx.
    names = np.asarray(feature_names)

    # argsort is ascending, so the last ``topk`` positions are the largest.
    sorted_idx = np.argsort(importances)[-topk:]
    padding = np.arange(len(sorted_idx)) + 0.5

    bar_options = dict(color="b", alpha=0.5, align="center")
    if errorbar:
        # Only touch estimators_ when error bars are requested, so estimators
        # without per-tree importances still work with errorbar=False.
        std = np.std([tree.feature_importances_ for tree in rf.estimators_],
                     axis=0)
        bar_options["xerr"] = std[sorted_idx]

    plt.barh(padding, importances[sorted_idx], **bar_options)
    plt.tick_params(axis='y', which='major', labelsize=10)
    plt.yticks(padding, names[sorted_idx])
    plt.show()
    return fig

In [None]:
# print False Positive and False Negative samples
def get_fp_fn_samples(test_y, test_y_pred, test_txt, limit=20):
    """
    Print misclassified samples of a binary (0/1) classifier.

    A false positive is a position where the true label is 0 and the
    prediction is 1; a false negative is the reverse. For each group at most
    ``limit`` samples are printed as ``<position> <true label> : <text>``.

    :param test_y: true labels
    :param test_y_pred: predicted labels, aligned with ``test_y`` by position
    :param test_txt: sample texts, aligned with ``test_y`` by position
    :param limit: maximum number of samples printed per group
    :return: None
    """
    # Materialise the inputs so they are walked and indexed by position,
    # regardless of the container type that was passed in.
    truths = list(test_y)
    predictions = list(test_y_pred)
    texts = list(test_txt)

    i_lst_fp = [i for i, (truth, pred) in enumerate(zip(truths, predictions))
                if truth == 0 and pred == 1]
    i_lst_fn = [i for i, (truth, pred) in enumerate(zip(truths, predictions))
                if truth == 1 and pred == 0]

    print('\nfalse positive')
    for i in i_lst_fp[:limit]:
        print(i, truths[i], ':', texts[i])
    print('false negative')
    for i in i_lst_fn[:limit]:
        print(i, truths[i], ':', texts[i])

In [None]:
def remove_outliers(df, outlier_column_name, drop_anomalies=False, threshold=3):
    """
    Score one column with z-scores and optionally drop the outlying rows.

    The caller's DataFrame is never modified.

    * ``drop_anomalies=False``: returns a new DataFrame with an extra column
      ``<outlier_column_name>_z`` holding the z-scores.
    * ``drop_anomalies=True``: returns the rows whose absolute z-score is below
      ``threshold`` (without a z-score column) and prints how many rows were
      removed.

    Missing values are ignored when computing the mean and standard deviation.
    Rows whose z-score is undefined (missing value, or a constant column) are
    not treated as outliers and are kept.

    :param df: pandas DataFrame
    :param outlier_column_name: name (string) of the numeric column to score
    :param drop_anomalies: whether to drop outlying rows
    :param threshold: absolute z-score at or above which a row is an outlier
    :return: pandas DataFrame
    """
    from scipy.stats import zscore

    values = df[outlier_column_name].astype(float).to_numpy()
    # 'omit' keeps NaNs out of the mean/std so one missing value does not turn
    # every score into NaN.
    z_scores = zscore(values, nan_policy='omit')

    if not drop_anomalies:
        # assign() builds a new frame instead of writing into the caller's.
        return df.assign(**{outlier_column_name + '_z': z_scores})

    # NaN compares False against the threshold, so undefined scores are kept.
    keep = ~(abs(z_scores) >= threshold)
    kept = df[keep]
    removed = df.shape[0] - kept.shape[0]
    print(f"{removed} outliers for {outlier_column_name} have been removed")
    return kept


def remove_entries_outside_iq_range(df, col, whisker=1.0):
    """
    Drop rows whose value in ``col`` lies outside the inter-quartile fences.

    With ``q1`` and ``q3`` the 25% and 75% quantiles of ``df[col]`` and
    ``iqr = q3 - q1``, a row is removed when its value is below
    ``q1 - whisker * iqr`` or above ``q3 + whisker * iqr``. Rows with a
    missing value in ``col`` satisfy neither comparison and are kept.

    :param df: pandas DataFrame
    :param col: name of the column to filter on
    :param whisker: multiple of the IQR used for the fences (default 1.0)
    :return: filtered pandas DataFrame
    """
    # Quartiles come from the requested column (previously they were always
    # read from a column named 'diff_entry').
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower = q1 - whisker * iqr
    upper = q3 + whisker * iqr
    outside = (values < lower) | (values > upper)
    return df[~outside]

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold
from metrics_helper import get_confusion_rates
from scipy import interp


def plot_class_hist(data, target, feature, kde=False):
    """
    Overlay two normalised histograms of one feature, one per class label,
    for a binary classification problem (labels 1 and 0).

    Thin wrapper around Seaborn's .distplot().

    Parameters:
    data    : pd.DataFrame holding both the target and the feature column
    target  : name of the target column in data (string)
    feature : name of the feature column to plot (string)
    kde     : whether to also draw a kernel density estimate (boolean)
    (C) Aleksander Molak, 2018 MIT License || https://github.com/AlxndrMlk/
    """
    # (class value, legend label, colour) -- class 1 is drawn first.
    class_styles = (
        (1, '1', '#b71633'),
        (0, '0', '#417adb'),
    )
    for class_value, legend_label, colour in class_styles:
        in_class = data[target] == class_value
        sns.distplot(data[in_class][feature],
                     label=legend_label, color=colour,
                     norm_hist=True, kde=kde)

    plt.ylabel('Frequency')
    plt.title(feature)
    plt.legend()
    plt.show()

def plot_roc(y, y_pred_prob):
    '''
    Add a ROC curve for a binary classifier to the current matplotlib axes.

    y           : true labels, passed to roc_curve
    y_pred_prob : 2-D array of scores; column 1 is used as the score of the
                  positive class
    The curve's legend label carries the area under the curve (4 decimals).
    '''
    positive_scores = y_pred_prob[:, 1]
    fpr, tpr, _ = roc_curve(y, positive_scores)
    area = auc(fpr, tpr)
    curve_label = 'ROC (area = %0.4f)' % area
    plt.plot(fpr, tpr, lw=1, label=curve_label)

def plot_roc_cv(classifier, X, y, cv):
    '''
    Plot one ROC curve per cross-validation fold plus their mean curve.

    For every (train, test) split the classifier is fitted on the training
    rows and scored on the test rows; column 1 of predict_proba is used as the
    positive-class score. Each fold's curve is interpolated onto a common grid
    of 100 false-positive rates and the interpolated curves are averaged.

    classifier : estimator with fit(X, y) returning itself and predict_proba(X)
    X, y       : features and labels; they are indexed as X[train] / y[test],
                 so they must support integer-array indexing (e.g. numpy arrays)
    cv         : any iterable of (train_indices, test_indices) pairs, including
                 a generator such as splitter.split(X, y); an object that has a
                 split method but is not itself iterable is also accepted, e.g.
                 cv = KFold(n_splits=5)
    Raises ValueError if cv yields no splits.
    '''
    # Modern splitter objects are not iterable themselves; ask them for splits.
    if hasattr(cv, 'split') and not hasattr(cv, '__iter__'):
        cv = cv.split(X, y)
    # Materialise once: generators have no len() and can only be walked once.
    folds = list(cv)
    if not folds:
        raise ValueError("cv must yield at least one (train, test) split")

    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(mean_fpr)

    for i, (train, test) in enumerate(folds):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # ROC curve and area under it for this fold.
        fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(folds)
    # Pin the end points of the averaged curve to (0, 0) and (1, 1).
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
    