In [1]:
def data_summary(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these columns, in order:
        ``Name``, ``dtypes``, ``Null/NaNs``, ``Null/NaNs %``,
        ``Unique values``, ``Unique values %``, ``First Value``,
        ``Second Value``, ``Third Value``, ``Skewness``, ``Kurtosis``.
        Percentages are relative to the number of rows and rounded to two
        decimals (NaN when ``df`` has no rows).  The three sample values are
        taken by position and are ``None`` when ``df`` has fewer rows.
        ``Skewness`` and ``Kurtosis`` are computed per column and are NaN for
        columns skipped by ``numeric_only=True``.
    """
    print(f"Dataset Shape: {df.shape}")
    n_rows, n_cols = df.shape
    null_counts = df.isnull().sum().values
    unique_counts = df.nunique().values

    def _percent_of_rows(counts):
        # An empty frame has no meaningful percentage; report NaN instead of
        # dividing by zero.
        if n_rows == 0:
            return np.full(n_cols, np.nan)
        return np.round((counts / n_rows) * 100, decimals=2)

    def _row_values(position):
        # Positional lookup so a non-default index (dates, strings, filtered
        # frames) works; short frames yield None instead of raising.
        if position < n_rows:
            return df.iloc[position].values
        return [None] * n_cols

    summary = pd.DataFrame({'Name': list(df.columns), 'dtypes': df.dtypes.values})
    summary['Null/NaNs'] = null_counts
    summary['Null/NaNs %'] = _percent_of_rows(null_counts)
    summary['Unique values'] = unique_counts
    summary['Unique values %'] = _percent_of_rows(unique_counts)
    summary['First Value'] = _row_values(0)
    summary['Second Value'] = _row_values(1)
    summary['Third Value'] = _row_values(2)

    # Shape statistics are per column (axis=0).  The result only contains the
    # numeric columns, so reindex it onto every column of df to line the
    # values up with the summary rows.
    skewness = df.skew(axis=0, skipna=True, numeric_only=True)
    kurtosis = df.kurt(axis=0, skipna=True, numeric_only=True)
    summary['Skewness'] = skewness.reindex(df.columns).values
    summary['Kurtosis'] = kurtosis.reindex(df.columns).values
    return summary

In [2]:
def retrieve_value_counts(data):
    """Collect the value counts of every object-dtype column in one table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns are tallied.  Missing values are
        counted as their own entry (``dropna=False``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` (source column), ``Values`` (distinct value) and
        ``ValueCount`` (occurrences), with the rows of each source column
        kept together in the order ``value_counts`` returns them.  The frame
        is empty (same three columns) when ``data`` has no object-dtype
        column.
    """
    result_columns = ['Name', 'Values', 'ValueCount']
    frames = []
    for column in data.columns:
        # Only NumPy object dtype matches this check; extension string dtypes
        # do not compare equal to ``object``.
        if data[column].dtype == object:
            counts = data[column].value_counts(dropna=False)
            # Build the table from the index/values directly so the result
            # does not depend on how reset_index() names its columns, which
            # differs between pandas versions.
            frames.append(pd.DataFrame({
                'Name': [column] * len(counts),
                'Values': counts.index.to_numpy(),
                'ValueCount': counts.values,
            }))

    if not frames:
        return pd.DataFrame(columns=result_columns)
    # Concatenate once at the end; DataFrame.append is gone in pandas 2.x and
    # appending inside the loop copies the accumulated frame every iteration.
    return pd.concat(frames, ignore_index=True)[result_columns]

In [3]:
def plot_boxsubplots(var, target, data, fig_size):
    """Draw one box plot of ``target`` per column in ``var`` on a 3-wide grid.

    Parameters
    ----------
    var : sequence
        Column labels of ``data``; each one is the x axis of its own subplot.
    target : label
        Column of ``data`` plotted on the y axis of every subplot.
    data : pandas.DataFrame
        Source of the plotted columns.
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.

    Notes
    -----
    Calls ``sns.set(font_scale=2)`` and ``plt.show()``; returns ``None``.
    Boxes are ordered by how often each x value occurs
    (``value_counts().index``).
    """
    plt.figure(figsize=fig_size)
    sns.set(font_scale=2)
    maxcol = 3
    # Ceiling division: just enough rows to hold len(var) plots.  Adding the
    # remainder instead (the previous formula) reserved blank rows, e.g.
    # 3 rows for 5 plots.
    total_rows = (len(var) + maxcol - 1) // maxcol
    for position, column in enumerate(var, start=1):
        plt.subplot(total_rows, maxcol, position)
        ax = sns.boxplot(
            x=data[column],
            y=data[target],
            data=data,
            order=data[column].value_counts().index,
        )
        ax.set_xlabel(column, color='black', fontweight='bold')
        ax.set_ylabel(target, color='black', fontweight='bold')
    plt.show()

In [4]:
def plot_distsubplots(var, data, fig_size):
    """Draw one distribution plot per column in ``var`` on a 3-wide grid.

    Parameters
    ----------
    var : sequence
        Column labels of ``data``; each one gets its own subplot.
    data : mapping or pandas.DataFrame
        Indexed as ``data[label]`` to obtain the values to plot.
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.

    Notes
    -----
    Calls ``sns.set(font_scale=2)`` and ``plt.show()``; returns ``None``.
    """
    plt.figure(figsize=fig_size)
    sns.set(font_scale=2)
    maxcol = 3
    # Ceiling division: just enough rows to hold len(var) plots.  Adding the
    # remainder instead (the previous formula) reserved blank rows, e.g.
    # 3 rows for 5 plots.
    total_rows = (len(var) + maxcol - 1) // maxcol
    for position, column in enumerate(var, start=1):
        plt.subplot(total_rows, maxcol, position)
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # consider moving to histplot/displot once the look is confirmed.
        ax = sns.distplot(x=data[column])
        ax.set_xlabel(column, color='black', fontweight='bold')
    plt.show()

In [5]:
def plot_countsubplots(var, data, fig_size):
    """Draw one count plot per column in ``var`` on a 3-wide grid.

    Parameters
    ----------
    var : sequence
        Column labels of ``data``; each one gets its own subplot.
    data : pandas.DataFrame
        Source of the plotted columns.
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.

    Notes
    -----
    Calls ``sns.set(font_scale=2)`` and ``plt.show()``; returns ``None``.
    Bars are ordered by how often each value occurs
    (``value_counts().index``).
    """
    plt.figure(figsize=fig_size)
    sns.set(font_scale=2)
    maxcol = 3
    # Ceiling division: just enough rows to hold len(var) plots.  Adding the
    # remainder instead (the previous formula) reserved blank rows, e.g.
    # 3 rows for 5 plots.
    total_rows = (len(var) + maxcol - 1) // maxcol
    for position, column in enumerate(var, start=1):
        plt.subplot(total_rows, maxcol, position)
        ax = sns.countplot(
            x=data[column],
            data=data,
            order=data[column].value_counts().index,
        )
        ax.set_xlabel(column, color='black', fontweight='bold')
    plt.show()

In [6]:
def plot_scattersubplots(var, target, data, fig_size):
    """Draw one scatter plot of ``target`` per column in ``var`` on a 3-wide grid.

    Parameters
    ----------
    var : sequence
        Column labels of ``data``; each one is the x axis of its own subplot.
    target : label
        Passed to seaborn as ``y`` and resolved against ``data``; also used
        as the y-axis label.
    data : mapping or pandas.DataFrame
        Indexed as ``data[label]`` for the x values and forwarded to seaborn
        as ``data``.
    fig_size : tuple
        Forwarded to ``plt.figure(figsize=...)``.

    Notes
    -----
    Calls ``sns.set(font_scale=2)`` and ``plt.show()``; returns ``None``.
    """
    plt.figure(figsize=fig_size)
    sns.set(font_scale=2)
    maxcol = 3
    # Ceiling division: just enough rows to hold len(var) plots.  Adding the
    # remainder instead (the previous formula) reserved blank rows, e.g.
    # 3 rows for 5 plots.
    total_rows = (len(var) + maxcol - 1) // maxcol
    for position, column in enumerate(var, start=1):
        plt.subplot(total_rows, maxcol, position)
        ax = sns.scatterplot(x=data[column], data=data, y=target)
        ax.set_xlabel(column, color='black', fontweight='bold')
        ax.set_ylabel(target, color='black', fontweight='bold')
    plt.show()

In [7]:
from sklearn import metrics
def get_confusion_matrix_values(y_true, y_pred):
    """Print and plot a binary confusion matrix and return its four cells.

    Parameters
    ----------
    y_true, y_pred : array-like
        Forwarded unchanged to ``metrics.confusion_matrix``.

    Returns
    -------
    tuple
        ``(TN, FP, FN, TP)``, i.e. the matrix cells ``[0][0]``, ``[0][1]``,
        ``[1][0]`` and ``[1][1]``.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (for example only one class is
        present, or there are more than two classes).  The cell labels used
        below are only meaningful for the binary case.
    """
    cm = metrics.confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        # Without this guard a larger matrix would silently return its
        # top-left corner as TN/FP/FN/TP, and a 1x1 matrix would fail with an
        # unrelated reshape error.
        raise ValueError(
            f"expected a 2x2 confusion matrix for binary classification, got shape {cm.shape}"
        )

    print('============================================Confusion Matrix============================================')
    print(cm)
    print('============================================Matrix visualization============================================')
    cmap = sns.cubehelix_palette(light=1, as_cmap=True)
    group_names = ['True Negative', 'False Positive', 'False Negative','True Positive']
    group_counts = ['{0:0.0f}'.format(value) for value in cm.flatten()]
    # One "name\ncount" annotation per cell, laid out like the matrix itself.
    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names, group_counts)]
    labels = np.asarray(labels).reshape(2, 2)
    print(labels)
    sns.set(font_scale=1)
    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm, annot=labels, cmap=cmap, fmt='')
    plt.show()

    tn, fp, fn, tp = cm.ravel()
    return (tn, fp, fn, tp)

def get_performance_metric(y_true, y_pred):
    """Print binary classification metrics and return them as a dict.

    The confusion-matrix cells come from ``get_confusion_matrix_values`` (which
    also prints and plots the matrix); the F1 score comes from
    ``metrics.f1_score``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Forwarded unchanged to ``get_confusion_matrix_values`` and
        ``metrics.f1_score``.

    Returns
    -------
    dict
        Keys ``'sensitivity'``, ``'specificity'``, ``'precision'``,
        ``'recall'`` and ``'f1_score'``.  A ratio whose denominator is zero
        (e.g. precision when nothing was predicted positive) is NaN.
    """
    TN, FP, FN, TP = get_confusion_matrix_values(y_true, y_pred)

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as NaN explicitly instead of relying
        # on a division-by-zero warning (or error) from the division itself.
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    sensitivity = _ratio(TP, TP + FN)
    specificity = _ratio(TN, TN + FP)
    precision = _ratio(TP, TP + FP)
    # Recall is the same quantity as sensitivity: TP / (TP + FN).
    recall = sensitivity
    f1_score_ = metrics.f1_score(y_true, y_pred)

    print('============================================Sensitivity/Specificity============================================')
    print('Sensitivity (True positive rate):', sensitivity)
    print('Specificity (True negative rate):', specificity)
    print('============================================Precision/Recall============================================')
    print('Precision:', precision)
    print('Recall:', recall)
    print('============================================F1 score============================================')
    print('f1_score:', f1_score_)

    return {
        'sensitivity': sensitivity,
        'specificity': specificity,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score_,
    }