In [None]:
def presentation(df):
    """Give some details of the dataframe.

    Displays the first three rows, then prints the percentage of missing
    cells over the whole frame and the frame's shape.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Notes
    -----
    ``display`` is not defined in this block; presumably it is the helper
    that IPython / Jupyter injects into the session.
    """
    display(df.head(3))

    # NA ratio: number of missing cells over the total number of cells.
    na_count = int(df.isna().sum().sum())
    total_cells = df.size

    # An empty frame has no cells: report 0 % instead of dividing by zero.
    na_pct = na_count / total_cells * 100 if total_cells else 0.0

    print(f'NA ratio : {na_pct:.2f} %')
    print('dF shape :', df.shape)

In [None]:
def plot_na(df, col_ylim=(35, 47)):
    """Display 2 graphs : rows / columns kept by ``dropna`` per threshold.

    For 101 evenly spaced thresholds (0 % to 100 % of the opposite
    dimension), the left plot shows how many rows and the right plot how
    many columns would survive ``df.dropna(thresh=...)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. It is not modified.
    col_ylim : tuple of (int, int) or None, optional
        ``(bottom, top)`` limits of the y axis of the column plot; one tick
        is drawn per integer in that range. The default ``(35, 47)`` keeps
        the range that used to be hard-coded. Pass ``None`` to let
        matplotlib autoscale the axis.

    Notes
    -----
    The x values are ``dropna`` thresholds, i.e. the minimum number of
    non-NA values an entry needs in order to be kept, even though the axis
    labels read 'NA per rows' / 'NA per columns'.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
    fig.suptitle('NA Drop', size=14, weight='bold')

    n_rows, n_cols = len(df), len(df.columns)

    # ``dropna(thresh=t)`` keeps the entries holding at least ``t`` non-NA
    # values, so counting the non-NA values once per axis replaces one full
    # ``dropna`` copy per threshold.
    non_na_per_row = df.count(axis=1)
    non_na_per_col = df.count(axis=0)

    # AX1: rows kept for each threshold (0..100 % of the column count)
    x1 = [int(pct * n_cols / 100) for pct in range(101)]
    y1 = [int((non_na_per_row >= thresh).sum()) for thresh in x1]

    # AX2: columns kept for each threshold (0..100 % of the row count)
    x2 = [int(pct * n_rows / 100) for pct in range(101)]
    y2 = [int((non_na_per_col >= thresh).sum()) for thresh in x2]

    # Plots 1 and 2
    ax1.plot(x1, y1, color='b', linestyle='dashed')
    ax2.plot(x2, y2, color='red', linestyle='dashed')

    # Set labels x and y for AX1
    ax1.set_xlabel('NA per rows')
    ax1.set_ylabel('total rows')

    # Set labels x and y for AX2
    ax2.set_xlabel('NA per columns')
    ax2.set_ylabel('total columns')

    # Set yticks and y limit for AX2 (skipped when autoscaling is requested)
    if col_ylim is not None:
        bottom, top = col_ylim
        ax2.set_yticks(np.arange(bottom, top + 1, step=1))
        ax2.set_ylim(bottom=bottom, top=top)

    plt.show()

In [None]:
def drop_col(df, na_ratio, drop_or_not):
    """Report, and optionally drop in place, the columns with too many NA.

    A column is dropped when its number of NA values exceeds
    ``na_ratio * len(df)``. The shape the frame would have after the drop
    is always printed. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is modified in place when ``drop_or_not`` is
        truthy.
    na_ratio : float
        Maximum fraction of NA values (0..1) a column may hold to be kept.
    drop_or_not : bool
        When truthy, drop the columns from ``df`` and print the NA
        percentage of every dropped column, in original column order.
    """
    n_rows = len(df)

    # ``dropna`` wants the minimum number of non-NA values needed to keep
    # a column, i.e. the complement of the allowed number of NA values.
    max_na = na_ratio * n_rows
    min_non_na = n_rows - na_ratio * n_rows
    df_shape = df.dropna(axis=1, thresh=min_non_na).shape

    # Print the NA limit itself (not the non-NA threshold) so the message
    # matches the rule that is actually applied.
    print('If NA per col >', int(max_na), ' | New dF shape', df_shape)

    if not drop_or_not:
        return

    # Snapshot only what the report needs before mutating ``df``;
    # no full copy of the frame is required.
    original_columns = list(df.columns)
    na_counts = df.isna().sum()

    # Drop column when NA > threshold
    df.dropna(axis=1, thresh=min_non_na, inplace=True)

    # List the dropped columns in their original order
    kept = set(df.columns)
    dropped = [col for col in original_columns if col not in kept]

    print('Percentage of NA per column drop :')
    for col in dropped:
        print(f'{col} : {na_counts.loc[col] / n_rows * 100:.2F} %')

In [None]:
def drop_row(df, na_ratio, drop_or_not):
    """Report, and optionally drop in place, the rows with too many NA.

    A row is dropped when its number of NA values exceeds
    ``na_ratio * len(df.columns)``. The shape the frame would have after
    the drop is always printed. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is modified in place when ``drop_or_not`` is
        truthy.
    na_ratio : float
        Maximum fraction of NA values (0..1) a row may hold to be kept.
    drop_or_not : bool
        When truthy, drop the rows from ``df`` and print the list of the
        dropped index labels, in original index order.
    """
    n_cols = len(df.columns)

    # ``dropna`` wants the minimum number of non-NA values needed to keep
    # a row, i.e. the complement of the allowed number of NA values.
    max_na = na_ratio * n_cols
    min_non_na = n_cols - na_ratio * n_cols
    df_shape = df.dropna(axis=0, thresh=min_non_na).shape

    # Print the NA limit itself (not the non-NA threshold) so the message
    # matches the rule that is actually applied.
    print('If NA per row >', int(max_na), ' | New dF shape', df_shape)

    if not drop_or_not:
        return

    # Keep a handle on the index as it was before the in-place drop
    original_index = df.index

    # Drop row when NA > threshold
    df.dropna(axis=0, thresh=min_non_na, inplace=True)

    # Dropped labels, listed in their original order
    kept = set(df.index)
    print([label for label in original_index if label not in kept])

In [None]:
def bar_column_num(ser):
    """Display 2 graphs : histogram (top) and horizontal box plot (bottom).

    The NA values of ``ser`` are left out of both plots. A text box with
    n, mean, std, min, median and max is written on the histogram, and the
    series name is used in the title.
    """
    values = ser.dropna()

    # Figure split in 4 rows: histogram on the first 3, box plot on the last
    fig = plt.figure(figsize=(20, 7), constrained_layout=True)
    grid = fig.add_gridspec(4, 1)
    hist_ax = fig.add_subplot(grid[0:3, :])
    box_ax = fig.add_subplot(grid[-1:, :])

    # The box plot carries neither ticks nor tick labels
    box_ax.tick_params(
        axis='both',
        which='both',
        bottom=False,
        left=False,
        labelleft=False,
        labelbottom=False,
    )

    # Outliers drawn as red diamonds
    flier_style = {'markerfacecolor': 'red', 'marker': 'D'}

    hist_ax.hist(values, align='mid')
    box_ax.boxplot(values, vert=False, flierprops=flier_style)

    hist_ax.set_title(
        f'Graph representation of the feature: {ser.name}',
        size=20,
        weight='bold',
    )

    # Statistics shown in the annotation box; the blank and whitespace-only
    # entries below reproduce the layout of the text exactly.
    n_obs = ser.count()
    mean = ser.mean(axis=0, skipna=True)
    std = ser.std(axis=0, skipna=True)
    minimum = ser.min(axis=0, skipna=True)
    median = ser.median(axis=0, skipna=True)
    maximum = ser.max(axis=0, skipna=True)

    stat_lines = [
        '',
        f'    n : {n_obs}',
        '',
        f'    Mean : {mean:.2f}',
        f'    Std : {std:.2f}',
        '  ',
        f'    Min : {minimum:.2f}',
        f'    Median : {median:.2f}',
        f'    Max : {maximum:.2f}',
    ]
    stat_txt = '\n'.join(stat_lines)

    hist_ax.text(
        0.9,
        0.6,
        stat_txt,
        transform=hist_ax.transAxes,
        bbox={'facecolor': 'white', 'alpha': 1},
        fontsize=14,
    )

In [None]:
def bar_column_qual(df, ser):
    """Display a histogram of a qualitative feature, values on the y axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Passed to ``sns.histplot`` as ``data``.
    ser : pandas.Series
        Feature to plot; its NA values are left out. Its name is used in
        the title and its non-NA count is written in a box on the plot.
    """
    # Draw on an explicit axes instead of relying on the implicit current one
    _, ax = plt.subplots(figsize=(7, 7))
    sns.histplot(data=df, y=ser.dropna(), ax=ax)
    ax.set_title(f'Graph representation of the feature: {ser.name}',
                 size=20,
                 weight='bold')

    # n number in the plot series
    ax.text(1, 0.9, ser.count(), transform=ax.transAxes,
            bbox=dict(facecolor='white', alpha=1),
            fontsize=14)