In [9]:
def populate_rows_columns(dataframe):
    """
    input: dataframe

    output: None

    The function is to print the number of rows and the number of columns
    of the dataframe
    """
    # shape is (rows, columns)
    shape = dataframe.shape
    print(f'No. of rows are: {shape[0]}')
    print(f'No. of columns are: {shape[1]}')

def populate_min_max_mean(dataframe):
    """
    input: dataframe

    output: dataframe with one row per input column and the columns
            'column_names', 'min_value', 'max_value', 'mean_value'
            and 'median_value'

    The function is to populate min, max, mean and median for each column.
    Columns of dtype object get NaN for all four statistics. For any other
    column, a statistic pair (min/max or mean/median) that cannot be
    computed is reported as NaN instead of raising.
    """
    stat_columns = ['min_value', 'max_value', 'mean_value', 'median_value']
    result = {}

    for column in dataframe.columns:
        dataframe_column = dataframe[column]

        # Start from "not available"; only non-object columns are evaluated.
        min_value = max_value = np.nan
        mean_value = median_value = np.nan

        if not dataframe_column.dtype == 'object':
            # min and max are reported together: if either fails, both are NaN.
            # Exception (not a bare except) keeps the best-effort fallback
            # without swallowing KeyboardInterrupt / SystemExit.
            try:
                min_value, max_value = dataframe_column.min(), dataframe_column.max()
            except Exception:
                min_value = max_value = np.nan

            # Same rule for mean and median.
            try:
                mean_value, median_value = dataframe_column.mean(), dataframe_column.median()
            except Exception:
                mean_value = median_value = np.nan

        result[column] = [min_value, max_value, mean_value, median_value]

    # convert dictionary into Dataframe (one row per column of the input)
    df = pd.DataFrame.from_dict(result, orient='index', columns=stat_columns)
    # reset index and rename 'index' to column names
    df = df.reset_index()
    df.rename(columns={'index': 'column_names'}, inplace=True)
    return df


def populate_unique_count(dataframe):
    """
    input: dataframe

    output: dataframe with the columns 'column_names' and 'unique_count'

    The function is to populate the number of unique values for each column,
    counted as the length of what Series.unique() returns
    """
    # map each column name to the number of distinct values it holds
    counts_by_column = {
        name: len(dataframe[name].unique()) for name in dataframe.columns
    }

    # one row per input column, keyed by the column name
    summary = pd.DataFrame.from_dict(
        counts_by_column, orient='index', columns=['unique_count']
    )
    # move the column names out of the index into 'column_names'
    return summary.reset_index().rename(columns={'index': 'column_names'})

    
def populate_datatypes(dataframe):
    """
    input: dataframe

    output: dataframe with the columns 'column_names' and 'data_type'

    The function is to populate the data type of each column
    """
    # look up the dtype column by column, keyed by the column name
    dtype_by_column = {
        name: dataframe[name].dtypes for name in dataframe.columns
    }

    # one row per input column
    summary = pd.DataFrame.from_dict(
        dtype_by_column, orient='index', columns=['data_type']
    )
    # move the column names out of the index into 'column_names'
    return summary.reset_index().rename(columns={'index': 'column_names'})


def populate_null_values(dataframe):
    """
    input: dataframe

    output: dataframe with the columns 'column_names', 'null_values',
            'total_rows' and '%_null_values'

    The function is to populate the following null values information
    for each column:
    1. number of rows with null values
    2. number of total rows
    3. share of rows with null values, stored in '%_null_values' as a
       fraction between 0 and 1 (it is not multiplied by 100)

    When the dataframe has no rows, the share is reported as NaN because
    it is undefined (this avoids a division by zero).
    """
    result = {}
    # number of null values per column, in column order
    null_counts = dataframe.isna().sum()
    # number of total rows
    total_rows = dataframe.shape[0]

    for column, null_count in zip(dataframe.columns, null_counts):
        # the share is undefined without rows; report NaN instead of dividing by 0
        null_share = null_count / total_rows if total_rows else float('nan')
        result[column] = [null_count, total_rows, null_share]

    # convert the dictionary into dataframe
    df = pd.DataFrame.from_dict(
        result, orient='index', columns=['null_values', 'total_rows', '%_null_values']
    )
    # reset index and rename 'index' to column names
    df = df.reset_index()
    df.rename(columns={'index': 'column_names'}, inplace=True)

    return df


def populate_outlier(dataframe):
    """
    input: dataframe

    output: dataframe with the columns 'column_names', 'num_lower_outliers'
            and 'num_higher_outliers'

    The function is to populate the following outlier information for each
    column by using Tukey rule:
    1. 'num_lower_outliers': number of non-null values below the lower bound
    2. 'num_higher_outliers': number of non-null values above the upper bound

    * Find the first quartile (ie .25 quantile)
    * Find the third quartile (ie .75 quantile)
    * Calculate the inter-quartile range (Q3 - Q1)
    * Any value that is greater than Q3 + 1.5 * IQR is an outlier
    * Any value that is less than Q1 - 1.5 * IQR is an outlier

    Columns for which these steps fail (for example text columns) get NaN
    for both counts.
    """
    result = {}

    for col in dataframe.columns:
        try:
            # only the column itself is needed; drop its null values
            values = dataframe[col].dropna()

            # first quartile (0.25) and third quartile (0.75)
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            # inter-quartile range (Q3 - Q1)
            iqr = q3 - q1
            # Tukey bounds: anything outside [lower_bound, upper_bound] is an outlier
            upper_bound = q3 + 1.5 * iqr
            lower_bound = q1 - 1.5 * iqr

            # number of values identified as outliers on each side
            min_value_outlier = int((values < lower_bound).sum())
            max_value_outlier = int((values > upper_bound).sum())

        # Exception (not a bare except) keeps the best-effort NaN fallback
        # without swallowing KeyboardInterrupt / SystemExit.
        except Exception:
            min_value_outlier = max_value_outlier = np.nan

        result[col] = [min_value_outlier, max_value_outlier]

    # convert dictionary into dataframe
    df = pd.DataFrame.from_dict(
        result, orient='index', columns=['num_lower_outliers', 'num_higher_outliers']
    )
    # reset index and rename 'index' to column names
    df = df.reset_index()
    df.rename(columns={'index': 'column_names'}, inplace=True)

    return df


def check_duplication(dataframe):
    """
    input: dataframe

    output: None

    The function is to print whether any row of the dataframe is a duplicate
    of another row, comparing all columns
    """
    # True as soon as one row is flagged as a duplicate
    has_duplicates = dataframe.duplicated().any()

    message = "Duplication found" if has_duplicates else 'No duplication found'
    print(message)
        


In [2]:
def main_data_profiling(dataframe):
    """
    input: dataframe

    output: dataframe

    The function is to merge the results of the following checks side by
    side into a single dataframe
    * data type check
    * number of unique values
    * min, max, mean and median
    * number of null values
    * number of lower and higher outliers
    """
    def drop_name_column(report):
        # every report starts with 'column_names'; keep only what follows it
        return report.iloc[:, 1:]

    # the data type report keeps its 'column_names' column for the merged result
    types_part = populate_datatypes(dataframe)
    unique_part = drop_name_column(populate_unique_count(dataframe))
    null_part = drop_name_column(populate_null_values(dataframe))
    stats_part = drop_name_column(populate_min_max_mean(dataframe))
    outlier_part = drop_name_column(populate_outlier(dataframe))

    # merge results of all tests into a dataframe, column-wise
    ordered_parts = [types_part, unique_part, stats_part, null_part, outlier_part]
    return pd.concat(ordered_parts, axis=1)
    
    

In [10]:
def boxplot_outliers(dataframe, outliers_columns):
    """
    input: dataframe: a dataframe
             outliers_columns: a list of columns to be checked with outliers

    output: None; boxplots are drawn and shown

    The function is to show one boxplot per column in outliers_columns,
    laid out on a grid with 4 boxplots per row. Null values of a column are
    removed before it is plotted. Nothing is drawn when outliers_columns
    is empty.
    """
    # number of columns to be checked with outliers
    n_outliers = len(outliers_columns)
    if n_outliers == 0:
        # a grid with zero rows cannot be created, so there is nothing to show
        return

    # cols is number of boxplot for each row
    cols = 4
    # rows: number of rows of boxplots, rounded up
    # for example, 29 boxplots need 29 / 4 = 7.25 rows, so rows is set as 8
    rows = (n_outliers + cols - 1) // cols

    # set subplots to create multiple boxplot
    # figsize set for size of boxplots
    _, axes = plt.subplots(rows, cols, figsize=(30, 30))
    # flatten once so the k-th column is drawn on the k-th axes of the grid
    flat_axes = axes.flatten()

    # loop each column to create each boxplot
    for index, value in enumerate(outliers_columns):
        # remove NaN value from each column
        df_outlier = dataframe[dataframe[value].notna()]
        df_outlier.boxplot(column=[value], ax=flat_axes[index])
    # show all boxplots
    plt.show()