Loading env variables:

In [None]:
import os
import sys
from dotenv import load_dotenv

# Load variables from the .env file; override=True lets them replace values
# that are already present in the process environment.
load_dotenv(override=True)
print('ENV variables loaded successfully!')

# Put the parent directory on the import path (once) so that the `lib`
# package imported by later cells can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

Loading the pre-processed data set:

In [None]:
import pandas as pd
import numpy as np
from lib.env_var_keys import EnvVarKeys
from lib.dataframe_helper import fill_nan

# Resolve the pre-processed data set path from the environment and fail early
# with a clear message instead of handing None to read_csv.
pre_processed_env_key = EnvVarKeys.PRE_PROCESSED_DATASET_PATH_KEY.value
pre_processed_dataset_path = os.getenv(pre_processed_env_key)
if not pre_processed_dataset_path:
    raise ValueError(
        f'Environment variable {pre_processed_env_key!r} is not set; '
        'cannot locate the pre-processed data set.'
    )

df = pd.read_csv(pre_processed_dataset_path, sep=',', low_memory=False)
# `columns=` already names the axis, so the redundant `axis=1` is omitted.
df = df.drop(columns=['empty_count', 'empty_columns'])
# Project helper; presumably replaces missing values -- confirm in
# lib.dataframe_helper.
df = fill_nan(df)

print(f'Pre-processed data set shape: {df.shape}')

In [None]:
# Summary statistics for the loaded frame; include='all' asks pandas to
# describe every column rather than only the numeric ones.
df.describe(include='all')

In [None]:
def _format_mean_std(series):
    '''Format a numeric series as ``mean (std)`` with one decimal place.'''
    return f'{series.mean():.1f} ({series.std():.1f})'


def _format_share(count, group_size):
    '''Format a count as ``count/group_size (percentage)``.'''
    return f'{count}/{group_size} ({(count / group_size) * 100:.1f})'


def get_distribution(df, target_col, numeric_columns=None):
    '''
    Build a descriptive table of every column, overall and per target class.

    Columns listed in ``numeric_columns`` produce a single row formatted as
    ``mean (std)``. Every other column produces one row per distinct value
    (sorted ascending), formatted as ``count/group_size (percentage)``; a
    class in which the value never occurs gets ``-``.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to summarise.
    target_col : str
        Column whose distinct values define the classes to compare.
    numeric_columns : list of str, optional
        Columns summarised by mean and standard deviation. Columns not
        listed are treated as categorical. Defaults to no numeric columns.

    Returns
    -------
    pandas.DataFrame
        An ``Attributes`` column, a ``Total`` column, and one
        ``<target_col>:<class>`` column per class found in ``target_col``.
    '''
    # None instead of a shared mutable list as the default value.
    if numeric_columns is None:
        numeric_columns = []

    total = df.shape[0]
    feature_cols = [col for col in df.columns if col != target_col]

    # value_counts() lists the classes by descending frequency and leaves
    # out rows whose target is missing.
    classes = df[target_col].value_counts().index.to_list()
    class_frames = {cls: df[df[target_col] == cls] for cls in classes}
    class_labels = {cls: f'{target_col}:{cls}' for cls in classes}

    data = {'Attributes': [], 'Total': []}
    for label in class_labels.values():
        data[label] = []

    for col in feature_cols:
        if col in numeric_columns:
            data['Attributes'].append(col)
            data['Total'].append(_format_mean_std(df[col]))
            for cls, frame in class_frames.items():
                data[class_labels[cls]].append(_format_mean_std(frame[col]))
            continue

        overall_counts = df[col].value_counts()
        # Counted once per column rather than once per (value, class) pair.
        counts_by_class = {
            cls: frame[col].value_counts()
            for cls, frame in class_frames.items()
        }

        for value in sorted(overall_counts.index.to_list()):
            data['Attributes'].append(f'{col}:{value}')
            data['Total'].append(
                _format_share(overall_counts.loc[value], total)
            )

            for cls, counts in counts_by_class.items():
                if value in counts.index:
                    group_size = class_frames[cls].shape[0]
                    cell = _format_share(counts.loc[value], group_size)
                else:
                    # The value does not occur in this class at all.
                    cell = '-'
                data[class_labels[cls]].append(cell)

    return pd.DataFrame(data)
  

# Summarise every column against the 'mc_cri_vdrl' target, with 'idade' as
# the only column handled as numeric (mean and standard deviation).
df_distribution = get_distribution(
    df,
    target_col='mc_cri_vdrl',
    numeric_columns=['idade'],
)

# Bare expression so the notebook cell displays the resulting table.
df_distribution

In [None]:
distribution_env_key = EnvVarKeys.DISTRIBUTION_DATASET_PATH_KEY.value
dataset_path = os.getenv(distribution_env_key)
# to_csv(None) returns the CSV text instead of writing a file, so an unset
# variable would otherwise be reported below as a successful save.
if not dataset_path:
    raise ValueError(
        f'Environment variable {distribution_env_key!r} is not set; '
        'cannot save the distribution data set.'
    )

df_distribution.to_csv(dataset_path, sep=',', index=False)

print(f'Distribution data set saved to: {dataset_path}')