# `1 Table formatting`

<ol start=0>
    <li>Libraries and constants</li>
    <li>Multi index feature (label, description)</li>
    <li>Dtypes</li>
</ol>

## 1.0 Libraries and constants

In [1]:
# Libraries
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype

# Project utils
from utils.data_utils import map_column_inplace
from utils.memory import MemoryUsageTracker

In [2]:
# Constants

# Column metadata, indexed by the first CSV column; provides the `dtype` and `description` columns used below
COLUMNS_PATH = "data/metadata/columns.csv"
COLUMNS = pd.read_csv(COLUMNS_PATH, index_col=0)

# Target dtype per column, looked up later by column label
DTYPES = COLUMNS["dtype"]

# Class metadata, indexed by the first CSV column
CLASSES_PATH = "data/metadata/classes.csv"
CLASSES = pd.read_csv(CLASSES_PATH, index_col=0)

# Raw dataset: space-separated values with no header row
GERMAN_CREDIT_DATA_PATH = "data/statlog+german+credit+data/german.data"
GERMAN_CREDIT_DATA = pd.read_csv(GERMAN_CREDIT_DATA_PATH, sep=" ", header=None)

## 1.1 Multi index feature (label, description)

In [3]:
# Apply (label, description) multi-index
# Work on a copy so the raw GERMAN_CREDIT_DATA frame stays untouched
german_credit_data = GERMAN_CREDIT_DATA.copy()

# One (label, description) pair per metadata row, in metadata order
header_pairs = list(zip(COLUMNS.index, COLUMNS["description"]))
german_credit_data.columns = pd.MultiIndex.from_tuples(header_pairs, names=['label', 'description'])

## 1.2 Dtypes

In [4]:
# Track dtypes evolution
# Seed the history directly from the current dtypes instead of concatenating onto an
# empty DataFrame. The unnamed Series becomes a single column labelled 0, and the
# (label, description) index is carried over as-is. Later snapshots are appended
# column-wise with pd.concat.
dtypes_evolution = german_credit_data.dtypes.to_frame()

In [5]:
# Preview the first rows of the dtype history (a single snapshot at this point)
dtypes_evolution.head()

Unnamed: 0,Unnamed: 1,0
A1,Status of existing checking account,object
A2,Duration in month,int64
A3,Credit history,object
A4,Purpose,object
A5,Credit amount,int64


In [6]:
# Track memory usage
# Baseline reading: total bytes of the table (deep=True) taken before the dtype conversion cell runs
initial_memory_bytes = german_credit_data.memory_usage(deep=True).sum()
memory_tracker = MemoryUsageTracker(initial_memory_bytes)

In [7]:
# Convert dtypes
#
# Three passes over german_credit_data, all mutating it in place:
#   1. map the coded boolean columns to True/False,
#   2. cast every column except the ordinal one to the dtype listed in DTYPES,
#   3. cast the ordinal column to an ordered categorical.
#
# NOTE(review): because the frame is mutated in place, whether this cell can be re-run
# safely depends on how map_column_inplace treats values that are no longer raw codes.
# Confirm, or re-run from the cell that creates german_credit_data.

## 0 Columns needing special handling ----------------------------------------------------------------------------
# Defined once, before first use, so the generic cast loop and the ordinal cast agree on which column to skip.
ORDINAL_COLUMN = ('A8', 'Installment rate in percentage of disposable income')

## 1 Map boolean columns -----------------------------------------------------------------------------------------

### 1.1 Arrange the mappings
# (label, description) column key -> {raw code: boolean value}
column_map = {
    # A19 Telephone:       A191 (none) => False,  A192 (yes) => True
    ('A19', 'Telephone'): {'A191': False, 'A192': True},
    # A20 Foreign worker:  A201 (yes)  => True,   A202 (no)  => False
    ('A20', 'Foreign worker'): {'A201': True, 'A202': False},
    # T1 Is good credit:   1 (good)    => True,   2 (bad)    => False
    ('T1', 'Is good credit'): {1: True, 2: False},
}

### 1.2 Map the columns in place
for (column, boolean_map) in column_map.items():
    map_column_inplace(
        df          = german_credit_data,
        column      = column,
        value_map   = boolean_map
    )

## 2 Convert dtypes ----------------------------------------------------------------------------------------------
for (label, _description) in german_credit_data.columns:
    if label == ORDINAL_COLUMN[0]:
        continue  # Skip ordinal column for now; it gets a custom ordered dtype in step 3
    # Address the column by its full (label, description) key so a single Series is
    # selected and assigned, the same way the ordinal column is handled below.
    column_key = (label, _description)
    german_credit_data[column_key] = german_credit_data[column_key].astype(DTYPES[label])

## 3 Finally convert the ordinal column ---------------------------------------------------------------------------

### 3.1 Define the order of the categories
installment_categories = [1, 2, 3, 4]

### 3.2 Create the custom ordinal data type
ordinal_dtype = CategoricalDtype(categories=installment_categories, ordered=True)

### 3.3 Apply the new data type to the column
german_credit_data[ORDINAL_COLUMN] = german_credit_data[ORDINAL_COLUMN].astype(ordinal_dtype)

In [8]:
# Append dtypes evolution
# Snapshot the dtypes after conversion and place them as a new column beside the earlier snapshot
converted_dtypes = german_credit_data.dtypes
dtypes_evolution = pd.concat([dtypes_evolution, converted_dtypes], axis='columns')

In [9]:
# Preview the dtype history again: the earlier snapshot next to the post-conversion one
dtypes_evolution.head()

Unnamed: 0,Unnamed: 1,0,0.1
A1,Status of existing checking account,object,category
A2,Duration in month,int64,int64
A3,Credit history,object,category
A4,Purpose,object,category
A5,Credit amount,int64,float64


In [10]:
# Update memory usage tracker with the footprint of the converted table
memory_tracker.update(german_credit_data.memory_usage(deep=True).sum())

# Read the tracker state once so the summary below is built from plain values
previous_bytes = memory_tracker.previous
latest_bytes = memory_tracker.latest
diff_bytes = memory_tracker.diff

# The relative change is only computed when the previous reading is truthy (avoids dividing by zero)
if previous_bytes:
    diff_percent = (diff_bytes / previous_bytes) * 100
else:
    diff_percent = None

# Visualize memory usage as a one-row DataFrame
memory_usage_df = pd.DataFrame({
    'Previous usage (bytes)': [previous_bytes],
    'Current usage (bytes)': [latest_bytes],
    'Difference (bytes)': [diff_bytes],
    'Difference (%)': [diff_percent]
})

In [11]:
# Show the memory usage summary (previous vs. current bytes, absolute and relative difference)
memory_usage_df.head()

Unnamed: 0,Previous usage (bytes),Current usage (bytes),Difference (bytes),Difference (%)
0,747144,67782,-679362,-90.927853


# TEST

In [12]:
# Import
from tests.table_formatting_test import TableFormattingTest

In [13]:
# Run the project's column-statistics check, passing the raw table and the formatted table
test = TableFormattingTest()
test.test_column_statistics(original=GERMAN_CREDIT_DATA, processed=german_credit_data)

All column statistics are unchanged after processing.


# END

In [14]:
# Final German Credit Data: preview the first rows after boolean mapping and dtype conversion
german_credit_data.head()

label,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A12,A13,A14,A15,A16,A17,A18,A19,A20,T1
description,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker,Is good credit
0,A11,6,A34,A43,1169.0,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,True,True,True
1,A12,48,A32,A43,5951.0,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,False,True,False
2,A14,12,A34,A46,2096.0,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,False,True,True
3,A11,42,A32,A42,7882.0,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,False,True,True
4,A11,24,A33,A40,4870.0,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,False,True,False


In [15]:
# Save Formatted Table
FORMATTED_TABLE_PATH = 'data/processed-data/1-formatted-table.csv'

# Create the output directory if it is missing so a fresh checkout can run the notebook
# end to end; to_csv does not create parent directories itself.
Path(FORMATTED_TABLE_PATH).parent.mkdir(parents=True, exist_ok=True)

# The row index is omitted; the (label, description) column header is still written.
german_credit_data.to_csv(FORMATTED_TABLE_PATH, index=False)