# `1 Table formatting`

<ol>
    <li>Libraries and constants</li>
    <li>Multi index feature (label, description)</li>
    <li>Dtypes</li>
</ol>

## 1.1 Libraries and constants

In [384]:
# Libraries
import pandas as pd

In [385]:
# Constants

# Column metadata: indexed by feature label, with `dtype` and `description` columns.
COLUMNS_PATH                = "data/metadata/columns.csv"
COLUMNS                     = pd.read_csv(COLUMNS_PATH, index_col=0)

# Target dtype for each feature label.
# Bracket access is used on purpose: the column is literally named `dtype`, and
# attribute access (`COLUMNS.dtype`) is easy to confuse with pandas' own
# `.dtype` / `.dtypes` attributes.
DTYPES                      = COLUMNS["dtype"]

# Class metadata: indexed by class label, with `description` and `column_label` columns.
CLASSES_PATH                = "data/metadata/classes.csv"
CLASSES                     = pd.read_csv(CLASSES_PATH, index_col=0)

# Raw data file: space-separated values with no header row.
GERMAN_CREDIT_DATA_PATH     = "data/statlog+german+credit+data/german.data"
GERMAN_CREDIT_DATA          = pd.read_csv(GERMAN_CREDIT_DATA_PATH, sep=' ', header=None)

In [386]:
# Preview the column metadata (index: label; columns: dtype, description).
COLUMNS.head()

Unnamed: 0_level_0,dtype,description
label,Unnamed: 1_level_1,Unnamed: 2_level_1
A1,category,Status of existing checking account
A2,int64,Duration in month
A3,category,Credit history
A4,category,Purpose
A5,int64,Credit amount


In [387]:
# Preview the class metadata (index: class label; columns: description, column_label).
CLASSES.head()

Unnamed: 0_level_0,description,column_label
label,Unnamed: 1_level_1,Unnamed: 2_level_1
A11,... < 0 DM,A1
A12,0 <= ... < 200 DM,A1
A13,... >= 200 DM/salary assignments for at least ...,A1
A14,no checking account,A1
A30,no credits taken/all credits paid back duly,A3


In [388]:
# Preview the raw data as loaded: columns still carry the default integer positions.
GERMAN_CREDIT_DATA.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


## 1.2 Multi index feature (label, description)

In [389]:
# Give every column a two-level (label, description) header.
# Work on a copy so the raw GERMAN_CREDIT_DATA constant stays untouched.
german_credit_data = GERMAN_CREDIT_DATA.copy()

# Series.items() walks (index, value) pairs, i.e. (label, description) here.
german_credit_data.columns = pd.MultiIndex.from_tuples(
    list(COLUMNS["description"].items()),
    names=['label', 'description'],
)

In [390]:
# Check that the (label, description) column MultiIndex was applied.
german_credit_data.head()

label,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A12,A13,A14,A15,A16,A17,A18,A19,A20,T1
description,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors / guarantors,...,Property,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker,Credit quality
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


## 1.3 Dtypes

In [391]:
# Track dtypes evolution: snapshot the current per-column dtypes so they can be
# compared with the dtypes obtained after conversion.
dtypes_evolution = german_credit_data.dtypes

In [392]:
# Preview the dtype snapshot taken in the previous cell.
dtypes_evolution.head()

label  description                        
A1     Status of existing checking account    object
A2     Duration in month                       int64
A3     Credit history                         object
A4     Purpose                                object
A5     Credit amount                           int64
dtype: object

In [393]:
class MemoryUsageTracker:
    """Keep the two most recent memory-usage readings and their difference.

    Attributes:
        previous: The reading that was current before the latest `update`
            call, or None while `update` has never been called.
        actual: The most recent reading.
        diff: Read-only. `actual - previous`, or 0 while there is no
            previous reading.
    """

    def __init__(self, actual_memory_usage):
        """Start tracking from an initial reading.

        Args:
            actual_memory_usage: The first reading; stored as `actual`.
        """
        self.previous = None
        self.actual = actual_memory_usage

    def update(self, new_memory_usage):
        """Record a new reading.

        The former `actual` value becomes `previous`.

        Args:
            new_memory_usage: The reading to store as `actual`.
        """
        self.previous = self.actual
        self.actual = new_memory_usage

    @property
    def diff(self):
        """Return `actual - previous`, or 0 when there is no previous reading.

        Exposed as a property so `tracker.diff` keeps working as a plain
        value. It replaces a `diff` method that was shadowed by an instance
        attribute of the same name and therefore could never be called.
        """
        # Before the first update there is nothing to compare against.
        if self.previous is None:
            return 0
        return self.actual - self.previous

In [None]:
# Start tracking memory from the frame's current deep memory footprint
memory_tracker = MemoryUsageTracker(
    actual_memory_usage=german_credit_data.memory_usage(deep=True).sum()
)

In [395]:
# Cast each column to the dtype declared for its label in DTYPES
for column in german_credit_data.columns.get_level_values('label'):
    german_credit_data[column] = german_credit_data[column].astype(DTYPES.loc[column])

In [396]:
# Append dtypes evolution
# Re-running this cell must not stack a third column: when `dtypes_evolution`
# is already the two-column frame built here, reuse its 'initial' column
# instead of concatenating onto the whole frame.
initial_dtypes = (
    dtypes_evolution["initial"]
    if isinstance(dtypes_evolution, pd.DataFrame)
    else dtypes_evolution
)

# `keys` names the resulting columns directly, so no separate rename is needed.
dtypes_evolution = pd.concat(
    [initial_dtypes, german_credit_data.dtypes],
    axis=1,
    keys=["initial", "final"],
)

In [397]:
# Compare each column's 'initial' and 'final' dtype side by side.
dtypes_evolution.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,initial,final
label,description,Unnamed: 2_level_1,Unnamed: 3_level_1
A1,Status of existing checking account,object,category
A2,Duration in month,int64,int64
A3,Credit history,object,category
A4,Purpose,object,category
A5,Credit amount,int64,int64


In [398]:
# Feed the tracker the frame's deep memory footprint as it stands now
memory_tracker.update(
    new_memory_usage=german_credit_data.memory_usage(deep=True).sum()
)

In [None]:
# Report the previous reading, the current reading, and their difference
print(
    f"Previous usage:             \t{memory_tracker.previous} bytes",
    f"Current usage:              \t{memory_tracker.actual} bytes",
    f"Memory usage difference:    \t{memory_tracker.diff} bytes",
    sep="\n",
)

Previous usage:             	851144 bytes
Current usage:              	74978 bytes
Memory usage difference:    	-776166 bytes


## 1.?? Replace class labels by their descriptions

> This is doable, but it only serves visualization.<br>
> `CLASSES.description.to_dict()` gives the class-label → description mapping, which makes the replacement easy.

> It should therefore be avoided in the data itself!


In [400]:
# CATEGORICAL_COLUMNS = CLASSES.column_label.unique()
# CATEGORICAL_COLUMNS

In [401]:
# # Replace class labels by their descriptions

# ## Iterate over the columns
# for (column, _description) in german_credit_data.columns:
    
#     ### If the column is categorical
#     if column in CATEGORICAL_COLUMNS:
        
#         #### Then replace the class' label by its description 
#         german_credit_data[column] = german_credit_data[column].replace(
            
#             CLASSES.description
#                 .to_dict()
#         )
        

In [402]:
# german_credit_data.head()