# Real-world dataset example

In this example we demonstrate ...

In [2]:
%load_ext autoreload
%autoreload 2

import random
import datetime

import numpy as np
import pandas as pd
from visions import StandardSet

from compressario import Compress, storage_size, StorageSize, savings, savings_report

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
from typing import Union
from pint import UnitRegistry, Quantity


# Module-level pint unit registry; StorageSize resolves its unit names against it.
ureg = UnitRegistry()

class StorageSize:
    """A storage amount tagged with one of a fixed set of units.

    The amount lives in ``total`` as the product of the given number and the
    matching unit taken from the module-level ``ureg`` registry. Units are
    chosen through the short, case-insensitive keys of ``type_options``.
    """

    # Short key accepted from callers -> unit name looked up on ``ureg``.
    type_options = {"b": "byte", "kb": "kilobyte", "mb": "megabyte", "gb": "gigabyte"}

    def __init__(self, total: Union[int, float], units="mb") -> None:
        """Store ``total`` expressed in ``units``.

        Args:
            total: Numeric amount of storage.
            units: Key of ``type_options`` (any letter case).

        Raises:
            KeyError: If ``units`` is not a key of ``type_options``.
        """
        unit_key = units.lower()
        self._unit_str = self.type_options[unit_key]
        # The unit name has to be set first: ``_pint_unit`` reads it back.
        self.total: Quantity = total * self._pint_unit

    @property
    def units(self) -> str:
        """Full name of the current unit, e.g. ``"megabyte"``."""
        return self._unit_str

    @units.setter
    def units(self, val: str) -> None:
        """Switch to another unit and convert ``total`` to it.

        Raises:
            ValueError: If ``val`` is not a key of ``type_options``.
        """
        unit_key = val.lower()
        if unit_key not in self.type_options:
            raise ValueError(f"Units must be one of {list(self.type_options.keys())}")
        self._unit_str = self.type_options[unit_key]
        # Re-express the stored amount in the unit that was just selected.
        self.total = self.convert_to_units(self.total)

    @property
    def _pint_unit(self) -> Quantity:
        """Attribute of ``ureg`` named after the current unit.

        NOTE(review): annotated as ``Quantity``; a registry attribute is
        presumably a pint unit object rather than a quantity - confirm.
        """
        unit_name = self.units
        return getattr(ureg, unit_name)

    def convert_to_units(self, value) -> float:
        """Return ``value.to(<current unit name>)``.

        NOTE(review): annotated as ``float``, but when ``value`` is the stored
        ``total`` the result presumably is a pint quantity - confirm.
        """
        target_unit = self.units
        return value.to(target_unit)

    def __str__(self) -> str:
        """Default formatting of ``total``."""
        return f"{self.total}"

    def __repr__(self) -> str:
        """Same text as ``str(self)``."""
        return str(self)
    
# Build a 1000-unit size using the "b" (byte) key.
size = StorageSize(1000, 'b')
# `total` is the product of the number and a unit object, not a bare number, so
# this comparison against a plain int evaluates to False; compare `.magnitude`
# instead when only the numeric part matters.
size.total == 1000


False

In [61]:
size.total.magnitude == 1000

True

In [5]:
# `import urllib` alone does not load the `urllib.request` submodule, so the
# attribute access below would only work if another package had imported it.
import urllib.request

# Gzipped data file and the accompanying ".names" description file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.data.gz"
header = 'https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.names'

# The context manager closes the HTTP response as soon as it has been read.
with urllib.request.urlopen(header) as response:
    header_lines = response.readlines()

# Last expression of the cell: display the raw (bytes) lines of the description.
header_lines
# The data file itself is not loaded yet: pd.read_csv(url)

[b'| This data was extracted from the census bureau database found at\n',
 b'| http://www.census.gov/ftp/pub/DES/www/welcome.html\n',
 b'| Donor: Terran Lane and Ronny Kohavi\n',
 b'|        Data Mining and Visualization\n',
 b'|        Silicon Graphics.\n',
 b'|        e-mail: terran@ecn.purdue.edu, ronnyk@sgi.com for questions.\n',
 b'|\n',
 b'| The data was split into train/test in approximately 2/3, 1/3\n',
 b"| proportions using MineSet's MIndUtil mineset-to-mlc.\n",
 b'|\n',
 b'| Prediction task is to determine the income level for the person\n',
 b'| represented by the record.  Incomes have been binned at the $50K\n',
 b'| level to present a binary classification problem, much like the\n',
 b'| original UCI/ADULT database.  The goal field of this data, however,\n',
 b'| was drawn from the "total person income" field rather than the\n',
 b'| "adjusted gross income" and may, therefore, behave differently than the\n',
 b'| orginal ADULT goal field.\n',
 b'|\n',
 b'| More informatio

Choose and load the dataset

In [None]:
# CSV file to analyse; a bare file name, so it is resolved against the
# current working directory.
file_name = r"gekentekende_voertuigen.1000000.csv"

# Load the dataset into a DataFrame using pandas' default parsing options.
df = pd.read_csv(file_name)

In [None]:
# Create the compressor with its default settings (no arguments are passed).
compress = Compress()

In [None]:
# Measure the untouched DataFrame first so there is a baseline for comparison.
original_size = storage_size(df, deep=True)
# NOTE(review): `StorageSize` is imported from compressario and also redefined
# in an earlier cell of this notebook; which one runs here depends on cell
# execution order - confirm the unit handling matches what storage_size returns.
print(f'Original DataFrame size: {StorageSize(original_size, units="MB")}')

In [None]:
# Run the compressor on the DataFrame; the result is kept under a separate name
# so it can be compared with the original `df` in the following cells.
df_compressed = compress.it(df)

In [None]:
# Compare original and compressed frames; the returned value is shown as the
# cell output. Presumably this is the memory saved - verify against compressario.
savings(df, df_compressed, deep=True)

In [None]:
# Same comparison via the report helper; presumably a more detailed summary of
# the savings - verify against compressario.
savings_report(df, df_compressed, deep=True)

Memory reduction of around 90%, nice. But why stop there? We can go further. Up until now we have leveraged `visions` to detect dtypes and then compress accordingly. However, `visions` enables inference and coercion of types as well. It's possible to design relations between types that coerce automatically, for instance integers stored as floats (`[1.0, 2.0, 3.0]` => `[1, 2, 3]`).

In [None]:
# Memory usage after compression, for comparison with the original frame.
# Assumes `compress.it` returned a pandas DataFrame - TODO confirm.
df_compressed.memory_usage(deep=True)

In [None]:
# Per-column memory usage of the original frame; deep=True also counts the
# contents of object-dtype columns instead of only their references.
df.memory_usage(deep=True)