# Exploratory Data Analysis
## Datasets Summary

In [27]:
# import numpy as np
import pandas as pd

#### Options

In [92]:
# Display every row of a frame instead of truncating long outputs.
pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

# Render floats with the general ('g') format specifier.
pd.set_option('display.float_format', '{:g}'.format)

#### Tools

In [135]:
from typing import (
    Callable,
    Any, Union,
)
from pandas import (
    Series, DataFrame,
    MultiIndex,
    concat,
)
from itertools import product


def describe_1(
    data_frames: Union[dict[str, DataFrame], DataFrame],
    data_descriptors: dict[str, Callable[[DataFrame], Any]],
) -> Union[DataFrame, Series]:
    """Summarise one frame, or several keyed frames, with named descriptors.

    Each descriptor is called with a frame and its result is stored under
    the descriptor's key.

    Args:
        data_frames: A single frame, or a dict mapping a key to a frame.
        data_descriptors: Mapping of descriptor key to a callable that takes
            a frame and returns its description.

    Returns:
        For a dict: a DataFrame with one column per data key and one row per
        descriptor key. For a single frame: a Series indexed by descriptor
        key. For any other input type: None.
    """
    def _apply_all(frame: DataFrame) -> dict[str, Any]:
        # Evaluate every descriptor on one frame, keeping descriptor order.
        return {name: func(frame) for name, func in data_descriptors.items()}

    if isinstance(data_frames, dict):
        return DataFrame(
            {key: _apply_all(frame) for key, frame in data_frames.items()}
        )

    if isinstance(data_frames, DataFrame):
        return Series(_apply_all(data_frames))

    # Unsupported input type: nothing to describe.
    return None


def describe_2(
    data_frames: Union[dict[str, DataFrame], DataFrame],
    data_descriptors: dict[str, Callable[[DataFrame], Series]],
) -> DataFrame:
    """Describe frames with descriptors that each return a Series.

    Args:
        data_frames: A single frame, or a dict mapping a key to a frame.
        data_descriptors: Mapping of descriptor key to a callable that takes
            a frame and returns a Series.

    Returns:
        For a single frame: a DataFrame with one column per descriptor key.
        For a dict: the per-frame descriptions placed side by side, with
        columns labelled by (data key, descriptor key) tuples. For any other
        input type: None.
    """
    if isinstance(data_frames, DataFrame):
        return DataFrame(
            {name: func(data_frames) for name, func in data_descriptors.items()}
        )

    if not isinstance(data_frames, dict):
        return None

    per_frame = []
    for frame_key, frame in data_frames.items():
        # Tuple keys make pandas build two-level column labels.
        labelled = {}
        for name, func in data_descriptors.items():
            labelled[(frame_key, name)] = func(frame)
        per_frame.append(DataFrame(labelled))

    # Align the per-frame descriptions on their index, side by side.
    return concat(per_frame, axis=1)


def describe_2_(
    data_frames: dict[str, DataFrame],
    data_descriptors: dict[str, Callable[[DataFrame], Series]],
) -> DataFrame:
    """Describe keyed frames, grouping the columns by descriptor.

    Calls describe_2, reverses each column label so the descriptor key comes
    first, and reorders the columns so that all data keys of one descriptor
    sit next to each other.

    Args:
        data_frames: Mapping of data key to frame.
        data_descriptors: Mapping of descriptor key to a callable that takes
            a frame and returns a Series.

    Returns:
        A DataFrame whose columns are (descriptor key, data key) labels,
        ordered by descriptor first and data key second.
    """
    described = describe_2(data_frames, data_descriptors)

    # Reverse every column label: (data key, descriptor) -> (descriptor, data key).
    flipped_labels = [label[::-1] for label in described.columns]
    described.columns = MultiIndex.from_tuples(flipped_labels)

    # Descriptor-major ordering: descriptors vary slowest, data keys fastest.
    ordered_labels = [
        (descriptor_key, frame_key)
        for descriptor_key in data_descriptors
        for frame_key in data_frames
    ]
    return described[ordered_labels]


def transform_1(
    data_frame: DataFrame,
    data_transformers: list[Callable[[DataFrame], DataFrame]]
) -> DataFrame:
    """Run a frame through a sequence of transformers, in order.

    Args:
        data_frame: The starting frame.
        data_transformers: Callables applied one after another; each one
            receives the result of the previous one.

    Returns:
        The result of the last transformer, or ``data_frame`` itself when
        the sequence is empty.
    """
    current = data_frame
    for apply_step in data_transformers:
        # Feed the output of the previous step into the next one.
        current = apply_step(current)
    return current

### Load

In [197]:
# Relative paths of the input CSV files, keyed by data set name.
data_file_paths = {
    'train': '../input/train.csv',
    'test': '../input/test.csv',
    'submission': '../input/sample_submission.csv',
}

# Keyword arguments passed to every read_csv call below.
data_load_params = {
    # 'sep': ',',
    # 'header': 0,
    'index_col': 'Id',
    # 'na_values': 'NA',
    # 'keep_default_na': True,
}

# Load each file into a frame stored under the same key as its path.
data_sets = {
    name: pd.read_csv(path, **data_load_params)
    for name, path in data_file_paths.items()
}

### Info

In [234]:
# for data_key, data_set in data_sets.items():
#     print(f'\n{data_key}:\n')
#     data_set.info()

### Counts

#### Types

In [236]:
# data_descriptors = {
#     'rows': lambda _: _.shape[0],
#     'columns': lambda _: _.shape[1],
#     'object': lambda _: _.select_dtypes(object).shape[1],
#     # 'number': lambda _: _.select_dtypes(np.number).shape[1],
#     'number': lambda _: _._get_numeric_data().shape[1],
#     'int': lambda _: _.select_dtypes(int).shape[1],
#     'float': lambda _: _.select_dtypes(float).shape[1],
#     'null': lambda _: _.isnull().values.any(),
#     # 'notnull': lambda _: _.notnull().values.all(),
# }

# data_description = describe_1(
#     data_sets,
#     data_descriptors,
# )

# data_description

#### Nulls

In [238]:
# data_keys = ['train', 'test']

# data_descriptors = {
#     # 'total counts': lambda _: _.shape[0],
#     'null counts': lambda _: _.isnull().sum(),
#     'null portion': lambda _: _.isnull().sum() / _.shape[0],
#     # 'notnull counts': lambda _: _.notnull().sum(),
#     # 'notnull counts': lambda _: _.count(),
#     # 'notnull portion': lambda _: _.count() / _.shape[0],
#     'type': lambda _: _.dtypes,
# }

# data_transformers = [
#     lambda _: _.dropna(),
#     lambda _: _[
#         (_['null counts'] != 0).any(axis=1)
#     ],
#     lambda _: _.sort_values(
#         by=[
#             ('null counts', 'train'),
#             ('null counts', 'test'),
#         ],
#         # ascending=[False, False],
#         ascending=False,
#     ),
#     lambda _: _.reset_index().rename(
#         columns={'index': 'variable'}
#     ),
# ]

# data_description = describe_2_(
#     {
#         data_key: data_sets[data_key]
#         for data_key in data_keys
#     },
#     data_descriptors,
# )

# data_description = transform_1(
#     data_description,
#     data_transformers
# )

# data_description

#### Uniques

In [242]:
# data_keys = ['train', 'test']

# data_descriptors = {
#     # 'total counts': lambda _: _.shape[0],
#     # 'unique counts': lambda _: _.nunique(dropna=True),
#     'unique counts': lambda _: _.nunique(
#         # dropna=True,
#     ),
#     'unique portion': lambda _: _.nunique() / _.shape[0],
#     'null': lambda _: _.isnull().any(),
#     'type': lambda _: _.dtypes,
# }

# data_transformers = [
#     lambda _: _.dropna(),
#     lambda _: _.sort_values(
#         by=[
#             ('unique counts', 'train'),
#             ('unique counts', 'test'),
#         ],
#         # ascending=[False, False],
#         ascending=False,
#     ),
#     lambda _: _.reset_index().rename(
#         columns={'index': 'variable'}
#     ),
# ]

# data_description = describe_2_(
#     {
#         data_key: data_sets[data_key]
#         for data_key in data_keys
#     },
#     data_descriptors,
# )

# data_description = transform_1(
#     data_description,
#     data_transformers,
# )

# data_description

### Train

In [253]:
# data_key = 'train'
# data_set = data_sets[data_key]

#### Variables

In [245]:
# data_descriptors = {
#     'type': lambda _: _.dtypes,
#     'unique': lambda _: _.nunique(
#         # dropna=True,
#     ),
#     'null': lambda _: _.isnull().any(),
# }

# data_transformers = [
#     lambda _: _.sort_values(
#         by=[
#             'unique',
#             'null',
#         ],
#         # ascending=[True, True],
#         # ascending=True,
#     ),
#     lambda _: _.reset_index().rename(
#         columns={'index': 'variable'}
#     ),
# ]

# data_description = describe_2(
#     data_set,
#     data_descriptors,
# )

# data_description = transform_1(
#     data_description,
#     data_transformers,
# )

# data_description

#### Sample

In [195]:
# n = 5

# # data_sample = data_set.head(n).transpose()
# # data_sample = data_set.tail(n).transpose()
# data_sample = data_set.sample(n).sort_index().transpose()

# data_sample

#### Describe

In [252]:
# data_type = object
# data_subset = data_set.select_dtypes(data_type)
# data_descr = data_subset.describe().transpose().sort_values(
#     ['count', 'unique'],
#     # ascending=[True, True],
# )

# data_descr

In [251]:
# data_type = int
# data_subset = data_set.select_dtypes(data_type)
# data_descr = data_subset.describe().transpose().sort_values(
#     'count',
#     # ascending=True,
# )

# data_descr

In [250]:
# data_type = float
# data_subset = data_set.select_dtypes(data_type)
# data_descr = data_subset.describe().transpose().sort_values(
#     'count',
#     # ascending=True,
# )

# data_descr

In [290]:
def describe_3(
    data_frames: Union[dict[str, DataFrame], DataFrame],
    data_descriptor: Callable[[DataFrame], DataFrame],
) -> DataFrame:
    """Describe one frame, or several keyed frames, with a single descriptor.

    Args:
        data_frames: A single frame, or a dict mapping a key to a frame.
        data_descriptor: Callable that takes a frame and returns its
            description as a DataFrame.

    Returns:
        For a single frame: the descriptor's result for that frame.
        For a dict: the per-frame descriptions placed side by side, with
        columns labelled by (data key, description column) tuples.
        For any other input type: None.
    """
    if isinstance(data_frames, DataFrame):
        # A single frame needs no data-key level: return its description
        # directly, matching how describe_1/describe_2 accept a bare frame.
        return data_descriptor(data_frames)

    if not isinstance(data_frames, dict):
        # Unsupported input type: nothing to describe.
        return None

    labelled_descriptions = []
    for data_key, data_frame in data_frames.items():
        description = data_descriptor(data_frame)
        # Tuple keys make pandas build two-level column labels, so columns
        # with the same name in different data sets stay distinguishable.
        labelled_descriptions.append(
            DataFrame(
                {
                    (data_key, description_column): description[description_column]
                    for description_column in description
                }
            )
        )

    # Align the per-frame descriptions on their index, side by side.
    return concat(labelled_descriptions, axis=1)

In [291]:
# data_transformers = [
#     lambda _: _.select_dtypes(float),
# ]

# data_description = [
    
# ]

In [295]:
# Describe the numeric columns of every loaded data set in one table.
# NOTE(review): _get_numeric_data is a private pandas method; a public
# replacement such as select_dtypes('number') may treat bool columns
# differently, so confirm before swapping it.
data_description = describe_3(
    data_sets,
    lambda _: _._get_numeric_data().describe(),
)

# Last expression of the cell: displays the resulting table.
data_description

Unnamed: 0_level_0,train,train,train,train,train,train,train,train,train,train,...,test,test,test,test,test,test,test,test,test,submission
Unnamed: 0_level_1,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,56.8973,70.05,10516.8,6.09932,5.57534,1971.27,1984.87,103.685,443.64,46.5493,...,93.1748,48.3139,24.2433,1.79438,17.0644,1.74435,58.1679,6.10418,2007.77,179184.0
std,42.3006,24.2848,9981.26,1.383,1.1128,30.2029,20.6454,181.066,456.098,161.319,...,127.745,68.8834,67.2278,20.2078,56.6098,30.4916,630.807,2.72243,1.30174,16518.3
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,135751.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,168703.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,179209.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,186789.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0,281644.0
