### Test

In [1]:
from pandas import (
    concat,
    set_option, reset_option,
)

from contest.data import load
from contest.exploration.summary import describe
from contest.exploration.summary.types.params import descriptors as types_descriptors
from contest.exploration.summary.nulls.params import descriptors as nulls_descriptors
from contest.exploration.summary.uniques.params import descriptors as uniques_descriptors

#### Load

In [2]:
# Identifier handed to load() as `data_keys`.
data_key = 'test'

# The author left a `print_info=True` argument disabled here
# (presumably it makes load() print diagnostics -- unverified).
data_set = load(data_keys=data_key)

#### Types

In [3]:
# Summarise the loaded data with the descriptors imported from
# contest.exploration.summary.types.params.
data_descr = describe(data=data_set, descriptors=types_descriptors)

In [4]:
# Render the type summary returned by describe() in the previous cell.
data_descr

rows       1459
columns      79
object       43
number       36
int          25
float        11
null       True
dtype: object

#### Nulls

In [5]:
# Post-processing callables handed to describe() through `transformers`
# (presumably applied to the summary frame in list order -- confirm in describe()).
nulls_transformers = [
    # keep only the rows whose null count is non-zero
    lambda frame: frame[frame['null counts'] != 0],
    # order by null count, then by type
    lambda frame: frame.sort_values(by=['null counts', 'type']),
    # move the index into a regular column called 'variable'
    lambda frame: frame.reset_index().rename(columns={'index': 'variable'}),
    # shift the fresh index so that row numbering starts at 1
    lambda frame: frame.set_index(frame.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=nulls_descriptors,
    transformers=nulls_transformers,
)

In [6]:
# Optional (currently disabled): raise the pandas display limits to the size of
# the summary so that it is rendered without truncation. When enabling this,
# also enable the matching reset_option call in the following cell.
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

# Render the null summary built in the previous cell.
data_descr

Unnamed: 0,variable,null counts,null portion,type
1,BsmtFinSF1,1,0.000685,float64
2,BsmtFinSF2,1,0.000685,float64
3,BsmtUnfSF,1,0.000685,float64
4,TotalBsmtSF,1,0.000685,float64
5,GarageCars,1,0.000685,float64
6,GarageArea,1,0.000685,float64
7,Exterior1st,1,0.000685,object
8,Exterior2nd,1,0.000685,object
9,KitchenQual,1,0.000685,object
10,SaleType,1,0.000685,object


In [7]:
# Disabled counterpart of the commented-out set_option call in the previous
# cell: would restore the pandas options matching 'display.*' to their defaults.
# reset_option('display.*')

#### Uniques

In [8]:
# Post-processing callables handed to describe() through `transformers`
# (presumably applied to the summary frame in list order -- confirm in describe()).
uniques_transformers = [
    # order by unique count, then by type, then by the null flag
    lambda frame: frame.sort_values(by=['unique counts', 'type', 'null']),
    # move the index into a regular column called 'variable'
    lambda frame: frame.reset_index().rename(columns={'index': 'variable'}),
    # shift the fresh index so that row numbering starts at 1
    lambda frame: frame.set_index(frame.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=uniques_descriptors,
    transformers=uniques_transformers,
)

In [9]:
# Allow as many displayed rows as the summary has, so it is not truncated.
# The column limit is left at its current setting.
set_option('display.max_rows', len(data_descr))

data_descr

Unnamed: 0,variable,unique counts,unique portion,null,type
1,Utilities,1,0.000685,True,object
2,Street,2,0.001371,False,object
3,CentralAir,2,0.001371,False,object
4,Alley,2,0.001371,True,object
5,PoolQC,2,0.001371,True,object
6,HalfBath,3,0.002056,False,int64
7,KitchenAbvGr,3,0.002056,False,int64
8,BsmtHalfBath,3,0.002056,True,float64
9,LandSlope,3,0.002056,False,object
10,PavedDrive,3,0.002056,False,object


In [10]:
# Restore the pandas options matching 'display.*' to their defaults, undoing
# the 'display.max_rows' override made in the previous cell.
reset_option('display.*')

#### Objects

In [11]:
# dtype selector for this section.
data_type = object

data_subset = data_set.select_dtypes(data_type)

# describe() statistics, one row per variable, ordered by descending
# 'count' and then descending 'unique'.
data_descr = (
    data_subset
    .describe()
    .T
    .sort_values(by=['count', 'unique'], ascending=[False, False])
)

# Expose the variable names as the first column and number the rows from 1.
data_descr.insert(loc=0, column='variable', value=data_descr.index)
data_descr.index = range(1, len(data_descr) + 1)

In [12]:
# Optional (currently disabled): raise the pandas display limits to the size of
# the summary so that it is rendered without truncation.
# NOTE: the disabled call used to reference `data_view`, a name that no visible
# cell defines; it now points at `data_descr`, the frame displayed below.
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

# Render the object-dtype summary built in the previous cell.
data_descr

Unnamed: 0,variable,count,unique,top,freq
1,Neighborhood,1459,25,NAmes,218
2,Condition1,1459,9,Norm,1251
3,HouseStyle,1459,7,1Story,745
4,RoofStyle,1459,6,Gable,1169
5,Foundation,1459,6,PConc,661
6,SaleCondition,1459,6,Normal,1204
7,LotConfig,1459,5,Inside,1081
8,Condition2,1459,5,Norm,1444
9,BldgType,1459,5,1Fam,1205
10,ExterCond,1459,5,TA,1256


In [13]:
# Disabled counterpart of the commented-out set_option call in the previous
# cell: would restore the pandas options matching 'display.*' to their defaults.
# reset_option('display.*')

#### Integers

In [14]:
# dtype selector for this section.
data_type = int

data_subset = data_set.select_dtypes(data_type)

# describe() statistics, one row per variable, ordered by descending 'count'.
data_descr = (
    data_subset
    .describe()
    .T
    .sort_values(by='count', ascending=False)
)

# Expose the variable names as the first column and number the rows from 1.
data_descr.insert(loc=0, column='variable', value=data_descr.index)
data_descr.index = range(1, len(data_descr) + 1)

In [15]:
# Optional (currently disabled): raise the pandas display limits to the size of
# the summary so that it is rendered without truncation.
# NOTE: the disabled call used to reference `data_view`, a name that no visible
# cell defines; it now points at `data_descr`, the frame displayed below.
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

# Render the integer summary built in the previous cell.
data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,MSSubClass,1459.0,57.378341,42.74688,20.0,20.0,50.0,70.0,190.0
2,KitchenAbvGr,1459.0,1.042495,0.208472,0.0,1.0,1.0,1.0,2.0
3,MoSold,1459.0,6.104181,2.722432,1.0,4.0,6.0,8.0,12.0
4,MiscVal,1459.0,58.167923,630.806978,0.0,0.0,0.0,0.0,17000.0
5,PoolArea,1459.0,1.744345,30.491646,0.0,0.0,0.0,0.0,800.0
6,ScreenPorch,1459.0,17.064428,56.609763,0.0,0.0,0.0,0.0,576.0
7,3SsnPorch,1459.0,1.79438,20.207842,0.0,0.0,0.0,0.0,360.0
8,EnclosedPorch,1459.0,24.243317,67.227765,0.0,0.0,0.0,0.0,1012.0
9,OpenPorchSF,1459.0,48.313914,68.883364,0.0,0.0,28.0,72.0,742.0
10,WoodDeckSF,1459.0,93.174777,127.744882,0.0,0.0,0.0,168.0,1424.0


In [16]:
# Disabled counterpart of the commented-out set_option call in the previous
# cell: would restore the pandas options matching 'display.*' to their defaults.
# reset_option('display.*')

#### Floats

In [17]:
# dtype selector for this section.
data_type = float

data_subset = data_set.select_dtypes(data_type)

# describe() statistics, one row per variable, ordered by descending 'count'.
data_descr = (
    data_subset
    .describe()
    .T
    .sort_values(by='count', ascending=False)
)

# Expose the variable names as the first column and number the rows from 1.
data_descr.insert(loc=0, column='variable', value=data_descr.index)
data_descr.index = range(1, len(data_descr) + 1)

In [18]:
# Optional (currently disabled): raise the pandas display limits to the size of
# the summary so that it is rendered without truncation.
# NOTE: the disabled call used to reference `data_view`, a name that no visible
# cell defines; it now points at `data_descr`, the frame displayed below.
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

# Render the float summary built in the previous cell.
data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,BsmtFinSF1,1458.0,439.203704,455.268042,0.0,0.0,350.5,753.5,4010.0
2,BsmtFinSF2,1458.0,52.619342,176.753926,0.0,0.0,0.0,0.0,1526.0
3,BsmtUnfSF,1458.0,554.294925,437.260486,0.0,219.25,460.0,797.75,2140.0
4,TotalBsmtSF,1458.0,1046.11797,442.898624,0.0,784.0,988.0,1305.0,5095.0
5,GarageCars,1458.0,1.766118,0.775945,0.0,1.0,2.0,2.0,5.0
6,GarageArea,1458.0,472.768861,217.048611,0.0,318.0,480.0,576.0,1488.0
7,BsmtFullBath,1457.0,0.434454,0.530648,0.0,0.0,0.0,1.0,3.0
8,BsmtHalfBath,1457.0,0.065202,0.252468,0.0,0.0,0.0,0.0,2.0
9,MasVnrArea,1444.0,100.709141,177.6259,0.0,0.0,0.0,164.0,1290.0
10,GarageYrBlt,1381.0,1977.721217,26.431175,1895.0,1959.0,1979.0,2002.0,2207.0


In [19]:
# Disabled counterpart of the commented-out set_option call in the previous
# cell: would restore the pandas options matching 'display.*' to their defaults.
# reset_option('display.*')

#### Sample

In [20]:
# Preview sizes: (leading rows, randomly drawn rows, trailing rows).
n = 3, 4, 3

data_subset = concat(
    [
        # first n[0] rows
        data_set.head(n[0]),
        # Random draw restricted to the rows between the head and tail slices,
        # so a row that is already shown cannot be drawn again (a repeated row
        # would produce duplicate column labels after the transpose below).
        # The fixed random_state makes Restart & Run All show the same rows.
        data_set.iloc[n[0]:len(data_set) - n[2]]
        .sample(n[1], random_state=0)
        .sort_index(),
        # last n[2] rows
        data_set.tail(n[2]),
    ]
)

# One row per variable, one column per previewed record. rename_axis returns a
# new frame instead of assigning to `.index.name`, which would rename an Index
# object that may be shared with data_subset / data_set.
data_descr = data_subset.transpose().rename_axis('variable', axis='index')

In [21]:
# Allow as many displayed rows as the transposed preview has, so it is not
# truncated. The column limit is left at its current setting.
set_option('display.max_rows', len(data_descr))

data_descr

Id,1461,1462,1463,2136,2161,2434,2827,2917,2918,2919
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MSSubClass,20,20,60,190,20,20,50,20,85,60
MSZoning,RH,RL,RL,RM,RL,RL,RL,RL,RL,RL
LotFrontage,80.0,81.0,74.0,60.0,65.0,70.0,75.0,160.0,62.0,74.0
LotArea,11622,14267,13830,10320,9345,7903,9525,20000,10441,9627
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,Grvl,,,,,,
LotShape,Reg,IR1,IR1,Reg,IR1,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
LotConfig,Inside,Corner,Inside,Inside,Inside,Inside,Inside,Inside,Inside,Inside


In [22]:
# Restore the pandas options matching 'display.*' to their defaults, undoing
# the 'display.max_rows' override made in the previous cell.
reset_option('display.*')