### Single set: train

In [1]:
from numpy import number
from pandas import set_option, reset_option, option_context

from contest.data import load
from contest.exploration.tools import describe, transform, sample
from contest.exploration.descriptions.types import single_describe_params as types_describe_params
from contest.exploration.descriptions.objects import transform_transformers as objects_transform_transformers
from contest.exploration.descriptions.numbers import transform_transformers as numbers_transform_transformers
from contest.exploration.descriptions.nulls import single_describe_params as nulls_describe_params
from contest.exploration.descriptions.uniques import single_describe_params as uniques_describe_params
from contest.exploration.descriptions.sample import transform_transformers as sample_transform_transformers

#### Load

In [2]:
# Key of the data set explored in this notebook; the sample section reuses it.
data_key = 'train'

# Load the selected set. `print_info=True` presumably triggers the summary
# printed under this cell -- confirm against `contest.data.load`.
data_set = load(data_keys=data_key, print_info=True)

train: train.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int6

#### Types

In [3]:
# Describe the loaded set with the preset parameters imported for the
# "types" description.
types_description = describe(data=data_set, **types_describe_params)

In [4]:
# Optional: uncomment to lift the row limit so the whole table is shown.
# set_option('display.max_rows', len(types_description))

# Bare last expression: rendered as the cell output.
types_description

                                                detect nulls
shape  rows                                                     1460
       columns                                                    80
                                                False             61
                                                True              19
type   <class 'object'>                                           43
                                                False             27
                                                True              16
       <class 'numpy.number'>                                     37
                                                False             34
                                                True               3
                               <class 'int'>                      34
                                                False             34
                                                True               0
                               <class 'flo

In [5]:
# reset_option('display.*')

##### Objects

In [6]:
# Restrict the summary to object-typed columns.
data_type = object

# `DataFrame.describe()` of the selected columns, post-processed by the
# transformers imported for the objects description.
objects_description = transform(
    data=data_set.select_dtypes(include=data_type).describe(),
    transformers=objects_transform_transformers,
)

In [7]:
# Optional: uncomment to size the display limits to the table so that no
# row or column is truncated.
# set_option(
#     'display.max_rows', objects_description.shape[0],
#     'display.max_columns', objects_description.shape[1],
# )

# Bare last expression: rendered as the cell output.
objects_description

Unnamed: 0_level_0,Unnamed: 1_level_0,count,unique,top,freq
Unnamed: 0_level_1,variable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Street,1460,2,Pave,1454
2,Utilities,1460,2,AllPub,1459
3,CentralAir,1460,2,Y,1365
4,Alley,91,2,Grvl,50
5,LandSlope,1460,3,Gtl,1382
6,PavedDrive,1460,3,Y,1340
7,GarageFinish,1379,3,Unf,605
8,PoolQC,7,3,Gd,3
9,LotShape,1460,4,Reg,925
10,LandContour,1460,4,Lvl,1311


In [8]:
# reset_option('display.*')

##### Numbers

In [9]:
# Restrict the summary to numeric columns (`numpy.number` and its subtypes).
data_type = number

# `DataFrame.describe()` of the selected columns, post-processed by the
# transformers imported for the numbers description.
numbers_description = transform(
    data=data_set.select_dtypes(include=data_type).describe(),
    transformers=numbers_transform_transformers,
)

In [10]:
# Optional: uncomment to size the display limits to the table so that no
# row or column is truncated.
# set_option(
#     'display.max_rows', numbers_description.shape[0],
#     'display.max_columns', numbers_description.shape[1],
# )

# Bare last expression: rendered as the cell output.
numbers_description

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0_level_1,variable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
2,LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
3,LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
4,OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
5,OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
6,YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
7,YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
8,MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
9,BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0
10,BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.0,0.0,0.0,1474.0


In [11]:
# reset_option('display.*')

#### Nulls 

In [12]:
# Describe the loaded set with the preset parameters imported for the
# "nulls" description.
nulls_description = describe(data=data_set, **nulls_describe_params)

In [13]:
# Optional: uncomment to size the display limits to the table so that no
# row or column is truncated.
# set_option(
#     'display.max_rows', nulls_description.shape[0],
#     'display.max_columns', nulls_description.shape[1],
# )

# Bare last expression: rendered as the cell output.
nulls_description

Unnamed: 0_level_0,Unnamed: 1_level_0,type,counts,counts,counts,portion,portion
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,null,notnull,null,notnull
Unnamed: 0_level_2,variable,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,Electrical,object,1460,1,1459,0.000685,0.999315
2,MasVnrArea,float64,1460,8,1452,0.005479,0.994521
3,MasVnrType,object,1460,8,1452,0.005479,0.994521
4,BsmtQual,object,1460,37,1423,0.025342,0.974658
5,BsmtCond,object,1460,37,1423,0.025342,0.974658
6,BsmtFinType1,object,1460,37,1423,0.025342,0.974658
7,BsmtExposure,object,1460,38,1422,0.026027,0.973973
8,BsmtFinType2,object,1460,38,1422,0.026027,0.973973
9,GarageYrBlt,float64,1460,81,1379,0.055479,0.944521
10,GarageType,object,1460,81,1379,0.055479,0.944521


In [14]:
# reset_option('display.*')

#### Uniques

In [15]:
# Describe the loaded set with the preset parameters imported for the
# "uniques" description.
uniques_description = describe(data=data_set, **uniques_describe_params)

In [16]:
# Show every row of the uniques table.
#
# The row limit is raised only for the duration of this cell via
# `option_context`, so no global pandas display option stays modified if the
# notebook is run partially or out of order.
# To also lift the column limit, add the pair
#     'display.max_columns', uniques_description.shape[1]
# to the `option_context` arguments.
with option_context('display.max_rows', uniques_description.shape[0]):
    # A bare last expression would be rendered only after the `with` block
    # has exited (i.e. with the default limits restored), so the frame is
    # rendered explicitly while the override is active. `display` is the
    # function the IPython kernel makes available in notebooks.
    display(uniques_description)

Unnamed: 0_level_0,Unnamed: 1_level_0,type,counts,counts,counts,unique,unique,unique,unique
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,non-null,null,counts,counts,ratio,ratio
Unnamed: 0_level_2,include nulls,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,True,False,True,False
Unnamed: 0_level_3,variable,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
1,Street,object,1460,1460,0,2,2,0.00137,0.00137
2,Utilities,object,1460,1460,0,2,2,0.00137,0.00137
3,CentralAir,object,1460,1460,0,2,2,0.00137,0.00137
4,Alley,object,1460,91,1369,3,2,0.002055,0.021978
5,BsmtHalfBath,int64,1460,1460,0,3,3,0.002055,0.002055
6,HalfBath,int64,1460,1460,0,3,3,0.002055,0.002055
7,LandSlope,object,1460,1460,0,3,3,0.002055,0.002055
8,PavedDrive,object,1460,1460,0,3,3,0.002055,0.002055
9,GarageFinish,object,1460,1379,81,4,3,0.00274,0.002175
10,PoolQC,object,1460,7,1453,4,3,0.00274,0.428571


In [17]:
# Reset every pandas option matching `display.*` to its default value.
reset_option('display.*')

#### Sample

In [18]:
# Three sample sizes handed to `sample` as one tuple. Judging by the ids in
# the rendered table they look like head / random / tail counts -- confirm
# against `contest.exploration.tools.sample`.
n = (3, 6, 3)

# Load the set again under the same key, draw the sample and post-process it
# with the transformers imported for the sample description.
data_sample = transform(
    data=sample(data=load(data_keys=data_key), n=n),
    transformers=sample_transform_transformers,
)

In [19]:
# Show every row of the sample table.
#
# The row limit is raised only for the duration of this cell via
# `option_context`, so no global pandas display option stays modified if the
# notebook is run partially or out of order.
# To also lift the column limit, add the pair
#     'display.max_columns', data_sample.shape[1]
# to the `option_context` arguments.
with option_context('display.max_rows', data_sample.shape[0]):
    # A bare last expression would be rendered only after the `with` block
    # has exited (i.e. with the default limits restored), so the frame is
    # rendered explicitly while the override is active. `display` is the
    # function the IPython kernel makes available in notebooks.
    display(data_sample)

Unnamed: 0_level_0,id,1,2,3,130,166,883,953,972,1239,1458,1459,1460
Unnamed: 0_level_1,variable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,MSSubClass,60,20,60,20,190,60,85,160,20,70,20,20
2,MSZoning,RL,RL,RL,RL,RL,RL,RL,RL,RL,RL,RL,RL
3,LotFrontage,65.0,80.0,68.0,69.0,62.0,,60.0,36.0,63.0,66.0,68.0,75.0
4,LotArea,8450,9600,11250,8973,10106,9636,7200,2268,13072,9042,9717,9937
5,Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
6,Alley,,,,,,,,,,,,
7,LotShape,Reg,Reg,IR1,Reg,Reg,IR1,Reg,Reg,Reg,Reg,Reg,Reg
8,LandContour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
9,Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
10,LotConfig,Inside,FR2,Inside,Inside,Inside,Corner,Inside,Inside,Inside,Inside,Inside,Inside


In [20]:
# Reset every pandas option matching `display.*` to its default value.
reset_option('display.*')