### Train

In [1]:
from pandas import (
    concat,
    set_option, reset_option,
)

from contest.data import load
from contest.exploration.summary import describe
from contest.exploration.summary.types.params import descriptors as types_descriptors
from contest.exploration.summary.nulls.params import descriptors as nulls_descriptors
from contest.exploration.summary.uniques.params import descriptors as uniques_descriptors

#### Load

In [2]:
data_key = 'train'

data_set = load(
    data_keys=data_key,
    # print_info=True,
)

#### Types

In [3]:
data_descr = describe(
    data=data_set,
    descriptors=types_descriptors,
)

In [4]:
data_descr

rows       1460
columns      80
object       43
number       37
int          34
float         3
null       True
dtype: object

#### Nulls

In [5]:
nulls_transformers = [
    lambda _: _[_['null counts'] != 0],
    lambda _: _.sort_values(
        by=['null counts', 'type']
    ),
    lambda _: _.reset_index().rename(
        columns={'index': 'variable'}
    ),
    lambda _: _.set_index(_.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=nulls_descriptors,
    transformers=nulls_transformers,
)

In [6]:
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

data_descr

Unnamed: 0,variable,null counts,null portion,type
1,Electrical,1,0.000685,object
2,MasVnrArea,8,0.005479,float64
3,MasVnrType,8,0.005479,object
4,BsmtQual,37,0.025342,object
5,BsmtCond,37,0.025342,object
6,BsmtFinType1,37,0.025342,object
7,BsmtExposure,38,0.026027,object
8,BsmtFinType2,38,0.026027,object
9,GarageYrBlt,81,0.055479,float64
10,GarageType,81,0.055479,object


In [7]:
# reset_option('display.*')

#### Uniques

In [8]:
uniques_transformers = [
    lambda _: _.sort_values(
        by=['unique counts', 'type', 'null']
    ),
    lambda _: _.reset_index().rename(
        columns={'index': 'variable'}
    ),
    lambda _: _.set_index(_.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=uniques_descriptors,
    transformers=uniques_transformers,
)

In [9]:
set_option(
    'display.max_rows', data_descr.shape[0],
    # 'display.max_columns', data_descr.shape[1],
)

data_descr

Unnamed: 0,variable,unique counts,unique portion,null,type
1,Street,2,0.00137,False,object
2,Utilities,2,0.00137,False,object
3,CentralAir,2,0.00137,False,object
4,Alley,2,0.00137,True,object
5,BsmtHalfBath,3,0.002055,False,int64
6,HalfBath,3,0.002055,False,int64
7,LandSlope,3,0.002055,False,object
8,PavedDrive,3,0.002055,False,object
9,GarageFinish,3,0.002055,True,object
10,PoolQC,3,0.002055,True,object


In [10]:
reset_option('display.*')

#### Objects

In [11]:
data_type = object

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by=['count', 'unique'],
    ascending=[False, False],
    inplace=True,
)
# data_descr.index.name = 'variable'
# data_descr.reset_index(inplace=True)
# data_descr.index = data_descr.index + 1
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [12]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,unique,top,freq
1,Neighborhood,1460,25,NAmes,225
2,Exterior2nd,1460,16,VinylSd,504
3,Exterior1st,1460,15,VinylSd,515
4,Condition1,1460,9,Norm,1260
5,SaleType,1460,9,WD,1267
6,Condition2,1460,8,Norm,1445
7,HouseStyle,1460,8,1Story,726
8,RoofMatl,1460,8,CompShg,1434
9,Functional,1460,7,Typ,1360
10,RoofStyle,1460,6,Gable,1141


In [13]:
# reset_option('display.*')

#### Integers

In [14]:
data_type = int

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by='count',
    ascending=False,
    inplace=True,
)
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [15]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
2,OpenPorchSF,1460.0,46.660274,66.256028,0.0,0.0,25.0,68.0,547.0
3,KitchenAbvGr,1460.0,1.046575,0.220338,0.0,1.0,1.0,1.0,3.0
4,TotRmsAbvGrd,1460.0,6.517808,1.625393,2.0,5.0,6.0,7.0,14.0
5,Fireplaces,1460.0,0.613014,0.644666,0.0,0.0,1.0,1.0,3.0
6,GarageCars,1460.0,1.767123,0.747315,0.0,1.0,2.0,2.0,4.0
7,GarageArea,1460.0,472.980137,213.804841,0.0,334.5,480.0,576.0,1418.0
8,WoodDeckSF,1460.0,94.244521,125.338794,0.0,0.0,0.0,168.0,857.0
9,EnclosedPorch,1460.0,21.95411,61.119149,0.0,0.0,0.0,0.0,552.0
10,LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0


In [16]:
# reset_option('display.*')

#### Floats

In [17]:
data_type = float

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by='count',
    ascending=False,
    inplace=True,
)
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [18]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
2,GarageYrBlt,1379.0,1978.506164,24.689725,1900.0,1961.0,1980.0,2002.0,2010.0
3,LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0


In [19]:
# reset_option('display.*')

#### Sample

In [20]:
n = 3, 4, 3

data_subset = concat(
    [
        data_set.iloc[:n[0], :],
        data_set.sample(n[1]).sort_index(),
        data_set.iloc[-n[2]:, :],
    ]
)

data_descr = data_subset.transpose()
data_descr.index.name = 'variable'

In [21]:
set_option(
    'display.max_rows', data_descr.shape[0],
    # 'display.max_columns', data_descr.shape[1],
)

data_descr

Id,1,2,3,1,95,100,431,1458,1459,1460
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MSSubClass,60,20,60,60,60,20,160,70,20,20
MSZoning,RL,RL,RL,RL,RL,RL,RM,RL,RL,RL
LotFrontage,65.0,80.0,68.0,65.0,69.0,77.0,21.0,66.0,68.0,75.0
LotArea,8450,9600,11250,8450,9337,9320,1680,9042,9717,9937
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,,,,,,,
LotShape,Reg,Reg,IR1,Reg,IR1,IR1,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
LotConfig,Inside,FR2,Inside,Inside,Inside,Inside,Inside,Inside,Inside,Inside


In [22]:
reset_option('display.*')