# Data Exploration
## Summary

### Info

In [1]:
from contest.data import load

In [2]:
data_keys = ['train', 'test']

data_sets = load(
    data_keys=data_keys,
    print_info=True,
)

train: train.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int6

### Types

In [3]:
from pandas import set_option, reset_option
from contest.exploration.summary.types import describe

In [4]:
data_descr = describe()

In [5]:
set_option(
    'display.max_rows', data_descr.shape[0],
    'display.max_columns', data_descr.shape[1],
)

data_descr

Unnamed: 0,train,test
rows,1460,1459
columns,80,79
object,43,43
number,37,36
int,34,25
float,3,11
,True,True


In [6]:
reset_option('display.*')

### Nulls 

In [7]:
# from pandas import set_option, reset_option
from contest.exploration.summary.nulls import describe

In [8]:
data_descr = describe()

In [9]:
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

data_descr

Unnamed: 0_level_0,variable,null counts,null counts,null portion,null portion,type,type
Unnamed: 0_level_1,Unnamed: 1_level_1,train,test,train,test,train,test
0,Exterior1st,0,1,0.0,0.000685,object,object
1,Exterior2nd,0,1,0.0,0.000685,object,object
2,BsmtFinSF1,0,1,0.0,0.000685,int64,float64
3,BsmtFinSF2,0,1,0.0,0.000685,int64,float64
4,BsmtUnfSF,0,1,0.0,0.000685,int64,float64
5,TotalBsmtSF,0,1,0.0,0.000685,int64,float64
6,KitchenQual,0,1,0.0,0.000685,object,object
7,GarageCars,0,1,0.0,0.000685,int64,float64
8,GarageArea,0,1,0.0,0.000685,int64,float64
9,SaleType,0,1,0.0,0.000685,object,object


In [10]:
# reset_option('display.*')

### Uniques

In [11]:
from pandas import set_option, reset_option
from contest.exploration.summary.uniques import describe

In [12]:
data_descr = describe()

In [13]:
set_option(
    'display.max_rows', data_descr.shape[0],
    # 'display.max_columns', data_descr.shape[1],
)

data_descr

Unnamed: 0_level_0,variable,unique counts,unique counts,unique portion,unique portion,null,null,type,type
Unnamed: 0_level_1,Unnamed: 1_level_1,train,test,train,test,train,test,train,test
0,Utilities,2,1,0.00137,0.000685,False,True,object,object
1,Street,2,2,0.00137,0.001371,False,False,object,object
2,CentralAir,2,2,0.00137,0.001371,False,False,object,object
3,Alley,2,2,0.00137,0.001371,True,True,object,object
4,PoolQC,3,2,0.002055,0.001371,True,True,object,object
5,LandSlope,3,3,0.002055,0.002056,False,False,object,object
6,HalfBath,3,3,0.002055,0.002056,False,False,int64,int64
7,PavedDrive,3,3,0.002055,0.002056,False,False,object,object
8,BsmtHalfBath,3,3,0.002055,0.002056,False,True,int64,float64
9,GarageFinish,3,3,0.002055,0.002056,True,True,object,object


In [14]:
reset_option('display.*')

### Train

In [15]:
from pandas import (
    concat,
    set_option, reset_option,
)

from contest.data import load
from contest.exploration.summary import describe
from contest.exploration.summary.types.params import descriptors as types_descriptors
from contest.exploration.summary.nulls.params import descriptors as nulls_descriptors
from contest.exploration.summary.uniques.params import descriptors as uniques_descriptors

#### Load

In [16]:
data_key = 'train'

data_set = load(
    data_keys=data_key,
    # print_info=True,
)

#### Types

In [17]:
data_descr = describe(
    data=data_set,
    descriptors=types_descriptors,
)

In [18]:
data_descr

rows       1460
columns      80
object       43
number       37
int          34
float         3
null       True
dtype: object

#### Nulls

In [19]:
nulls_transformers = [
    lambda _: _[_['null counts'] != 0],
    lambda _: _.sort_values(
        by=['null counts', 'type']
    ),
    lambda _: _.reset_index().rename(
        columns={'index': 'variable'}
    ),
    lambda _: _.set_index(_.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=nulls_descriptors,
    transformers=nulls_transformers,
)

In [20]:
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

data_descr

Unnamed: 0,variable,null counts,null portion,type
1,Electrical,1,0.000685,object
2,MasVnrArea,8,0.005479,float64
3,MasVnrType,8,0.005479,object
4,BsmtQual,37,0.025342,object
5,BsmtCond,37,0.025342,object
6,BsmtFinType1,37,0.025342,object
7,BsmtExposure,38,0.026027,object
8,BsmtFinType2,38,0.026027,object
9,GarageYrBlt,81,0.055479,float64
10,GarageType,81,0.055479,object


In [21]:
# reset_option('display.*')

#### Uniques

In [22]:
uniques_transformers = [
    lambda _: _.sort_values(
        by=['unique counts', 'type', 'null']
    ),
    lambda _: _.reset_index().rename(
        columns={'index': 'variable'}
    ),
    lambda _: _.set_index(_.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=uniques_descriptors,
    transformers=uniques_transformers,
)

In [23]:
set_option(
    'display.max_rows', data_descr.shape[0],
    # 'display.max_columns', data_descr.shape[1],
)

data_descr

Unnamed: 0,variable,unique counts,unique portion,null,type
1,Street,2,0.00137,False,object
2,Utilities,2,0.00137,False,object
3,CentralAir,2,0.00137,False,object
4,Alley,2,0.00137,True,object
5,BsmtHalfBath,3,0.002055,False,int64
6,HalfBath,3,0.002055,False,int64
7,LandSlope,3,0.002055,False,object
8,PavedDrive,3,0.002055,False,object
9,GarageFinish,3,0.002055,True,object
10,PoolQC,3,0.002055,True,object


In [24]:
reset_option('display.*')

#### Objects

In [25]:
data_type = object

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by=['count', 'unique'],
    ascending=[False, False],
    inplace=True,
)
# data_descr.index.name = 'variable'
# data_descr.reset_index(inplace=True)
# data_descr.index = data_descr.index + 1
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [26]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,unique,top,freq
1,Neighborhood,1460,25,NAmes,225
2,Exterior2nd,1460,16,VinylSd,504
3,Exterior1st,1460,15,VinylSd,515
4,Condition1,1460,9,Norm,1260
5,SaleType,1460,9,WD,1267
6,Condition2,1460,8,Norm,1445
7,HouseStyle,1460,8,1Story,726
8,RoofMatl,1460,8,CompShg,1434
9,Functional,1460,7,Typ,1360
10,RoofStyle,1460,6,Gable,1141


In [27]:
# reset_option('display.*')

#### Integers

In [28]:
data_type = int

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by='count',
    ascending=False,
    inplace=True,
)
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [29]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
2,OpenPorchSF,1460.0,46.660274,66.256028,0.0,0.0,25.0,68.0,547.0
3,KitchenAbvGr,1460.0,1.046575,0.220338,0.0,1.0,1.0,1.0,3.0
4,TotRmsAbvGrd,1460.0,6.517808,1.625393,2.0,5.0,6.0,7.0,14.0
5,Fireplaces,1460.0,0.613014,0.644666,0.0,0.0,1.0,1.0,3.0
6,GarageCars,1460.0,1.767123,0.747315,0.0,1.0,2.0,2.0,4.0
7,GarageArea,1460.0,472.980137,213.804841,0.0,334.5,480.0,576.0,1418.0
8,WoodDeckSF,1460.0,94.244521,125.338794,0.0,0.0,0.0,168.0,857.0
9,EnclosedPorch,1460.0,21.95411,61.119149,0.0,0.0,0.0,0.0,552.0
10,LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0


In [30]:
# reset_option('display.*')

#### Floats

In [31]:
data_type = float

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by='count',
    ascending=False,
    inplace=True,
)
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [32]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
2,GarageYrBlt,1379.0,1978.506164,24.689725,1900.0,1961.0,1980.0,2002.0,2010.0
3,LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0


In [33]:
# reset_option('display.*')

#### Sample

In [34]:
n = 3, 4, 3

data_subset = concat(
    [
        data_set.iloc[:n[0], :],
        data_set.sample(n[1]).sort_index(),
        data_set.iloc[-n[2]:, :],
    ]
)

data_descr = data_subset.transpose()
data_descr.index.name = 'variable'

In [35]:
set_option(
    'display.max_rows', data_descr.shape[0],
    # 'display.max_columns', data_descr.shape[1],
)

data_descr

Id,1,2,3,381,727,1040,1131,1458,1459,1460
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MSSubClass,60,20,60,50,20,180,50,70,20,20
MSZoning,RL,RL,RL,RL,RL,RM,RL,RL,RL,RL
LotFrontage,65.0,80.0,68.0,50.0,,21.0,65.0,66.0,68.0,75.0
LotArea,8450,9600,11250,5000,21695,1477,7804,9042,9717,9937
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,Pave,,,,,,
LotShape,Reg,Reg,IR1,Reg,IR1,Reg,Reg,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
LotConfig,Inside,FR2,Inside,Inside,Corner,Inside,Inside,Inside,Inside,Inside


In [36]:
reset_option('display.*')

### Test

In [37]:
from pandas import (
    concat,
    set_option, reset_option,
)

from contest.data import load
from contest.exploration.summary import describe
from contest.exploration.summary.types.params import descriptors as types_descriptors
from contest.exploration.summary.nulls.params import descriptors as nulls_descriptors
from contest.exploration.summary.uniques.params import descriptors as uniques_descriptors

#### Load

In [38]:
data_key = 'test'

data_set = load(
    data_keys=data_key,
    # print_info=True,
)

#### Types

In [39]:
data_descr = describe(
    data=data_set,
    descriptors=types_descriptors,
)

In [40]:
data_descr

rows       1459
columns      79
object       43
number       36
int          25
float        11
null       True
dtype: object

#### Nulls

In [41]:
nulls_transformers = [
    lambda _: _[_['null counts'] != 0],
    lambda _: _.sort_values(
        by=['null counts', 'type']
    ),
    lambda _: _.reset_index().rename(
        columns={'index': 'variable'}
    ),
    lambda _: _.set_index(_.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=nulls_descriptors,
    transformers=nulls_transformers,
)

In [42]:
# set_option(
#     'display.max_rows', data_descr.shape[0],
#     'display.max_columns', data_descr.shape[1],
# )

data_descr

Unnamed: 0,variable,null counts,null portion,type
1,BsmtFinSF1,1,0.000685,float64
2,BsmtFinSF2,1,0.000685,float64
3,BsmtUnfSF,1,0.000685,float64
4,TotalBsmtSF,1,0.000685,float64
5,GarageCars,1,0.000685,float64
6,GarageArea,1,0.000685,float64
7,Exterior1st,1,0.000685,object
8,Exterior2nd,1,0.000685,object
9,KitchenQual,1,0.000685,object
10,SaleType,1,0.000685,object


In [43]:
# reset_option('display.*')

#### Uniques

In [44]:
uniques_transformers = [
    lambda _: _.sort_values(
        by=['unique counts', 'type', 'null']
    ),
    lambda _: _.reset_index().rename(
        columns={'index': 'variable'}
    ),
    lambda _: _.set_index(_.index + 1),
]

data_descr = describe(
    data=data_set,
    descriptors=uniques_descriptors,
    transformers=uniques_transformers,
)

In [45]:
set_option(
    'display.max_rows', data_descr.shape[0],
    # 'display.max_columns', data_descr.shape[1],
)

data_descr

Unnamed: 0,variable,unique counts,unique portion,null,type
1,Utilities,1,0.000685,True,object
2,Street,2,0.001371,False,object
3,CentralAir,2,0.001371,False,object
4,Alley,2,0.001371,True,object
5,PoolQC,2,0.001371,True,object
6,HalfBath,3,0.002056,False,int64
7,KitchenAbvGr,3,0.002056,False,int64
8,BsmtHalfBath,3,0.002056,True,float64
9,LandSlope,3,0.002056,False,object
10,PavedDrive,3,0.002056,False,object


In [46]:
reset_option('display.*')

#### Objects

In [47]:
data_type = object

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by=['count', 'unique'],
    ascending=[False, False],
    inplace=True,
)
# data_descr.index.name = 'variable'
# data_descr.reset_index(inplace=True)
# data_descr.index = data_descr.index + 1
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [48]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,unique,top,freq
1,Neighborhood,1459,25,NAmes,218
2,Condition1,1459,9,Norm,1251
3,HouseStyle,1459,7,1Story,745
4,RoofStyle,1459,6,Gable,1169
5,Foundation,1459,6,PConc,661
6,SaleCondition,1459,6,Normal,1204
7,LotConfig,1459,5,Inside,1081
8,Condition2,1459,5,Norm,1444
9,BldgType,1459,5,1Fam,1205
10,ExterCond,1459,5,TA,1256


In [49]:
# reset_option('display.*')

#### Integers

In [50]:
data_type = int

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by='count',
    ascending=False,
    inplace=True,
)
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [51]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,MSSubClass,1459.0,57.378341,42.74688,20.0,20.0,50.0,70.0,190.0
2,KitchenAbvGr,1459.0,1.042495,0.208472,0.0,1.0,1.0,1.0,2.0
3,MoSold,1459.0,6.104181,2.722432,1.0,4.0,6.0,8.0,12.0
4,MiscVal,1459.0,58.167923,630.806978,0.0,0.0,0.0,0.0,17000.0
5,PoolArea,1459.0,1.744345,30.491646,0.0,0.0,0.0,0.0,800.0
6,ScreenPorch,1459.0,17.064428,56.609763,0.0,0.0,0.0,0.0,576.0
7,3SsnPorch,1459.0,1.79438,20.207842,0.0,0.0,0.0,0.0,360.0
8,EnclosedPorch,1459.0,24.243317,67.227765,0.0,0.0,0.0,0.0,1012.0
9,OpenPorchSF,1459.0,48.313914,68.883364,0.0,0.0,28.0,72.0,742.0
10,WoodDeckSF,1459.0,93.174777,127.744882,0.0,0.0,0.0,168.0,1424.0


In [52]:
# reset_option('display.*')

#### Floats

In [53]:
data_type = float

data_subset = data_set.select_dtypes(data_type)
data_descr = data_subset.describe()

data_descr = data_descr.transpose()
data_descr.sort_values(
    by='count',
    ascending=False,
    inplace=True,
)
data_descr.insert(0, 'variable', data_descr.index)
data_descr.index = range(1, data_descr.index.shape[0] + 1)

In [54]:
# set_option(
#     'display.max_rows', data_view.shape[0],
#     'display.max_columns', data_view.shape[1],
# )

data_descr

Unnamed: 0,variable,count,mean,std,min,25%,50%,75%,max
1,BsmtFinSF1,1458.0,439.203704,455.268042,0.0,0.0,350.5,753.5,4010.0
2,BsmtFinSF2,1458.0,52.619342,176.753926,0.0,0.0,0.0,0.0,1526.0
3,BsmtUnfSF,1458.0,554.294925,437.260486,0.0,219.25,460.0,797.75,2140.0
4,TotalBsmtSF,1458.0,1046.11797,442.898624,0.0,784.0,988.0,1305.0,5095.0
5,GarageCars,1458.0,1.766118,0.775945,0.0,1.0,2.0,2.0,5.0
6,GarageArea,1458.0,472.768861,217.048611,0.0,318.0,480.0,576.0,1488.0
7,BsmtFullBath,1457.0,0.434454,0.530648,0.0,0.0,0.0,1.0,3.0
8,BsmtHalfBath,1457.0,0.065202,0.252468,0.0,0.0,0.0,0.0,2.0
9,MasVnrArea,1444.0,100.709141,177.6259,0.0,0.0,0.0,164.0,1290.0
10,GarageYrBlt,1381.0,1977.721217,26.431175,1895.0,1959.0,1979.0,2002.0,2207.0


In [55]:
# reset_option('display.*')

#### Sample

In [56]:
n = 3, 4, 3

data_subset = concat(
    [
        data_set.iloc[:n[0], :],
        data_set.sample(n[1]).sort_index(),
        data_set.iloc[-n[2]:, :],
    ]
)

data_descr = data_subset.transpose()
data_descr.index.name = 'variable'

In [57]:
set_option(
    'display.max_rows', data_descr.shape[0],
    # 'display.max_columns', data_descr.shape[1],
)

data_descr

Id,1461,1462,1463,1649,1655,1701,2111,2917,2918,2919
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MSSubClass,20,20,60,20,120,60,50,20,85,60
MSZoning,RH,RL,RL,RL,RL,RL,RM,RL,RL,RL
LotFrontage,80.0,81.0,74.0,,24.0,,,160.0,62.0,74.0
LotArea,11622,14267,13830,8510,2280,11692,7425,20000,10441,9627
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,,,,,,,
LotShape,Reg,IR1,IR1,IR1,Reg,IR1,IR1,Reg,Reg,Reg
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Bnk,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
LotConfig,Inside,Corner,Inside,Inside,FR2,Inside,Corner,Inside,Inside,Inside


In [58]:
reset_option('display.*')