# Data Exploration Analysis


In [1]:
from math import ceil
import pandas as pd
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names, so asking for 'seaborn-deep'
# unconditionally fails on a fresh kernel with a current Matplotlib.
# Use whichever spelling this installation provides; fall back to the default
# style if neither is available.
plt.style.use(
    next(
        (
            style_name
            for style_name in ('seaborn-v0_8-deep', 'seaborn-deep')
            if style_name in plt.style.available
        ),
        'default',
    )
)
# Show floats in general ('g') format in every displayed frame.
pd.options.display.float_format = '{:g}'.format

In [2]:
# def describe(
#     data_variable_sets: list[pd.Series]
# ) -> pd.DataFrame:
#     data_descr = {
#         variable_set.name: variable_set.describe()
#         for variable_set in data_variable_sets
#     }
#     return pd.DataFrame(data_descr)


# def describe_counts(X: list[pd.Series]) -> pd.DataFrame:
#     X_descr = {
#         x.name: {
#             'type': x.dtype,
#             'count': x.count(),
#             'null count': x.isnull().sum(),
#             # 'notnull count': x.notnull().sum(),
#             'unique count': x.nunique(),
#             'unique portion': x.nunique() / x.count(),
#         }
#         for x in X
#     }
#     X_descr = pd.DataFrame(X_descr).loc[
#         [
#             'type',
#             'count', 'null count',  # 'null count',
#             'unique count', 'unique portion',
#         ],
#         [x.name for x in X]
#     ]
#     return X_descr


# def describe_statistics(X: list[pd.Series]) -> pd.DataFrame:
#     X_descr = {
#         x.name: {
#             # 'min': x.min(),
#             # 'Q1': x.quantile(.25),
#             'median': x.median(),
#             # 'Q3': x.quantile(.75),        
#             # 'max': x.max(),
#             'IQR': x.quantile(.75) - x.quantile(.25),
#             'mean': x.mean(),
#             'std': x.std(),
#             'skewness': x.skew(),
#             'kurtosis': x.kurtosis(),
#         }
#         for x in X
#     }
#     X_descr = pd.DataFrame(X_descr).loc[
#         [
#             # 'min', 'Q1',
#             'median',
#             # 'Q3', 'max',
#             'IQR',
#             'mean', 'std',
#             'skewness', 'kurtosis',
#         ],
#         [x.name for x in X]
#     ]
#     return X_descr



### Load

In [3]:
# Location of each raw CSV split, relative to this notebook.
data_file_paths = {
    'train': '../input/train.csv',
    'test': '../input/test.csv',
}
# Keyword arguments passed unchanged to every `pd.read_csv` call below.
data_load_params = {
    # 'sep': ',',
    # 'header': 0,
    'index_col': 'Id',
    # 'na_values': ['NA', 'None'],
    # 'keep_default_na': True,
}
# Role -> column name of the variables this notebook looks at.
data_variable_names = {
    'feature': 'GrLivArea',
    'target': 'SalePrice',
}

# One DataFrame per split, keyed like `data_file_paths`.
data_sets = {
    split_key: pd.read_csv(csv_path, **data_load_params)
    for split_key, csv_path in data_file_paths.items()
}
# split -> role -> Series. A role is skipped for a split whose frame does not
# contain the corresponding column.
data_variable_sets = {
    split_key: {
        role_key: frame[column_name]
        for role_key, column_name in data_variable_names.items()
        if column_name in frame
    }
    for split_key, frame in data_sets.items()
}
# x_train, y_train, x_test = (
#     data_variable_sets['train']['feature'],
#     data_variable_sets['train']['target'],
#     data_variable_sets['test']['feature'],
# )

### Describe

In [4]:
# Two parallel lists: position i of each row picks one series to describe.
data_descr_keys = [
    ['train', 'test', 'train'],        # which data set
    ['feature', 'feature', 'target'],  # which variable of that data set
]

# The selected series, in the order given by `data_descr_keys`.
data_descr_sets = [
    data_variable_sets[set_key][role_key]
    for set_key, role_key in zip(data_descr_keys[0], data_descr_keys[1])
]
# Two-level column labels for the summary: data set key on top, the actual
# column name underneath.
data_descr_columns = [
    data_descr_keys[0],
    [data_variable_names[role_key] for role_key in data_descr_keys[1]],
]

In [36]:
def describe(
    data_descr_sets: list[pd.Series],
    columns: "list | pd.Index | None" = None,
) -> pd.DataFrame:
    """Build a side-by-side table of `Series.describe()` summaries.

    Parameters
    ----------
    data_descr_sets
        Series to summarise; each one becomes one column of the result, in
        the given order.
    columns
        Optional replacement column labels for the result. When omitted (or
        `None`) the labels produced by the concatenation are kept. A list of
        equally long lists is turned into multi-level column labels by
        pandas, which is how the notebook labels columns by data set and
        variable.

    Returns
    -------
    pd.DataFrame
        One row per summary statistic, one column per input series.

    Raises
    ------
    ValueError
        Raised by pandas if `data_descr_sets` is empty or if `columns` does
        not match the number of summarised series.
    """
    summaries = [descr_set.describe() for descr_set in data_descr_sets]
    data_descr = pd.concat(summaries, axis=1)
    if columns is not None:
        # The concatenated frame is freshly created here, so relabelling it
        # in place cannot affect any caller-owned object.
        data_descr.columns = columns
    return data_descr

In [37]:
# Summary statistics of the selected series, labelled by data set and variable.
describe(data_descr_sets, columns=data_descr_columns)

Unnamed: 0_level_0,train,test,train
Unnamed: 0_level_1,GrLivArea,GrLivArea,SalePrice
count,1460.0,1459.0,1460.0
mean,1515.46,1486.05,180921.0
std,525.48,485.566,79442.5
min,334.0,407.0,34900.0
25%,1129.5,1117.5,129975.0
50%,1464.0,1432.0,163000.0
75%,1776.75,1721.0,214000.0
max,5642.0,5095.0,755000.0


#### Count

In [8]:
# describe_counts([x_train, x_test, y_train])

#### Statistics

In [9]:
# describe_statistics([x_train, x_test, y_train])

In [10]:
# nrows, ncols = 2, 1  # size of subplot grid

# subplot_props = dict(
#     sharex=True,
#     figsize=(6.4 * ncols, 4.8 * nrows),
#     constrained_layout=True,
# )

# fig, axs = plt.subplots(nrows, ncols, **subplot_props)

# i = 1
# axs[i].boxplot(
#     [x_train, x_test],
#     vert=False
# )

# i = 0
# axs[i].hist(
#     [x_train, x_test],
#     bins=25,
#     alpha=.5,
#     histtype='step',
#     label=['1', '2'],
#     density=True
#     # stacked=True
#     # density=True,
# )

# axs[i].legend()

# plt.show()

In [11]:
# y_min, y_max = y.min(), y.max()
# y_Q1, y_Q2, y_Q3 = y.quantile(.25), y.median(), y.quantile(.75)
# y_mean, y_std = y.mean(), y.std()

# k = 5  # approximate number of histogram bins in the IQR
# bins = ceil(k * (y_max - y_min) / (y_Q3 - y_Q1))

# nrows, ncols = 2, 1  # size of subplot grid

# subplot_props = dict(
#     sharex=True,
#     figsize=(6.4 * ncols, 4.8 * nrows),
#     constrained_layout=True,
# )

# median_props = dict(
#     color='red',
#     linestyle='solid',
#     linewidth=1,
# )
# quantile_props = dict(
#     color='red',
#     linestyle='dashed',
#     linewidth=1,
# )
# mean_props = dict(
#     color='black',
#     linestyle='dotted',
#     linewidth=1,
# )
# std_props = dict(
#     color='black',
#     linestyle='dashdot',
#     linewidth=1,
# )

# boxplot_props = dict(
#     vert=False,
#     showmeans=True,
#     medianprops=median_props,
#     # boxprops=quantile_props,
# )

# hist_props = dict(
#     bins=bins,
#     # label=y.name,
# )

# fig, axs = plt.subplots(nrows, ncols, **subplot_props)

# i = 0
# axs[i].boxplot(y, **boxplot_props)
# axs[i].set_yticks(ticks=[])

# i = 1
# axs[i].hist(y, **hist_props)
# axs[i].axvline(y_Q2, label='Q2', **median_props)
# axs[i].axvline(y_Q1, label='Q1', **quantile_props)
# axs[i].axvline(y_Q3, label='Q3', **quantile_props)
# axs[i].axvline(y_mean, label=r'$\mu$', **mean_props)
# axs[i].axvline(y_mean - y_std, label=r'$\mu - \sigma$', **std_props)
# axs[i].axvline(y_mean + y_std, label=r'$\mu + \sigma$', **std_props)
# axs[i].legend()

# fig.suptitle(y.name)

# plt.show()

In [12]:
# plt.scatter(x_train, y_train)