# Data exploration
## Summary

In [1]:
import pandas as pd
from data.paths import file_paths as data_file_paths
from data.load import load_params as data_load_params
from exploration.tools import set_options, set_option_context

### Parameters

In [2]:
# info_options = dict(
#      max_info_column=None,
# )

# Notebook-wide display settings; `None` lifts the row/column display limit.
display_options = {
    'max_rows': None,
    'max_columns': None,
}


def get_info_options(dataframe: pd.DataFrame) -> dict[str, int]:
    """Build option overrides sized to the column count of ``dataframe``.

    The result is meant to be handed to ``set_option_context`` around a
    ``DataFrame.info()`` call so that the column listing is not cut off.

    Args:
        dataframe: Frame whose number of columns determines the limit.

    Returns:
        A one-entry mapping ``{'max_info_column': <number of columns>}``.
    """
    # NOTE(review): the key is presumably resolved by the project helper to
    # pandas' `display.max_info_columns` option — confirm in exploration.tools.
    return dict(
        max_info_column=dataframe.shape[1],
    )

# def get_display_options(dataframe: pd.DataFrame) -> dict[str: int]:
#     return dict(
#         max_rows=dataframe.shape[0],
#         max_columns=dataframe.shape[1],
#     )

# Hand the display settings to the project helper once, up front.
# NOTE(review): presumably this sets the matching pandas display options
# globally for the rest of the notebook — confirm in exploration.tools.
set_options(display_options)

### Load

In [3]:
# Read every configured CSV into a DataFrame, keyed by its data set name.
# Each file is read with the keyword arguments registered under the same key.
data_sets = {
    name: pd.read_csv(csv_path, **data_load_params[name])
    for name, csv_path in data_file_paths.items()
}

### Info

In [4]:
# One row per data set: number of rows, number of columns, and whether the
# frame contains at least one missing value. Largest data sets come first.
data_descr = (
    pd.DataFrame(
        data={
            name: {
                'rows': frame.shape[0],
                'columns': frame.shape[1],
                'nulls': frame.isnull().values.any(),
                # 'notnulls': frame.notnull().values.all(),
            }
            for name, frame in data_sets.items()
        },
        index=['rows', 'columns', 'nulls'],
    )
    .T  # data sets become rows, metrics become columns
    .sort_values(by='rows', ascending=False)
)

data_descr

Unnamed: 0,rows,columns,nulls
train,10000,11,False
test,10000,10,False
submission,10000,1,False


In [5]:
# Print the `info()` summary of every loaded data set under a header line
# carrying its key.
# Note: `data_key` and `data_set` remain bound to the last item after the loop.
for data_key, data_set in data_sets.items():
    print(f'\n{data_key}:\n')
    # Options sized to this frame's column count apply only inside the block.
    # NOTE(review): assumes set_option_context restores the previous option
    # values on exit — confirm in exploration.tools.
    with set_option_context(get_info_options(data_set)):
        data_set.info()


train:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  10000 non-null  float64
 1   years_of_experience  10000 non-null  float64
 2   lesson_price         10000 non-null  float64
 3   qualification        10000 non-null  float64
 4   physics              10000 non-null  float64
 5   chemistry            10000 non-null  float64
 6   biology              10000 non-null  float64
 7   english              10000 non-null  float64
 8   geography            10000 non-null  float64
 9   history              10000 non-null  float64
 10  mean_exam_points     10000 non-null  float64
dtypes: float64(11)
memory usage: 937.5 KB

test:

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 10000 to 19999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------        

### Train

In [6]:
# Select the data set that the following cells inspect via `data_set`.
data_key = 'train'
data_set = data_sets[data_key]

In [7]:
# Per-column profile of the selected data set: dtype, number of distinct
# values, and number of missing values. Ordered by dtype, then by descending
# cardinality.
data_descr = (
    pd.DataFrame(
        data={
            'type': data_set.dtypes,
            'unique': data_set.nunique(),
            'null': data_set.isnull().sum(),
            # 'notnull': data_set.notnull().sum(),
        },
        columns=['type', 'unique', 'null'],
    )
    .sort_values(by=['type', 'unique'], ascending=[True, False])
)

data_descr

Unnamed: 0,type,unique,null
lesson_price,float64,74,0
mean_exam_points,float64,69,0
age,float64,46,0
years_of_experience,float64,11,0
qualification,float64,4,0
physics,float64,2,0
chemistry,float64,2,0
biology,float64,2,0
english,float64,2,0
geography,float64,2,0


#### Sample

In [8]:
# Show a small random sample, transposed so that each sampled row becomes a
# column (easier to read when the frame has many features).
n = 10
# Fixed seed: without it every Restart & Run All displays different rows.
sample_seed = 42
data_sample = (
    data_set
    # Clamp the size so frames shorter than `n` do not raise in `sample`.
    .sample(n=min(n, len(data_set)), random_state=sample_seed)
    .sort_index()
    .transpose()
)

data_sample

Id,179,2713,2987,3034,5195,7801,8470,9345,9485,9691
age,53.0,51.0,45.0,49.0,52.0,37.0,47.0,55.0,52.0,41.0
years_of_experience,2.0,1.0,3.0,1.0,0.0,0.0,0.0,6.0,2.0,1.0
lesson_price,1300.0,2050.0,2250.0,1200.0,1550.0,2250.0,1450.0,800.0,1050.0,1750.0
qualification,1.0,2.0,2.0,1.0,1.0,3.0,2.0,1.0,1.0,2.0
physics,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
chemistry,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
biology,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
english,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
geography,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
history,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Describe

In [9]:
# Summary statistics for the float columns of the selected data set,
# laid out with one row per column.
data_type = float
data_subset = data_set.select_dtypes(include=data_type)
data_descr = data_subset.describe().T

data_descr

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,10000.0,45.878,8.043929,23.0,40.0,46.0,51.0,68.0
years_of_experience,10000.0,1.9868,1.772213,0.0,0.0,2.0,3.0,10.0
lesson_price,10000.0,1699.105,524.886654,200.0,1300.0,1500.0,2150.0,3950.0
qualification,10000.0,1.7195,0.792264,1.0,1.0,2.0,2.0,4.0
physics,10000.0,0.375,0.484147,0.0,0.0,0.0,1.0,1.0
chemistry,10000.0,0.1329,0.339484,0.0,0.0,0.0,0.0,1.0
biology,10000.0,0.1096,0.312406,0.0,0.0,0.0,0.0,1.0
english,10000.0,0.0537,0.225436,0.0,0.0,0.0,0.0,1.0
geography,10000.0,0.0321,0.176274,0.0,0.0,0.0,0.0,1.0
history,10000.0,0.0194,0.137933,0.0,0.0,0.0,0.0,1.0
