# Data exploration
## Summary

In [1]:
from pandas import  read_csv, set_option
from data.paths import data_paths

# Keyword arguments shared by every `read_csv` call in this notebook:
# the 'Id' column of each file becomes the DataFrame index.
read_csv_params = {'index_col': 'Id'}


def display_options(_):
    """Return pandas display options sized so a table is rendered in full.

    Parameters
    ----------
    _ : object exposing a two-element ``shape`` (rows, columns),
        e.g. a ``pandas.DataFrame``.

    Returns
    -------
    dict
        Maps fully-qualified pandas option names to the table's row and
        column counts; each ``(name, value)`` item can be passed straight
        to ``pandas.set_option``.
    """
    n_rows, n_columns = _.shape[0], _.shape[1]
    # Fully-qualified names are used on purpose: abbreviated names such as
    # 'max_rows' are resolved by pattern matching and fail with an ambiguity
    # error when more than one registered option matches.
    return {
        'display.max_rows': n_rows,
        'display.max_columns': n_columns,
    }

# Read every CSV listed in `data_paths` into a DataFrame, stored under the
# same key it has in `data_paths`; all files are read with `read_csv_params`.
data_sets = dict(
    (name, read_csv(path, **read_csv_params))
    for name, path in data_paths.items()
)

### Info

In [2]:
# Print the pandas `info()` summary of every loaded data set.
for data_key, data_set in data_sets.items():
    # Set the column limit to this frame's width so `info()` keeps the
    # per-column listing. The fully-qualified option name avoids relying on
    # pandas' pattern matching of abbreviated option names.
    set_option('display.max_info_columns', data_set.shape[1])
    print(f'{data_key}:')
    data_set.info()
    print()

train:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  10000 non-null  float64
 1   years_of_experience  10000 non-null  float64
 2   lesson_price         10000 non-null  float64
 3   qualification        10000 non-null  float64
 4   physics              10000 non-null  float64
 5   chemistry            10000 non-null  float64
 6   biology              10000 non-null  float64
 7   english              10000 non-null  float64
 8   geography            10000 non-null  float64
 9   history              10000 non-null  float64
 10  mean_exam_points     10000 non-null  float64
dtypes: float64(11)
memory usage: 937.5 KB

test:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 10000 to 19999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------           

### train

In [3]:
# Select the 'train' entry of `data_sets`; the cells that follow read `data_set`.
data_key = 'train'
data_set = data_sets[data_key]

#### nulls

In [4]:
# Number of missing values in each column.
data_set.isna().sum()

age                    0
years_of_experience    0
lesson_price           0
qualification          0
physics                0
chemistry              0
biology                0
english                0
geography              0
history                0
mean_exam_points       0
dtype: int64

In [5]:
# True if at least one cell anywhere in the frame is missing.
data_set.isna().values.any()

False

#### sample

In [6]:
# Draw a small sample, ordered by index and transposed so that every column
# of the frame is shown as a row.
n = 10
# A fixed `random_state` makes the displayed sample identical on every run;
# `min` keeps the call valid for frames with fewer than `n` rows.
data_sample = (
    data_set
    .sample(min(n, len(data_set)), random_state=0)
    .sort_index()
    .transpose()
)

# Apply the display limits computed for this table. Named loop variables are
# used instead of `_`, which IPython uses for the previous cell's output.
for option_name, option_value in display_options(data_sample).items():
    set_option(option_name, option_value)
data_sample

Id,768,1371,3317,5197,5510,6047,6165,6825,7271,7841
age,48.0,42.0,35.0,41.0,53.0,31.0,41.0,39.0,55.0,40.0
years_of_experience,1.0,3.0,0.0,2.0,0.0,5.0,0.0,3.0,4.0,3.0
lesson_price,2300.0,1700.0,1450.0,2250.0,1150.0,2850.0,1300.0,2150.0,1200.0,2450.0
qualification,3.0,1.0,2.0,3.0,1.0,2.0,1.0,2.0,1.0,3.0
physics,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
chemistry,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
biology,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
english,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
geography,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
history,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### describe

In [7]:
# Summary statistics, transposed so that every column of the frame is a row.
data_description = data_set.describe().transpose()

# Apply the display limits computed for this table. Named loop variables are
# used instead of `_`, which IPython uses for the previous cell's output.
for option_name, option_value in display_options(data_description).items():
    set_option(option_name, option_value)
data_description

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,10000.0,45.878,8.043929,23.0,40.0,46.0,51.0,68.0
years_of_experience,10000.0,1.9868,1.772213,0.0,0.0,2.0,3.0,10.0
lesson_price,10000.0,1699.105,524.886654,200.0,1300.0,1500.0,2150.0,3950.0
qualification,10000.0,1.7195,0.792264,1.0,1.0,2.0,2.0,4.0
physics,10000.0,0.375,0.484147,0.0,0.0,0.0,1.0,1.0
chemistry,10000.0,0.1329,0.339484,0.0,0.0,0.0,0.0,1.0
biology,10000.0,0.1096,0.312406,0.0,0.0,0.0,0.0,1.0
english,10000.0,0.0537,0.225436,0.0,0.0,0.0,0.0,1.0
geography,10000.0,0.0321,0.176274,0.0,0.0,0.0,0.0,1.0
history,10000.0,0.0194,0.137933,0.0,0.0,0.0,0.0,1.0


### test

In [8]:
# Select the 'test' entry of `data_sets`; the cells that follow read `data_set`.
data_key = 'test'
data_set = data_sets[data_key]

#### nulls

In [9]:
# True if at least one cell anywhere in the frame is missing.
data_set.isna().values.any()

False

#### sample

In [10]:
# Draw a small sample, ordered by index and transposed so that every column
# of the frame is shown as a row.
n = 10
# A fixed `random_state` makes the displayed sample identical on every run;
# `min` keeps the call valid for frames with fewer than `n` rows.
data_sample = (
    data_set
    .sample(min(n, len(data_set)), random_state=0)
    .sort_index()
    .transpose()
)

# Apply the display limits computed for this table. Named loop variables are
# used instead of `_`, which IPython uses for the previous cell's output.
for option_name, option_value in display_options(data_sample).items():
    set_option(option_name, option_value)
data_sample

Id,10198,11822,14861,15467,15636,16430,17055,17239,17757,19342
age,52.0,53.0,45.0,50.0,32.0,49.0,60.0,44.0,63.0,42.0
years_of_experience,0.0,3.0,2.0,5.0,0.0,1.0,2.0,0.0,2.0,0.0
lesson_price,1550.0,2000.0,1300.0,2650.0,1200.0,1500.0,1800.0,1450.0,950.0,2200.0
qualification,1.0,2.0,1.0,3.0,1.0,2.0,2.0,1.0,1.0,3.0
physics,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
chemistry,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
biology,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
english,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
geography,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
history,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### describe

In [11]:
# Summary statistics, transposed so that every column of the frame is a row.
data_description = data_set.describe().transpose()

# Apply the display limits computed for this table. Named loop variables are
# used instead of `_`, which IPython uses for the previous cell's output.
for option_name, option_value in display_options(data_description).items():
    set_option(option_name, option_value)
data_description

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,10000.0,45.9728,7.95628,23.0,41.0,46.0,51.0,68.0
years_of_experience,10000.0,1.98,1.783289,0.0,0.0,2.0,3.0,10.0
lesson_price,10000.0,1697.095,524.262621,200.0,1300.0,1500.0,2150.0,4050.0
qualification,10000.0,1.7094,0.793483,1.0,1.0,2.0,2.0,4.0
physics,10000.0,0.3813,0.48573,0.0,0.0,0.0,1.0,1.0
chemistry,10000.0,0.1235,0.329027,0.0,0.0,0.0,0.0,1.0
biology,10000.0,0.1201,0.325095,0.0,0.0,0.0,0.0,1.0
english,10000.0,0.056,0.229933,0.0,0.0,0.0,0.0,1.0
geography,10000.0,0.0314,0.174405,0.0,0.0,0.0,0.0,1.0
history,10000.0,0.0184,0.1344,0.0,0.0,0.0,0.0,1.0


### submission example

In [12]:
# Select the 'submission_example' entry of `data_sets`; the cells that follow
# read `data_set`.
data_key = 'submission_example'
data_set = data_sets[data_key]

#### nulls

In [13]:
# True if at least one cell anywhere in the frame is missing.
data_set.isna().values.any()

False

#### sample

In [14]:
# Draw a small sample, ordered by index and transposed so that every column
# of the frame is shown as a row.
n = 10
# A fixed `random_state` makes the displayed sample identical on every run;
# `min` keeps the call valid for frames with fewer than `n` rows.
data_sample = (
    data_set
    .sample(min(n, len(data_set)), random_state=0)
    .sort_index()
    .transpose()
)

# Apply the display limits computed for this table. Named loop variables are
# used instead of `_`, which IPython uses for the previous cell's output.
for option_name, option_value in display_options(data_sample).items():
    set_option(option_name, option_value)
data_sample

Id,12426,14519,15211,15396,15766,16605,17166,17600,19140,19538
mean_exam_points,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
