# Data exploration
## Summary

In [1]:
from pandas import  read_csv, set_option
from data.paths import data_paths

# Keyword arguments shared by every `read_csv` call below:
# the `Id` column is used as the frame index.
read_csv_params = {'index_col': 'Id'}


def display_options(_):
    """Return pandas display options sized to show an object in full.

    Parameters
    ----------
    _ : object with a two-element ``shape``
        Only ``shape[0]`` and ``shape[1]`` are read. The callers in this
        notebook pass a DataFrame.

    Returns
    -------
    dict
        Maps option names to values, ready to be fed pair by pair to
        ``set_option``.
    """
    # Fully qualified keys: the short forms 'max_rows' / 'max_columns' are
    # resolved by pattern matching and become ambiguous (OptionError) on
    # pandas versions that also define 'styler.render.max_rows' and
    # 'styler.render.max_columns'.
    return {
        'display.max_rows': _.shape[0],
        'display.max_columns': _.shape[1],
    }

# Read every path in `data_paths` into a DataFrame, keyed by the same name
# the path is registered under.
data_sets = dict(
    (name, read_csv(path, **read_csv_params))
    for name, path in data_paths.items()
)

### Info

In [2]:
# Print the column summary of every loaded data set.
for data_key, data_set in data_sets.items():
    # Set the column limit to this frame's width so `info()` lists every
    # column. The fully qualified key does not depend on pandas' option-name
    # pattern matching, which breaks when a similarly named option exists.
    set_option('display.max_info_columns', data_set.shape[1])
    print(f'{data_key}:')
    data_set.info()
    print()

train:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  10000 non-null  float64
 1   years_of_experience  10000 non-null  float64
 2   lesson_price         10000 non-null  float64
 3   qualification        10000 non-null  float64
 4   physics              10000 non-null  float64
 5   chemistry            10000 non-null  float64
 6   biology              10000 non-null  float64
 7   english              10000 non-null  float64
 8   geography            10000 non-null  float64
 9   history              10000 non-null  float64
 10  mean_exam_points     10000 non-null  float64
 11  choose               10000 non-null  int64  
dtypes: float64(11), int64(1)
memory usage: 1015.6 KB

test:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 10000 to 19999
Data columns (total 11 columns):
 #   Column

### train

In [3]:
# Rebind the shared `data_key` / `data_set` names to the 'train' entry of
# `data_sets`; the cells in this section read `data_set`.
data_key = 'train'
data_set = data_sets[data_key]

#### nulls

In [4]:
# Number of missing values in each column.
data_set.isna().sum()

age                    0
years_of_experience    0
lesson_price           0
qualification          0
physics                0
chemistry              0
biology                0
english                0
geography              0
history                0
mean_exam_points       0
choose                 0
dtype: int64

In [5]:
# True if at least one cell anywhere in the frame is missing.
data_set.isna().values.any()

False

#### sample

In [6]:
# Peek at a few rows, transposed so each column of the data set is one row.
n = 10
seed = 0  # fixed so a fresh "Restart & Run All" shows the same rows
data_sample = (
    data_set
    .sample(n, random_state=seed)
    .sort_index()
    .transpose()
)

# Size the display limits to the sample's shape before showing it.
# Named loop variables avoid rebinding `_`, which IPython uses for the
# last cell output.
for option_name, option_value in display_options(data_sample).items():
    set_option(option_name, option_value)
data_sample

Id,262,1825,2082,2484,2977,6116,6179,6254,8336,8757
age,49.0,54.0,51.0,48.0,50.0,38.0,52.0,37.0,56.0,29.0
years_of_experience,2.0,3.0,0.0,5.0,2.0,0.0,0.0,3.0,3.0,0.0
lesson_price,1450.0,1450.0,2200.0,1550.0,1250.0,1500.0,1700.0,1650.0,1500.0,2250.0
qualification,2.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,3.0
physics,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
chemistry,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
biology,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
english,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
geography,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
history,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### describe

In [7]:
# Summary statistics, transposed so each column of the data set is one row.
data_description = data_set.describe().transpose()

# Size the display limits to the table's shape before showing it.
# Named loop variables avoid rebinding `_`, which IPython uses for the
# last cell output.
for option_name, option_value in display_options(data_description).items():
    set_option(option_name, option_value)
data_description

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,10000.0,45.8009,8.030274,23.0,40.0,46.0,51.0,68.0
years_of_experience,10000.0,1.9748,1.766883,0.0,0.0,2.0,3.0,9.0
lesson_price,10000.0,1702.44,523.789062,200.0,1300.0,1550.0,2150.0,3950.0
qualification,10000.0,1.7243,0.798845,1.0,1.0,2.0,2.0,4.0
physics,10000.0,0.3706,0.48299,0.0,0.0,0.0,1.0,1.0
chemistry,10000.0,0.1215,0.326724,0.0,0.0,0.0,0.0,1.0
biology,10000.0,0.1172,0.321675,0.0,0.0,0.0,0.0,1.0
english,10000.0,0.0591,0.235824,0.0,0.0,0.0,0.0,1.0
geography,10000.0,0.0277,0.16412,0.0,0.0,0.0,0.0,1.0
history,10000.0,0.018,0.132958,0.0,0.0,0.0,0.0,1.0


### test

In [8]:
# Rebind the shared `data_key` / `data_set` names to the 'test' entry of
# `data_sets`; the cells in this section read `data_set`.
data_key = 'test'
data_set = data_sets[data_key]

#### nulls

In [9]:
# True if at least one cell anywhere in the frame is missing.
data_set.isna().values.any()

False

#### sample

In [10]:
# Peek at a few rows, transposed so each column of the data set is one row.
n = 10
seed = 0  # fixed so a fresh "Restart & Run All" shows the same rows
data_sample = (
    data_set
    .sample(n, random_state=seed)
    .sort_index()
    .transpose()
)

# Size the display limits to the sample's shape before showing it.
# Named loop variables avoid rebinding `_`, which IPython uses for the
# last cell output.
for option_name, option_value in display_options(data_sample).items():
    set_option(option_name, option_value)
data_sample

Id,10575,12974,13204,13336,13863,14356,14699,15572,16632,19796
age,53.0,67.0,46.0,52.0,47.0,31.0,39.0,62.0,48.0,39.0
years_of_experience,0.0,4.0,1.0,0.0,1.0,2.0,1.0,2.0,0.0,3.0
lesson_price,2200.0,1750.0,1450.0,1750.0,2200.0,1700.0,1250.0,1600.0,1550.0,1250.0
qualification,3.0,1.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0
physics,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
chemistry,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
biology,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
english,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
geography,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
history,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### describe

In [11]:
# Summary statistics, transposed so each column of the data set is one row.
data_description = data_set.describe().transpose()

# Size the display limits to the table's shape before showing it.
# Named loop variables avoid rebinding `_`, which IPython uses for the
# last cell output.
for option_name, option_value in display_options(data_description).items():
    set_option(option_name, option_value)
data_description

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,10000.0,45.9245,8.031977,23.0,41.0,46.0,51.0,68.0
years_of_experience,10000.0,1.9857,1.771217,0.0,0.0,2.0,3.0,9.0
lesson_price,10000.0,1699.91,526.260094,300.0,1300.0,1550.0,2150.0,3950.0
qualification,10000.0,1.7023,0.789644,1.0,1.0,1.5,2.0,4.0
physics,10000.0,0.3721,0.483389,0.0,0.0,0.0,1.0,1.0
chemistry,10000.0,0.1281,0.334218,0.0,0.0,0.0,0.0,1.0
biology,10000.0,0.1158,0.320001,0.0,0.0,0.0,0.0,1.0
english,10000.0,0.049,0.215879,0.0,0.0,0.0,0.0,1.0
geography,10000.0,0.0292,0.168375,0.0,0.0,0.0,0.0,1.0
history,10000.0,0.0151,0.121957,0.0,0.0,0.0,0.0,1.0


### submission example

In [12]:
# Rebind the shared `data_key` / `data_set` names to the
# 'submission_example' entry of `data_sets`; the cells in this section read
# `data_set`.
data_key = 'submission_example'
data_set = data_sets[data_key]

#### nulls

In [13]:
# True if at least one cell anywhere in the frame is missing.
data_set.isna().values.any()

False

#### sample

In [14]:
# Peek at a few rows, transposed so each column of the data set is one row.
n = 10
seed = 0  # fixed so a fresh "Restart & Run All" shows the same rows
data_sample = (
    data_set
    .sample(n, random_state=seed)
    .sort_index()
    .transpose()
)

# Size the display limits to the sample's shape before showing it.
# Named loop variables avoid rebinding `_`, which IPython uses for the
# last cell output.
for option_name, option_value in display_options(data_sample).items():
    set_option(option_name, option_value)
data_sample

Id,10280,10360,10913,11911,12237,12975,14759,17048,17159,18516
choose,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
