**Testing the DATA CLEANER functions**

In [1]:
import numpy as np
import pandas as pd
from data_cleaner import *

Dataset

In [2]:
# Staircase fixture: row r keeps its first (9 - r) cells and is padded with NaN
# up to width 10, so row r has r + 1 missing values and column c has c + 1.
# Every filled cell encodes its own position: (row + 1) * 100000 + (col + 1) * 10000.
fordata = [
    [(r + 1) * 100000 + (c + 1) * 10000 if c < 9 - r else np.nan
     for c in range(10)]
    for r in range(10)
]

# Integer column labels 0..9, passed explicitly as a list.
data = pd.DataFrame(fordata, columns=list(range(10)))

data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,110000.0,120000.0,130000.0,140000.0,150000.0,160000.0,170000.0,180000.0,190000.0,
1,210000.0,220000.0,230000.0,240000.0,250000.0,260000.0,270000.0,280000.0,,
2,310000.0,320000.0,330000.0,340000.0,350000.0,360000.0,370000.0,,,
3,410000.0,420000.0,430000.0,440000.0,450000.0,460000.0,,,,
4,510000.0,520000.0,530000.0,540000.0,550000.0,,,,,
5,610000.0,620000.0,630000.0,640000.0,,,,,,
6,710000.0,720000.0,730000.0,,,,,,,
7,810000.0,820000.0,,,,,,,,
8,910000.0,,,,,,,,,
9,,,,,,,,,,


---

**Function `drop_data`**

Dropping rows

In [3]:
# Row-wise drop with an explicit threshold of 2.
# In the recorded output only rows 0 and 1 (one and two NaNs) survive.
# NOTE(review): presumably `threshold` is the max NaN count allowed per row —
# confirm against data_cleaner.drop_data.
rows_data = drop_data(data=data, axis='rows', threshold=2)

rows_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,110000.0,120000.0,130000.0,140000.0,150000.0,160000.0,170000.0,180000.0,190000.0,
1,210000.0,220000.0,230000.0,240000.0,250000.0,260000.0,270000.0,280000.0,,


In [4]:
# Compare the row count before and after the row-wise drop (threshold=2).
drows_new = rows_data.shape[0]

drows_old = data.shape[0]

print(f"Old data rows number is {drows_old} \n" 
      f"New data rows number is {drows_new} \n"
      f"Are they equal? - {drows_old == drows_new}")

# Enforce what the printout only lets a reader eyeball: the fixture has 10 rows
# and exactly two of them (rows 0 and 1) survive threshold=2.
assert drows_old == 10, f"fixture should have 10 rows, got {drows_old}"
assert drows_new == 2, f"expected 2 rows to survive threshold=2, got {drows_new}"

Old data rows number is 10 
New data rows number is 2 
Are they equal? - False


Dropping columns

In [5]:
# Column-wise drop with an explicit threshold of 0.8.
# In the recorded output only columns 0 and 1 survive (90% and 80% filled).
# NOTE(review): presumably `threshold` is the minimum share of non-missing
# values a column needs — confirm against data_cleaner.drop_data.
cols_data = drop_data(data=data, axis='cols', threshold=0.8)

cols_data

Unnamed: 0,0,1
0,110000.0,120000.0
1,210000.0,220000.0
2,310000.0,320000.0
3,410000.0,420000.0
4,510000.0,520000.0
5,610000.0,620000.0
6,710000.0,720000.0
7,810000.0,820000.0
8,910000.0,
9,,


In [6]:
# Compare the column count before and after the column-wise drop (threshold=0.8).
dcols_new = cols_data.shape[1]

dcols_old = data.shape[1]

print(f"Old data columns number is {dcols_old} \n" 
      f"New data columns number is {dcols_new} \n"
      f"Are they equal? - {dcols_old == dcols_new}")

# Enforce what the printout only lets a reader eyeball: the fixture has 10
# columns and exactly two of them (columns 0 and 1) survive threshold=0.8.
assert dcols_old == 10, f"fixture should have 10 columns, got {dcols_old}"
assert dcols_new == 2, f"expected 2 columns to survive threshold=0.8, got {dcols_new}"

Old data columns number is 10 
New data columns number is 2 
Are they equal? - False


Dropping rows with `threshold` omitted (the default value, 3, is used)

In [7]:
# `threshold` omitted: the function's default applies.
# The recorded output keeps rows 0-2 (one, two and three NaNs).
drop_data(data=data, axis='rows') 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,110000.0,120000.0,130000.0,140000.0,150000.0,160000.0,170000.0,180000.0,190000.0,
1,210000.0,220000.0,230000.0,240000.0,250000.0,260000.0,270000.0,280000.0,,
2,310000.0,320000.0,330000.0,340000.0,350000.0,360000.0,370000.0,,,


Dropping columns with `threshold` omitted (the default value, 0.7, is used)

In [8]:
# `threshold` omitted: the function's default applies.
# The recorded output keeps columns 0-2 (90%, 80% and 70% filled).
drop_data(data=data, axis='cols') 

Unnamed: 0,0,1,2
0,110000.0,120000.0,130000.0
1,210000.0,220000.0,230000.0
2,310000.0,320000.0,330000.0
3,410000.0,420000.0,430000.0
4,510000.0,520000.0,530000.0
5,610000.0,620000.0,630000.0
6,710000.0,720000.0,730000.0
7,810000.0,820000.0,
8,910000.0,,
9,,,


Invalid axis 

In [9]:
# Negative case: an unknown axis name. The recorded output is the message
# "Invalid axis or no axis".
drop_data(data=data, axis='table') 

Invalid axis or no axis


In [10]:
# Negative case: a pandas-style integer axis. The recorded output shows it is
# rejected with the same "Invalid axis or no axis" message.
drop_data(data=data, axis=0)

Invalid axis or no axis


In [11]:
# Negative case: `axis` omitted while `threshold` is given. The recorded output
# is still "Invalid axis or no axis".
drop_data(data=data, threshold=0.5) 

Invalid axis or no axis


In [12]:
# Negative case: only `data` is passed. The recorded output is
# "Invalid axis or no axis".
drop_data(data=data)

Invalid axis or no axis


Invalid threshold

In [13]:
# Negative case: zero threshold on rows. The recorded output reports that the
# threshold cannot be zero or negative.
drop_data(data=data, axis='rows', threshold=0) 

Threshhold cannot be zero or negative


In [14]:
# Negative case: negative threshold on columns. The recorded output is the same
# zero-or-negative message as in the rows case.
drop_data(data=data, axis='cols', threshold=-5) 

Threshhold cannot be zero or negative


In [15]:
# Negative case: row threshold (11) larger than the fixture's 10 columns.
# The recorded output is "Threshold cannot be more than number of columns".
drop_data(data=data, axis='rows', threshold=11) 

Threshold cannot be more than number of columns


In [16]:
# Negative case: column threshold above 1.
# The recorded output is "Threshold cannot be equal to 1 or more".
drop_data(data=data, axis='cols', threshold=2.1) 

Threshold cannot be equal to 1 or more


-----

**Function `outliers_iqr`**

Dataset

In [17]:
# Load the CSV fixture used by the outlier and low-information sections below.
# The path is relative, so the notebook must be run from the directory that
# contains `data/`.
csv_path = 'data/sber_data.csv'
sber_data = pd.read_csv(csv_path)

sber_data

Unnamed: 0,id,full_sq,life_sq,floor,sub_area,preschool_quota,preschool_education_centers_raion,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,...,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,theater_km,museum_km,ecology,mosque_count_1000,price_doc
0,1,43,27.0,4.0,Bibirevo,5001.0,5,11065.0,5,0,...,0.637189,0.947962,0.177975,0.625783,0.628187,14.053047,7.389498,good,0,5850000
1,2,34,19.0,3.0,Nagatinskij Zaton,3119.0,5,6237.0,8,0,...,0.688796,1.072315,0.273345,0.967821,0.471447,6.829889,0.709260,excellent,0,6000000
2,3,43,29.0,2.0,Tekstil'shhiki,1463.0,4,5580.0,7,0,...,1.543049,0.391957,0.158072,3.178751,0.755946,4.273200,3.156423,poor,0,5700000
3,4,89,50.0,9.0,Mitino,6839.0,9,17063.0,10,0,...,0.934273,0.892674,0.236455,1.031777,1.561505,16.990677,16.041521,good,0,13100000
4,5,77,77.0,4.0,Basmannoe,3240.0,7,7770.0,9,0,...,0.077901,0.810801,0.376838,0.378756,0.121681,1.112486,1.800125,excellent,0,16331452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,30469,44,27.0,7.0,Otradnoe,5088.0,4,12721.0,4,0,...,1.103579,0.167718,0.349899,1.235066,0.612359,7.482165,4.836787,good,0,7400000
30467,30470,86,59.0,3.0,Tverskoe,1874.0,4,6772.0,4,1,...,0.069986,0.086552,0.362681,0.850385,0.310021,0.778428,1.450108,poor,0,25000000
30468,30471,45,,10.0,Poselenie Vnukovskoe,,0,,0,0,...,4.338453,1.339078,1.234235,1.192543,1.186621,13.459593,9.890758,no data,0,6970959
30469,30472,64,32.0,5.0,Obruchevskoe,2372.0,6,6083.0,8,0,...,1.204798,1.340017,0.130667,1.644053,0.476021,2.088193,4.119706,satisfactory,0,13500000


Cleaning the data

In [18]:
# Ask outliers_iqr for the rows it flags as outliers in the 'full_sq' column
# (what='outliers'); no bounds are passed, so the function's defaults apply.
# NOTE(review): the bound rule itself lives in data_cleaner — not visible here.
iqr_out = outliers_iqr(sber_data, 'full_sq', what='outliers')

iqr_out

Unnamed: 0,id,full_sq,life_sq,floor,sub_area,preschool_quota,preschool_education_centers_raion,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,...,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,theater_km,museum_km,ecology,mosque_count_1000,price_doc
79,80,133,64.0,2.0,Izmajlovo,1313.0,4,4339.0,6,0,...,0.804087,0.435858,0.676590,2.098633,0.968482,7.141546,5.413698,good,0,17600000
128,129,325,325.0,7.0,Ivanovskoe,2697.0,7,9439.0,8,1,...,3.310054,0.602138,0.369984,1.104165,1.247850,2.194396,1.143931,poor,0,5000000
146,147,102,64.0,11.0,Sokolinaja Gora,643.0,4,5180.0,4,0,...,0.999584,1.426903,1.163062,1.627478,0.601487,3.449316,1.557170,excellent,0,4600000
147,148,117,108.0,20.0,Nagatino-Sadovniki,2508.0,4,5254.0,6,0,...,0.237372,0.437805,0.157045,1.855743,1.549737,4.452140,2.014427,excellent,0,14103600
170,171,115,60.0,7.0,Krylatskoe,3092.0,7,7478.0,7,0,...,0.727716,0.681249,0.165629,0.287008,1.147352,12.239214,7.247341,good,0,37000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30320,30323,151,,5.0,Jasenevo,4172.0,7,10559.0,7,0,...,2.130257,0.683810,0.694617,1.270087,0.808215,9.897054,8.813169,good,0,28038712
30355,30358,133,,12.0,Izmajlovo,1313.0,4,4339.0,6,0,...,1.325438,0.442404,0.862449,0.569742,0.569372,5.379698,4.738857,good,0,26055288
30369,30372,124,,12.0,Izmajlovo,1313.0,4,4339.0,6,0,...,1.325438,0.442404,0.862449,0.569742,0.569372,5.379698,4.738857,good,0,26055288
30397,30400,109,60.0,11.0,Ramenki,903.0,8,7788.0,8,0,...,0.608047,0.822709,0.435947,1.729040,0.480137,2.858389,3.224508,satisfactory,0,38400000


In [19]:
# Report how many rows were flagged (963 in the recorded output).
print(f"Data has {iqr_out.shape[0]} outliers")

Data has 963 outliers


In [20]:
# Same column and default bounds as the 'outliers' call, but request the
# remaining rows instead (what='cleaned').
iqr_cld = outliers_iqr(sber_data, 'full_sq', what='cleaned')

iqr_cld

Unnamed: 0,id,full_sq,life_sq,floor,sub_area,preschool_quota,preschool_education_centers_raion,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,...,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,theater_km,museum_km,ecology,mosque_count_1000,price_doc
0,1,43,27.0,4.0,Bibirevo,5001.0,5,11065.0,5,0,...,0.637189,0.947962,0.177975,0.625783,0.628187,14.053047,7.389498,good,0,5850000
1,2,34,19.0,3.0,Nagatinskij Zaton,3119.0,5,6237.0,8,0,...,0.688796,1.072315,0.273345,0.967821,0.471447,6.829889,0.709260,excellent,0,6000000
2,3,43,29.0,2.0,Tekstil'shhiki,1463.0,4,5580.0,7,0,...,1.543049,0.391957,0.158072,3.178751,0.755946,4.273200,3.156423,poor,0,5700000
3,4,89,50.0,9.0,Mitino,6839.0,9,17063.0,10,0,...,0.934273,0.892674,0.236455,1.031777,1.561505,16.990677,16.041521,good,0,13100000
4,5,77,77.0,4.0,Basmannoe,3240.0,7,7770.0,9,0,...,0.077901,0.810801,0.376838,0.378756,0.121681,1.112486,1.800125,excellent,0,16331452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,30469,44,27.0,7.0,Otradnoe,5088.0,4,12721.0,4,0,...,1.103579,0.167718,0.349899,1.235066,0.612359,7.482165,4.836787,good,0,7400000
30467,30470,86,59.0,3.0,Tverskoe,1874.0,4,6772.0,4,1,...,0.069986,0.086552,0.362681,0.850385,0.310021,0.778428,1.450108,poor,0,25000000
30468,30471,45,,10.0,Poselenie Vnukovskoe,,0,,0,0,...,4.338453,1.339078,1.234235,1.192543,1.186621,13.459593,9.890758,no data,0,6970959
30469,30472,64,32.0,5.0,Obruchevskoe,2372.0,6,6083.0,8,0,...,1.204798,1.340017,0.130667,1.644053,0.476021,2.088193,4.119706,satisfactory,0,13500000


In [21]:
# Report how many rows remain after removing the IQR outliers.
print(f"Cleaned data has {iqr_cld.shape[0]} rows") 

# 'outliers' and 'cleaned' are expected to split sber_data between them, so no
# row may be lost or counted twice (recorded run: 963 + 29508 rows).
assert iqr_out.shape[0] + iqr_cld.shape[0] == sber_data.shape[0], (
    "outliers + cleaned rows do not add up to the original row count"
)

Cleaned data has 29508 rows


Errors

In [22]:
# Negative case: 'country' is not a column of sber_data. The recorded output is
# "No such feature in data's columns".
outliers_iqr(sber_data, 'country') 

No such feature in data's columns


In [23]:
# Negative case: a zero left bound and a negative right bound. The recorded
# output is "Bounds cannot be zero or negative".
outliers_iqr(sber_data, 'full_sq', left=0, right=-3.4)

Bounds cannot be zero or negative


In [24]:
# Negative case: `what` omitted. The recorded output ("Invalid literal for
# 'what'") shows the argument has no usable default and must be given.
outliers_iqr(sber_data, 'full_sq')

Invalid literal for 'what'


In [25]:
# Negative case: an unsupported `what` value. Only 'outliers' and 'cleaned' are
# used successfully in this notebook; this one yields "Invalid literal for 'what'".
outliers_iqr(sber_data, 'full_sq', what='idk?') 

Invalid literal for 'what'


----

**Function `outliers_sigmas`**

Cleaning the data

In [26]:
# Ask outliers_sigmas for the rows it flags as outliers in 'mkad_km'.
# NOTE(review): presumably log_scale=True applies the rule to log-transformed
# values — confirm in data_cleaner.
z_out = outliers_sigmas(sber_data, 'mkad_km', log_scale=True, what='outliers')

In [27]:
# Preview the first flagged rows rather than the whole frame.
z_out.head()

Unnamed: 0,id,full_sq,life_sq,floor,sub_area,preschool_quota,preschool_education_centers_raion,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,...,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,theater_km,museum_km,ecology,mosque_count_1000,price_doc
12742,12745,134,,1.0,Poselenie Rogovskoe,,0,,0,0,...,10.623702,24.268209,47.394706,45.66906,2.533713,87.274546,59.203148,no data,0,5798497
14426,14429,70,42.0,3.0,Poselenie Kievskij,,0,,0,0,...,10.041862,9.767735,34.537964,30.17082,0.937914,52.153757,47.462727,no data,0,4250000
14685,14688,167,,1.0,Poselenie Rogovskoe,,0,,0,0,...,10.623702,24.268209,47.394706,45.66906,2.533713,87.274546,59.203148,no data,0,7115196
14729,14732,135,,1.0,Poselenie Rogovskoe,,0,,0,0,...,10.623702,24.268209,47.394706,45.66906,2.533713,87.274546,59.203148,no data,0,6852768
14895,14898,167,,1.0,Poselenie Rogovskoe,,0,,0,0,...,10.623702,24.268209,47.394706,45.66906,2.533713,87.274546,59.203148,no data,0,6740712


In [28]:
# Report how many rows were flagged (33 in the recorded output).
print(f"Data has {z_out.shape[0]} outliers")

Data has 33 outliers


In [29]:
# Same column and log_scale setting as the 'outliers' call, but request the
# remaining rows instead (what='cleaned').
z_cld = outliers_sigmas(sber_data, 'mkad_km', log_scale=True, what='cleaned')

In [35]:
# Display the cleaned frame.
# NOTE(review): this cell's execution count (35) is out of sequence with its
# neighbours (29, 30) — re-run with Restart & Run All before sharing.
z_cld

Unnamed: 0,id,full_sq,life_sq,floor,sub_area,preschool_quota,preschool_education_centers_raion,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,...,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,theater_km,museum_km,ecology,mosque_count_1000,price_doc
0,1,43,27.0,4.0,Bibirevo,5001.0,5,11065.0,5,0,...,0.637189,0.947962,0.177975,0.625783,0.628187,14.053047,7.389498,good,0,5850000
1,2,34,19.0,3.0,Nagatinskij Zaton,3119.0,5,6237.0,8,0,...,0.688796,1.072315,0.273345,0.967821,0.471447,6.829889,0.709260,excellent,0,6000000
2,3,43,29.0,2.0,Tekstil'shhiki,1463.0,4,5580.0,7,0,...,1.543049,0.391957,0.158072,3.178751,0.755946,4.273200,3.156423,poor,0,5700000
3,4,89,50.0,9.0,Mitino,6839.0,9,17063.0,10,0,...,0.934273,0.892674,0.236455,1.031777,1.561505,16.990677,16.041521,good,0,13100000
4,5,77,77.0,4.0,Basmannoe,3240.0,7,7770.0,9,0,...,0.077901,0.810801,0.376838,0.378756,0.121681,1.112486,1.800125,excellent,0,16331452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,30469,44,27.0,7.0,Otradnoe,5088.0,4,12721.0,4,0,...,1.103579,0.167718,0.349899,1.235066,0.612359,7.482165,4.836787,good,0,7400000
30467,30470,86,59.0,3.0,Tverskoe,1874.0,4,6772.0,4,1,...,0.069986,0.086552,0.362681,0.850385,0.310021,0.778428,1.450108,poor,0,25000000
30468,30471,45,,10.0,Poselenie Vnukovskoe,,0,,0,0,...,4.338453,1.339078,1.234235,1.192543,1.186621,13.459593,9.890758,no data,0,6970959
30469,30472,64,32.0,5.0,Obruchevskoe,2372.0,6,6083.0,8,0,...,1.204798,1.340017,0.130667,1.644053,0.476021,2.088193,4.119706,satisfactory,0,13500000


In [30]:
# Report how many rows remain after removing the sigma-rule outliers.
print(f"Cleaned data has {z_cld.shape[0]} rows")

# 'outliers' and 'cleaned' are expected to split sber_data between them, so no
# row may be lost or counted twice (recorded run: 33 + 30438 rows).
assert z_out.shape[0] + z_cld.shape[0] == sber_data.shape[0], (
    "outliers + cleaned rows do not add up to the original row count"
)

Cleaned data has 30438 rows


----

**Function `drop_low_information`**

In [31]:
# Run drop_low_information with its default `limit`.
# In the recorded output `id` and `mosque_count_1000` are no longer among the
# visible columns.
# NOTE(review): the criterion for "low information" lives in data_cleaner — not
# visible here.
informative_data = drop_low_information(data=sber_data)

informative_data

Unnamed: 0,full_sq,life_sq,floor,sub_area,preschool_quota,preschool_education_centers_raion,school_quota,school_education_centers_raion,school_education_centers_top_20_raion,hospital_beds_raion,...,shopping_centers_km,office_km,additional_education_km,preschool_km,big_church_km,church_synagogue_km,theater_km,museum_km,ecology,price_doc
0,43,27.0,4.0,Bibirevo,5001.0,5,11065.0,5,0,240.0,...,0.648488,0.637189,0.947962,0.177975,0.625783,0.628187,14.053047,7.389498,good,5850000
1,34,19.0,3.0,Nagatinskij Zaton,3119.0,5,6237.0,8,0,229.0,...,0.519311,0.688796,1.072315,0.273345,0.967821,0.471447,6.829889,0.709260,excellent,6000000
2,43,29.0,2.0,Tekstil'shhiki,1463.0,4,5580.0,7,0,1183.0,...,1.486533,1.543049,0.391957,0.158072,3.178751,0.755946,4.273200,3.156423,poor,5700000
3,89,50.0,9.0,Mitino,6839.0,9,17063.0,10,0,,...,0.599914,0.934273,0.892674,0.236455,1.031777,1.561505,16.990677,16.041521,good,13100000
4,77,77.0,4.0,Basmannoe,3240.0,7,7770.0,9,0,562.0,...,0.429052,0.077901,0.810801,0.376838,0.378756,0.121681,1.112486,1.800125,excellent,16331452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,44,27.0,7.0,Otradnoe,5088.0,4,12721.0,4,0,,...,0.325885,1.103579,0.167718,0.349899,1.235066,0.612359,7.482165,4.836787,good,7400000
30467,86,59.0,3.0,Tverskoe,1874.0,4,6772.0,4,1,1046.0,...,0.540003,0.069986,0.086552,0.362681,0.850385,0.310021,0.778428,1.450108,poor,25000000
30468,45,,10.0,Poselenie Vnukovskoe,,0,,0,0,,...,1.806570,4.338453,1.339078,1.234235,1.192543,1.186621,13.459593,9.890758,no data,6970959
30469,64,32.0,5.0,Obruchevskoe,2372.0,6,6083.0,8,0,3300.0,...,1.108672,1.204798,1.340017,0.130667,1.644053,0.476021,2.088193,4.119706,satisfactory,13500000


In [32]:
# Compare the column count before and after drop_low_information.
inf_cols = informative_data.shape[1]

sber_cols = sber_data.shape[1]

print(f"Old data columns number is {sber_cols} \n" 
      f"New data columns number is {inf_cols} \n"
      f"Are they equal? - {sber_cols == inf_cols}")

# Enforce what the printout only lets a reader eyeball: on this dataset the
# call must remove at least one column (recorded run: 61 -> 55) while leaving
# the rows untouched.
assert inf_cols < sber_cols, "drop_low_information did not remove any column"
assert informative_data.shape[0] == sber_data.shape[0], (
    "drop_low_information changed the number of rows"
)

Old data columns number is 61 
New data columns number is 55 
Are they equal? - False


Invalid limit

In [33]:
# Negative case: `limit` above 1. The recorded output is
# "Invalid value for 'limit'".
drop_low_information(data=sber_data, limit=1.23)

Invalid value for 'limit'


In [34]:
# Negative case: negative `limit`. The recorded output is
# "Invalid value for 'limit'".
drop_low_information(data=sber_data, limit=-2)

Invalid value for 'limit'


---