In [77]:
import numpy as np 
import pandas as pd
import matplotlib as mtl
pd.set_option("display.max_rows", 300)

In [78]:
%%time
item_cat = pd.read_csv("raw_data/item_categories.csv")
items = pd.read_csv("raw_data/items.csv")
sales_train = pd.read_csv("raw_data/sales_train.csv")
shops = pd.read_csv("raw_data/shops.csv")
test = pd.read_csv("raw_data/test.csv")
# sample_submission = pd.read_csv("sample_submission.csv")

CPU times: user 864 ms, sys: 70 ms, total: 934 ms
Wall time: 932 ms


# Item categories #

In [79]:
item_cat.sample(10)
# description is clean

Unnamed: 0,item_category_name,item_category_id
37,Кино - Blu-Ray,37
69,Подарки - Сувениры,69
22,Игры - PSVita,22
58,Музыка - Винил,58
5,Аксессуары - PSVita,5
59,Музыка - Музыкальное видео,59
39,Кино - Blu-Ray 4K,39
30,Игры PC - Стандартные издания,30
83,Элементы питания,83
61,Подарки - Атрибутика,61


In [80]:
item_cat.info()
# 0 null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   item_category_name  84 non-null     object
 1   item_category_id    84 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [81]:
item_cat.describe(include=[object])
# no duplicates

Unnamed: 0,item_category_name
count,84
unique,84
top,PC - Гарнитуры/Наушники
freq,1


In [82]:
item_cat.describe()
# normal indexing range

Unnamed: 0,item_category_id
count,84.0
mean,41.5
std,24.392622
min,0.0
25%,20.75
50%,41.5
75%,62.25
max,83.0


# Shops 

In [83]:
shops.iloc[:]
# shops 38-40 have very common name, but their sales and counts of 
# items is kinda different, so i decided to leave them all.

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


In [84]:
shops['shop_name'] = shops['shop_name'].str.replace("!|\?|²|\*|/| фран", '', regex=True)
shops.head(10)
# cleaning shop names from excess symbols

Unnamed: 0,shop_name,shop_id
0,"Якутск Орджоникидзе, 56",0
1,"Якутск ТЦ ""Центральный""",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


In [85]:
shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   shop_name  60 non-null     object
 1   shop_id    60 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ KB


In [86]:
shops.describe(include=[object])
# after cleaning have appeared duplicates

Unnamed: 0,shop_name
count,60
unique,57
top,"Якутск Орджоникидзе, 56"
freq,2


In [87]:
shops.loc[shops.duplicated("shop_name", keep=False)]

Unnamed: 0,shop_name,shop_id
0,"Якутск Орджоникидзе, 56",0
1,"Якутск ТЦ ""Центральный""",1
10,Жуковский ул. Чкалова 39м,10
11,Жуковский ул. Чкалова 39м,11
57,"Якутск Орджоникидзе, 56",57
58,"Якутск ТЦ ""Центральный""",58


In [88]:
# take a look at one set of duplicats
test.loc[test.shop_id == 10]

Unnamed: 0,ID,shop_id,item_id
30600,30600,10,5037
30601,30601,10,5320
30602,30602,10,5233
30603,30603,10,5232
30604,30604,10,5268
...,...,...,...
35695,35695,10,18454
35696,35696,10,16188
35697,35697,10,15757
35698,35698,10,19648


In [89]:
test.loc[test.shop_id == 11]
#See that duplicat with id=11 is not presented in test set.

Unnamed: 0,ID,shop_id,item_id


In [90]:
sales_train.loc[sales_train.shop_id == 10]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
53564,26.01.2013,0,10,6000,190.0,1.0
53565,12.01.2013,0,10,6007,180.0,1.0
53566,08.01.2013,0,10,6093,200.0,1.0
53567,12.01.2013,0,10,6093,200.0,1.0
53568,13.01.2013,0,10,6095,1321.0,1.0
...,...,...,...,...,...,...
2919918,03.10.2015,33,10,7893,2465.0,1.0
2919919,03.10.2015,33,10,7879,2209.0,1.0
2919920,20.10.2015,33,10,7942,3418.0,1.0
2919921,24.10.2015,33,10,7933,822.0,1.0


In [91]:
sales_train.loc[sales_train.shop_id == 11]
# However duplicat with id = 11 is presented in sales_train

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2461045,27.02.2015,25,11,22162,237.11,1.0
2461046,23.02.2015,25,11,22162,237.11,1.0
2461047,20.02.2015,25,11,22162,270.00,1.0
2461048,11.02.2015,25,11,22162,270.00,1.0
2461049,10.02.2015,25,11,22162,270.00,1.0
...,...,...,...,...,...,...
2462003,12.02.2015,25,11,10389,219.00,1.0
2462004,07.02.2015,25,11,10382,460.00,1.0
2462005,17.02.2015,25,11,10379,195.00,1.0
2462006,18.02.2015,25,11,10298,99.00,1.0


In [92]:
# dealing with duplicates with same name but different id:
# 1. If in test set are presented more than 1 duplicates of the shop, 
#    deciding to not touch them.
# 2. If only 1 or 0(i.e. shop isn't in test set) duplicats of shop are 
#    presented in test set, then merging their id.
dup = shops.loc[shops.duplicated("shop_name", keep=False)] \
    .groupby("shop_name") \
    .agg(lambda x: list(x)) \
    .shop_id
test_shops = test.shop_id.to_numpy()
for idx, value in dup.items():
    value: np.ndarray = np.array(value)
    mask = np.isin(value, test_shops)
    if np.sum(mask) < 2:
        shops.loc[value, 'shop_id'] = value[mask][0] if value[mask].size > 0 else value[0]
        shops.drop_duplicates("shop_id", inplace=True)
        sales_train.loc[sales_train.shop_id.isin(value), 'shop_id'] = value[mask][0] if value[mask].size > 0 \
            else value[0]
shops.reset_index(drop=True, inplace=True)

In [93]:
shops.loc[shops.duplicated("shop_name")]
# no duplicates after cleaning

Unnamed: 0,shop_name,shop_id


In [94]:
shops.loc[:]

Unnamed: 0,shop_name,shop_id
0,"Якутск Орджоникидзе, 56",57
1,"Якутск ТЦ ""Центральный""",58
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4
5,"Вологда ТРЦ ""Мармелад""",5
6,"Воронеж (Плехановская, 13)",6
7,"Воронеж ТРЦ ""Максимир""",7
8,"Воронеж ТРЦ Сити-Парк ""Град""",8
9,Выездная Торговля,9


In [95]:
shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   shop_name  57 non-null     object
 1   shop_id    57 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ KB


In [96]:
sales_train.loc[sales_train.shop_id == 11]
# id of duplicates are succesfully merged

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day


# Items #

In [97]:
 items.head(20)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40
5,***НОВЫЕ АМЕРИКАНСКИЕ ГРАФФИТИ (UNI) ...,5,40
6,***УДАР ПО ВОРОТАМ (UNI) D,6,40
7,***УДАР ПО ВОРОТАМ-2 (UNI) D,7,40
8,***ЧАЙ С МУССОЛИНИ D,8,40
9,***ШУГАРЛЭНДСКИЙ ЭКСПРЕСС (UNI) D,9,40


In [98]:
items['item_name'] = items['item_name'].str.replace("!|\?|²|\*|/", '', regex=True)
items.head(20)
# cleaning item names from excess symbols

Unnamed: 0,item_name,item_id,item_category_id
0,ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,ABBYY FineReader 12 Professional Edition Full ...,1,76
2,В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,КОРОБКА (СТЕКЛО) D,4,40
5,НОВЫЕ АМЕРИКАНСКИЕ ГРАФФИТИ (UNI) D,5,40
6,УДАР ПО ВОРОТАМ (UNI) D,6,40
7,УДАР ПО ВОРОТАМ-2 (UNI) D,7,40
8,ЧАЙ С МУССОЛИНИ D,8,40
9,ШУГАРЛЭНДСКИЙ ЭКСПРЕСС (UNI) D,9,40


In [99]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


In [100]:
items.describe(include=[object])

Unnamed: 0,item_name
count,22170
unique,22166
top,МУЖИКИ (регион)
freq,2


In [101]:
items.loc[items.duplicated('item_name', keep=False)]

Unnamed: 0,item_name,item_id,item_category_id
12,МИХЕЙ И ДЖУМАНДЖИ Сука любовь,12,55
13011,КТО Я,13011,40
13012,КТО Я,13012,40
14690,МИХЕЙ И ДЖУМАНДЖИ Сука любовь,14690,55
14855,МУЖИКИ (регион),14855,40
14856,МУЖИКИ (регион),14856,40
15537,НА КРЮЧКЕ (BD),15537,37
15539,НА КРЮЧКЕ (BD),15539,37


In [102]:
# dealing with duplicates with same name but different id:
# 1. If in test set are presented more than 1 duplicate of the item, 
#    deciding to not touch them.
# 2. If only 1 or 0(i.e. shop isn't in test set) duplicats of item are 
#    presented in test set, then merging their id.

In [103]:
# group togheter duplicates of same item name
dup = items.loc[items.duplicated("item_name", keep=False)]\
                            .groupby("item_name")\
                            .agg(lambda x: list(x))\
                            .item_id
dup                                                    

item_name
КТО Я                             [13011, 13012]
МИХЕЙ И ДЖУМАНДЖИ  Сука любовь       [12, 14690]
МУЖИКИ (регион)                   [14855, 14856]
НА КРЮЧКЕ (BD)                    [15537, 15539]
Name: item_id, dtype: object

In [104]:
# converting test set to numpy for following operations
test_items = test.item_id.to_numpy()
test_items

array([ 5037,  5320,  5233, ..., 15757, 19648,   969])

In [105]:
# for every duplicate name check if it take part of test set.
# If at least 2 duplicates of one name are in test set, we don't
# touch them. Otherwise, we merge all duplicates of the same name 
# in sales_train and items.
for idx, value in dup.items():
    value = np.array(value)
    mask = np.isin(value,test_items)
    if np.sum(mask) < 2:
        items.loc[value, 'item_id'] = value[mask][0] if value[mask].size > 0 else value[0]
        items.drop_duplicates("item_id", inplace = True)
        sales_train.loc[sales_train.item_id.isin(value), 'item_id'] = value[mask][0] if value[mask].size > 0 else value[0]
items.reset_index(inplace=True)

In [106]:
items.describe(include=['object'])
# ✨ clean ✨

Unnamed: 0,item_name
count,22166
unique,22166
top,ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D
freq,1


In [107]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22166 entries, 0 to 22165
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   index             22166 non-null  int64 
 1   item_name         22166 non-null  object
 2   item_id           22166 non-null  int64 
 3   item_category_id  22166 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 692.8+ KB


# Sales

In [108]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [109]:
# deleting column date_block_num because it's redundant and 
# can be retrieved from date column.
sales_train.drop(columns='date_block_num',inplace=True)
sales_train.head()

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,59,22154,999.0,1.0
1,03.01.2013,25,2552,899.0,1.0
2,05.01.2013,25,2552,899.0,-1.0
3,06.01.2013,25,2554,1709.05,1.0
4,15.01.2013,25,2555,1099.0,1.0


In [110]:
# Converting date to appropriate format
sales_train['date'] = pd.to_datetime(sales_train['date'], dayfirst=True)
sales_train.head()

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,59,22154,999.0,1.0
1,2013-01-03,25,2552,899.0,1.0
2,2013-01-05,25,2552,899.0,-1.0
3,2013-01-06,25,2554,1709.05,1.0
4,2013-01-15,25,2555,1099.0,1.0


In [111]:
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 5 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   shop_id       int64         
 2   item_id       int64         
 3   item_price    float64       
 4   item_cnt_day  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 112.0 MB


In [112]:
# delete duplicates
sales_train.drop_duplicates(inplace=True, ignore_index = True)
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935843 entries, 0 to 2935842
Data columns (total 5 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   shop_id       int64         
 2   item_id       int64         
 3   item_price    float64       
 4   item_cnt_day  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 112.0 MB


In [113]:
sales_train.isnull().sum()
# zero null values

date            0
shop_id         0
item_id         0
item_price      0
item_cnt_day    0
dtype: int64

In [114]:
sales_train.sample(10)

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
1462084,2014-03-17,54,1002,69.0,1.0
969020,2013-10-30,50,6998,199.0,1.0
2931917,2015-10-20,21,5039,1499.0,1.0
1897887,2014-08-10,57,14066,99.0,1.0
1264637,2013-12-15,42,2808,999.0,1.0
1906602,2014-08-06,54,19685,74.0,1.0
1951316,2014-08-19,26,10989,349.0,1.0
2675933,2015-06-27,48,1916,249.0,1.0
1912709,2014-08-06,17,21540,349.0,1.0
1536603,2014-03-06,13,10500,475.0,1.0


In [115]:
sales_train.describe()

Unnamed: 0,shop_id,item_id,item_price,item_cnt_day
count,2935843.0,2935843.0,2935843.0,2935843.0
mean,33.30316,10197.23,890.8535,1.242641
std,16.14734,6324.29,1729.801,2.618837
min,2.0,0.0,-1.0,-22.0
25%,22.0,4476.0,249.0,1.0
50%,31.0,9343.0,399.0,1.0
75%,48.0,15684.0,999.0,1.0
max,59.0,22169.0,307980.0,2169.0


In [116]:
sales_train.loc[sales_train.item_price > 5e4]
# looking how many outliers with big price are presented is sales_train

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
885137,2013-09-17,12,11365,59200.0,1.0
1163157,2013-12-13,12,6066,307980.0,1.0
1488133,2014-03-20,25,13199,50999.0,1.0


In [117]:
items.iloc[6066]
# 522 persons, now it's clear why price is so high

index                               6066
item_name           Radmin 3  - 522 лиц.
item_id                             6066
item_category_id                      75
Name: 6066, dtype: object

In [118]:
sales_train.loc[sales_train.item_price < 1]
# looking how many outliers with low price are presented in sales_train

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
484682,2013-05-15,32,2973,-1.000000,1.0
531560,2013-06-16,30,11859,0.100000,1.0
531563,2013-06-13,30,11855,0.100000,1.0
531565,2013-06-06,30,11858,0.100000,1.0
531566,2013-06-09,30,11858,0.100000,1.0
...,...,...,...,...,...
2898261,2015-10-02,58,15334,0.990000,1.0
2898262,2015-10-11,58,15334,0.990000,1.0
2898263,2015-10-01,58,15344,0.990000,2.0
2909812,2015-10-28,12,11373,0.908714,2169.0


In [119]:
sales_train.loc[sales_train.item_price < 0.1]
# looking how many outliers with very low price are presented in
# sales_train

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
484682,2013-05-15,32,2973,-1.0,1.0
579765,2013-06-11,6,11864,0.07,1.0
608762,2013-06-08,58,11865,0.07,1.0
611399,2013-06-08,58,20146,0.0875,4.0
673154,2013-07-31,6,11872,0.09,1.0


In [121]:
items.iloc[[2973,11864, 11685, 20146, 11872]]
# definetely errors, these productsthey can't be worth that little

Unnamed: 0,index,item_name,item_id,item_category_id
2973,2973,"DmC Devil May Cry [PS3, русские субтитры]",2973,19
11864,11864,Змей ДЖЕККИ с сердечком 36 см,11864,63
11685,11685,ЗДК ПРИКЛЮЧЕНИЯ МЮНХГАУЗЕНА,11685,41
20146,20150,Умный пластилин Аметист 80 г,20150,69
11872,11872,Змея ВАСИЛИСА 23 см,11872,63


In [124]:
temp = sales_train.loc[(sales_train['item_cnt_day'] <= 0)]
temp 
# looking at examples with negative item_cnt_day

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
2,2013-01-05,25,2552,899.0,-1.0
148,2013-01-23,25,2321,999.0,-1.0
175,2013-01-07,25,2199,1449.0,-1.0
807,2013-01-02,25,2330,599.0,-1.0
1041,2013-01-13,25,5034,1989.0,-1.0
...,...,...,...,...,...
2934237,2015-10-26,25,3917,449.0,-1.0
2934456,2015-10-18,25,4896,6398.0,-1.0
2935257,2015-10-05,25,10039,249.0,-1.0
2935637,2015-10-16,25,7893,2990.0,-1.0


In [127]:
items.loc[items.item_id.isin(temp.item_id)]
# Majority of this item's is games and films, it's common to return
# them in shop.

Unnamed: 0,index,item_name,item_id,item_category_id
28,28,"007 Legends [PС, Jewel, русская версия]",28,30
29,29,"007 Legends [Xbox 360, русская версия]",29,23
30,30,007: КООРДИНАТЫ «СКАЙФОЛЛ»,30,40
31,31,007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),31,37
32,32,1+1,32,40
...,...,...,...,...
22143,22147,Я-ЛЕГЕНДА WB (регион),22147,40
22151,22155,"ЯДЫ, ИЛИ ВСЕМИРНАЯ ИСТОРИЯ ОТРАВЛЕНИЙ (BD)",22155,37
22158,22162,ЯРОСТЬ,22162,40
22160,22164,ЯРОСТЬ (BD),22164,37


In [128]:
temp.describe()

Unnamed: 0,shop_id,item_id,item_price,item_cnt_day
count,7356.0,7356.0,7356.0,7356.0
mean,32.514546,8982.021207,1442.945623,-1.02515
std,16.825725,6166.315366,2419.680132,0.368794
min,2.0,28.0,0.5,-22.0
25%,19.0,3734.0,349.0,-1.0
50%,31.0,6927.0,799.0,-1.0
75%,47.0,14056.0,1699.0,-1.0
max,59.0,22167.0,33490.0,-1.0


In [129]:
sales_train.loc[(sales_train['item_cnt_day'] > 800)]

Unnamed: 0,date,shop_id,item_id,item_price,item_cnt_day
2326924,2015-01-15,12,20949,4.0,1000.0
2909812,2015-10-28,12,11373,0.908714,2169.0


In [130]:
sales_train.loc[sales_train.item_cnt_day <= 0]\
            .groupby('date').count().sort_index()
# see that every day there are some negative item_cnt_day, 
# so I decided that negative item_cnt_day may mean return of
# item. Clarify my assumption with customer is impossible,
# so let's presume I'm right

Unnamed: 0_level_0,shop_id,item_id,item_price,item_cnt_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,6,6,6,6
2013-01-02,34,34,34,34
2013-01-03,21,21,21,21
2013-01-04,11,11,11,11
2013-01-05,14,14,14,14
...,...,...,...,...
2015-10-27,4,4,4,4
2015-10-28,6,6,6,6
2015-10-29,2,2,2,2
2015-10-30,3,3,3,3


In [131]:
# drop sales_train examples:
# 1. which item_id doesn't belong to items
# 2. which shop_id doesn't belong to shops
# 3. which item_price is to low or to high:
#      Mean value of item price is 890. Moreover we can define that
#      it's russian rubles. Have a price less than 1 ruble is near
#      impossible, so I made a restriction of 0.1 ruble min price.
#      Max item_price is 300_000, big but possible. I made a 
#      restriction of 10_000_000 rubles.
# 4. which item_cnt_day is to low or to high:
#      min negative item_cnt_day = -22, and most of times it don't 
#      surpass -1, so let low boudary be -1000 is more than enough.
#      Same with high boundary, max value is 2000 but 75% of times
#      it don't surpass 1, so 100_000 reasonable high boundary.
drop_conditions = (~sales_train.item_id.isin(items['item_id'])) | \
                      (~sales_train.shop_id.isin(shops['shop_id'])) | \
                      (sales_train.item_price < 0.1) | \
                      (sales_train.item_price > 1e7) | \
                      (sales_train.item_cnt_day > 1e5) | \
                      (sales_train.item_cnt_day < -1e3)

sales_train = sales_train.drop(sales_train.loc[drop_conditions].index)
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2935838 entries, 0 to 2935842
Data columns (total 5 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   shop_id       int64         
 2   item_id       int64         
 3   item_price    float64       
 4   item_cnt_day  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 134.4 MB


# Test #


In [132]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [133]:
test.shape

(214200, 3)

In [134]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   ID       214200 non-null  int64
 1   shop_id  214200 non-null  int64
 2   item_id  214200 non-null  int64
dtypes: int64(3)
memory usage: 4.9 MB


In [135]:
test.loc[test.duplicated()]
# check duplicates

Unnamed: 0,ID,shop_id,item_id


In [77]:
test.describe()
# check if distribution is ok

Unnamed: 0,ID,shop_id,item_id
count,214200.0,214200.0,214200.0
mean,107099.5,31.642857,11019.398627
std,61834.358168,17.561933,6252.64459
min,0.0,2.0,30.0
25%,53549.75,16.0,5381.5
50%,107099.5,34.5,11203.0
75%,160649.25,47.0,16071.5
max,214199.0,59.0,22167.0


## Done!