In [1]:
%matplotlib inline

In [2]:
import itertools
from time import sleep

import gc
import numpy as np
import pandas as pd
from pathlib2 import Path
from tqdm import tqdm_notebook

In [3]:
data_path = Path('data')

item_categories = pd.read_csv(data_path / 'item_categories.csv')
items = pd.read_csv(data_path / 'items.csv')
shops = pd.read_csv(data_path / 'shops.csv')

train = pd.read_csv(data_path / 'sales_train.csv')
test = pd.read_csv(data_path / 'test.csv')

groupby_cols = ['date_block_num', 'shop_id', 'item_id']

In [4]:
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [5]:
items.head()
shops.head()
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [6]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [7]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [8]:
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


Data Description

In [10]:
def basic_eda(df):
    print("----------TOP 5 RECORDS--------")
    print(df.head(5))
    print("----------INFO-----------------")
    print(df.info())
    print("----------Describe-------------")
    print(df.describe())
    print("----------Columns--------------")
    print(df.columns)
    print("----------Data Types-----------")
    print(df.dtypes)
    print("-------Missing Values----------")
    print(df.isnull().sum())
    print("-------NULL values-------------")
    print(df.isna().sum())
    print("-----Shape Of Data-------------")
    print(df.shape)

In [14]:
print("=============================Sales Data=============================")
basic_eda(train)
print("=============================Test data=============================")
basic_eda(test)
print("=============================Item Categories=============================")
basic_eda(item_categories)
print("=============================Items=============================")
basic_eda(items)
print("=============================Shops=============================")
basic_eda(shops)


----------TOP 5 RECORDS--------
         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0
----------INFO-----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
 #   Column          Dtype  
---  ------          -----  
 0   date            object 
 1   date_block_num  int64  
 2   shop_id         int64  
 3   item_id         int64  
 4   item_price      float64
 5   item_cnt_day    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB
None
----------Describe-------------
       date_block_num       shop_id       item_

In [15]:
train = train[train.item_price < 100000]
train = train[train.item_cnt_day < 1001]
train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [16]:
train.tail()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2935844,10.10.2015,33,25,7409,299.0,1.0
2935845,09.10.2015,33,25,7460,299.0,1.0
2935846,14.10.2015,33,25,7459,349.0,1.0
2935847,22.10.2015,33,25,7440,299.0,1.0
2935848,03.10.2015,33,25,7460,299.0,1.0


In [17]:
train.item_cnt_day.unique()

array([   1.,   -1.,    3.,    2.,    4.,    5.,   13.,    7.,    6.,
          8.,   10.,   11.,   30.,   22.,   14.,   15.,    9.,   12.,
         17.,   19.,   20.,   -5.,   -6.,   -3.,   -2.,   16.,   64.,
         18.,   29.,   26.,   39.,   21.,   27.,   53.,   25.,   23.,
         24.,   40.,   28.,   31.,   36.,   -4.,   32.,   35.,   41.,
         88.,   44.,   52.,   56.,   38.,   34.,   57.,   69.,   37.,
         47.,   50.,   48.,   33.,   61.,   80.,   42.,   82.,  200.,
        100.,   76.,   77.,  107.,   46.,   49.,   70.,   55.,   60.,
         45.,   71.,   65.,  150.,   85.,   89.,   43.,   58.,  217.,
        104.,   84.,  168.,   92.,  156.,   68.,   62.,  264.,  110.,
         83.,  133.,   51.,  148.,   54.,   67.,  106.,  109.,  102.,
         90.,  288.,  171.,  131.,   87.,   78.,  151.,   59.,  113.,
        118.,  127.,   97.,  135.,  -22.,  103.,  111.,   96.,  242.,
        105.,   72.,  126.,   73.,  101.,   91.,   63.,  194.,  255.,
         98.,   81.,

In [19]:
train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935847.0,2935847.0,2935847.0,2935847.0,2935847.0
mean,14.56991,33.00174,10197.23,890.7489,1.241903
std,9.422985,16.22697,6324.299,1720.49,2.292963
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,59200.0,1000.0


ModuleNotFoundError: No module named 'plotly'

In [22]:
pip install plotly

Collecting plotly
  Downloading plotly-5.13.1-py2.py3-none-any.whl (15.2 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.13.1 tenacity-8.2.2
Note: you may need to restart the kernel to use updated packages.


In [23]:
import plotly.express as px
fig = px.scatter(x=train.Item_price )

fig.show()

AttributeError: 'DataFrame' object has no attribute 'Item_price'