In [32]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import os

In [6]:
def load_data(data_dir):
    """Load every CSV file in ``data_dir`` into a dictionary of DataFrames.

    Only regular files whose extension is ``.csv`` (case-insensitive) are
    read. Sub-directories and any other files that happen to live in the
    directory (e.g. ``.DS_Store`` or notebook checkpoints) are skipped instead
    of being handed to ``pd.read_csv``.

    Parameters
    ----------
    data_dir : str or os.PathLike
        Directory that contains the CSV files.

    Returns
    -------
    dict
        Maps each file name with its final extension removed
        (``'items.csv'`` -> ``'items'``) to the DataFrame returned by
        ``pd.read_csv`` for that file. Keys appear in ``os.listdir`` order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``data_dir`` cannot be listed
        (for instance ``FileNotFoundError`` if it does not exist).
    """
    data_dict = {}
    for file in os.listdir(data_dir):
        # splitext strips only the last extension, so 'sales.v2.csv' keeps
        # the distinct key 'sales.v2' rather than collapsing to 'sales'.
        file_name, extension = os.path.splitext(file)
        file_path = os.path.join(data_dir, file)
        if extension.lower() != '.csv' or not os.path.isfile(file_path):
            continue
        data_dict[file_name] = pd.read_csv(file_path)

    return data_dict
    
# Read the contents of the local 'data' directory into a dict of DataFrames
# (one entry per file, as built by load_data above).
data = load_data('data')

In [7]:
# List the names of the tables that were loaded.
data.keys()

dict_keys(['items', 'shops', 'item_categories', 'sales_train', 'test', 'sample_submission'])

At this point I don't need 'test' and 'sample_submission', so I remove them from the dictionary.

In [8]:
# Remove the tables that are not used in this exploration.
# dict.pop with a default handles each key independently: a key that is
# already gone is ignored without stopping the other one from being removed,
# so the cell can safely be re-run.
for unused_table in ('test', 'sample_submission'):
    data.pop(unused_table, None)

#### Basic overview of data

In [93]:
# Horizontal rule printed after each table's report.
end_line = '_' * 100

# For every loaded table: its name, per-column null counts, then the
# DataFrame.info() summary (which prints directly to stdout).
for key, value in data.items():
    null_counts = value.isnull().sum()
    print(f'{key}: \n')
    print(f"Null: \n {null_counts} \n")
    value.info()
    print(end_line)

items: 

Null: 
 item_name           0
item_id             0
item_category_id    0
dtype: int64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_name         22170 non-null  object
 1   item_id           22170 non-null  int64 
 2   item_category_id  22170 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 519.7+ KB
____________________________________________________________________________________________________
shops: 

Null: 
 shop_name    0
shop_id      0
dtype: int64 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   shop_name  60 non-null     object
 1   shop_id    60 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ KB
_________________________________________

In [10]:
# Show which tables remain in the dictionary at this point.
data.keys()

dict_keys(['items', 'shops', 'item_categories', 'sales_train'])

### Data relationships:

items.csv:  
- Unique values [item_name], [item_id -> sales_train.csv]
- [item_category_id -> item_categories.csv]

shops.csv:
- Unique values [shop_name], [shop_id -> sales_train.csv]

item_categories.csv:
- Unique values [item_category_name], [item_category_id]

sales_train.csv:
- main data
- Time data [date] (convert to datetime) [dd.mm.yy]
- date_block_num?
    - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33

- item_cnt_day?  
    - number of products sold

#### Convert dates from type string to datetime

In [39]:
# `sales` is an alias for data['sales_train'] (not a copy), so the assignment
# below also changes the frame stored in `data`.
sales = data['sales_train']
# dayfirst=True makes pandas prefer day-before-month when parsing the strings.
# NOTE(review): the raw string format is not visible here; if it is fixed,
# an explicit `format=` would make parsing strict - confirm against the CSV.
sales['date'] = pd.to_datetime(sales['date'], dayfirst=True)
# Preview the first three rows to check the converted column.
sales.head(3)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0


In [97]:
# Select the sales rows whose recorded price is below zero.
negative_price = sales['item_price'] < 0
item = sales.loc[negative_price]
item

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
484683,2013-05-15,4,32,2973,-1.0,1.0


In [101]:
# Mean recorded price over all sales rows of item 2973.
sales.loc[sales['item_id'] == 2973, 'item_price'].mean()

2041.6272820512836

#### Round item price to two decimal places

In [86]:
# Vectorised rounding: Series.round works on the whole column at once instead
# of calling the built-in round() once per row in a Python-level loop.
# NOTE(review): numpy-style rounding may differ from built-in round() in the
# last representable digit for exact half-way values - irrelevant for prices,
# but confirm if bit-identical results are required.
sales['item_price'] = sales['item_price'].round(2)

In [87]:
# Show floats with exactly two decimal places in all subsequent pandas
# output, then summarise the price column.
two_decimals = '{:.2f}'.format
pd.set_option('display.float_format', two_decimals)
sales['item_price'].describe()

count   2935849.00
mean        890.85
std        1729.80
min          -1.00
25%         249.00
50%         399.00
75%         999.00
max      307980.00
Name: item_price, dtype: float64

In [88]:
# Item_price:
## Minimum value of -1?
### The row with price -1 (item_id = 2973) is a mistake: the mean price for this item is about 2000.

## 75 percent of the prices are below 1000; is the max value of 307980 an error?
### The item with price 307980 (item_id = 6066) is observed only once in the entire data set [huge outlier].
#### Frequency decreases rapidly once item_price > 1000.

## The price distribution appears to be skewed to the right.

In [106]:
# Rows whose recorded price is exactly -1.
sales.loc[sales['item_price'].eq(-1)]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
484683,2013-05-15,4,32,2973,-1.0,1.0


In [103]:
# Rows whose recorded price exceeds 100000.
sales.loc[sales['item_price'].gt(100000)]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
1163158,2013-12-13,11,12,6066,307980.0,1.0
