# 0.0 - IMPORTS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import inflection

from IPython.core.display import HTML
from IPython.display import Image

## 0.1 Helper Functions

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display(HTML('<style>.container{width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr', False)
    
    sns.set

In [3]:
# Apply the plotting and pandas display settings defined in jupyter_settings.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## 0.2 Loading Data 

In [4]:
# Load the raw training set. low_memory=False makes pandas read the file in
# one pass, so each column's dtype is inferred from all of its values.
df_train_raw = pd.read_csv('data/train.csv', low_memory=False)

# 1.0 - DATA DESCRIPTION

In [5]:
# Work on a copy so the raw frame stays untouched and this section can be re-run.
df1 = df_train_raw.copy()

## 1.1 Rename Columns 

In [9]:
# Original column headers.
# NOTE(review): assumed to match the CSV column order, because the assignment
# to df1.columns below is positional — confirm against the file.
cols_old = [
    'Item_Identifier',
    'Item_Weight',
    'Item_Fat_Content',
    'Item_Visibility',
    'Item_Type',
    'Item_MRP',
    'Outlet_Identifier',
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Outlet_Sales',
]

# Convert every header to snake_case and install the new names on df1.
snakecase = inflection.underscore
cols_new = [snakecase(header) for header in cols_old]
df1.columns = cols_new

## 1.2 Data Dimensions 

In [11]:
# Report the frame's shape: column count first, then row count.
n_rows, n_cols = df1.shape
print('Number of columns: {}'.format(n_cols))
print('Number of rows: {}'.format(n_rows))

Number of columns: 12
Number of rows: 8523


## 1.3 Data Types 

In [13]:
# Show the dtype pandas inferred for each column.
df1.dtypes

item_identifier               object
item_weight                  float64
item_fat_content              object
item_visibility              float64
item_type                     object
item_mrp                     float64
outlet_identifier             object
outlet_establishment_year      int64
outlet_size                   object
outlet_location_type          object
outlet_type                   object
item_outlet_sales            float64
dtype: object

## 1.4 Check NA

In [15]:
# Count missing values per column.
df1.isna().sum()

item_identifier                 0
item_weight                  1463
item_fat_content                0
item_visibility                 0
item_type                       0
item_mrp                        0
outlet_identifier               0
outlet_establishment_year       0
outlet_size                  2410
outlet_location_type            0
outlet_type                     0
item_outlet_sales               0
dtype: int64

## 1.5 Fill Out NA

In [None]:
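# TODO item_weight: in the DRI11 spot-check below, every non-null row of the item carries the same weight, so filling from the item's other rows looks viable (confirm for all items).

# TODO outlet_size: the OUT017 rows shown below are all missing it, which suggests the gap is per outlet rather than per row (confirm).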


In [49]:
# Distinct (item_identifier, item_weight) pairs, lightest first, to see how
# weight relates to the item. drop_duplicates replaces the original
# groupby-on-every-column followed by max() over no remaining columns; the
# result now has ordinary columns instead of an index-only frame.
# The two-key sort makes the order deterministic, and head() bounds the output
# because display.max_rows is set to None.
(
    df1[['item_identifier', 'item_weight']]
    .dropna(subset=['item_weight'])
    .drop_duplicates()
    .sort_values(['item_weight', 'item_identifier'])
    .head(10)
)

item_identifier,item_weight
FDP40,4.555
DRE12,4.59
DRF23,4.61
FDX49,4.615
FDS23,4.635
FDU11,4.785
FDW02,4.805
FDY24,4.88
FDF50,4.905
FDT48,4.92


In [40]:
# Preview rows with a missing outlet_size. The NA check reports 2410 such
# rows and display.max_rows is None, so head() keeps the output readable.
df1.loc[df1['outlet_size'].isna()].head(10)

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535
25,NCD06,13.0,Low Fat,0.099887,Household,45.906,OUT017,2007,,Tier 2,Supermarket Type1,838.908
28,FDE51,5.925,Regular,0.161467,Dairy,45.5086,OUT010,1998,,Tier 3,Grocery Store,178.4344
30,FDV38,19.25,Low Fat,0.170349,Dairy,55.7956,OUT010,1998,,Tier 3,Grocery Store,163.7868
33,FDO23,17.85,Low Fat,0.0,Breads,93.1436,OUT045,2002,,Tier 2,Supermarket Type1,2174.5028
45,FDM39,6.42,LF,0.089499,Dairy,178.1002,OUT010,1998,,Tier 3,Grocery Store,358.2004
46,NCP05,19.6,Low Fat,0.0,Health and Hygiene,153.3024,OUT045,2002,,Tier 2,Supermarket Type1,2428.8384
47,FDV49,10.0,Low Fat,0.02588,Canned,265.2226,OUT045,2002,,Tier 2,Supermarket Type1,5815.0972


In [48]:
# Preview rows with a missing item_weight. The NA check reports 1463 such
# rows and display.max_rows is None, so head() keeps the output readable.
df1.loc[df1['item_weight'].isna()].head(10)

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
18,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
21,FDW12,,Regular,0.0354,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432
23,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
29,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362
36,FDV20,,Regular,0.059512,Fruits and Vegetables,128.0678,OUT027,1985,Medium,Tier 3,Supermarket Type3,2797.6916
38,FDX10,,Regular,0.123111,Snack Foods,36.9874,OUT027,1985,Medium,Tier 3,Supermarket Type3,388.1614
39,FDB34,,Low Fat,0.026481,Snack Foods,87.6198,OUT027,1985,Medium,Tier 3,Supermarket Type3,2180.495
49,FDS02,,Regular,0.255395,Dairy,196.8794,OUT019,1985,Small,Tier 1,Grocery Store,780.3176
59,FDI26,,Low Fat,0.061082,Canned,180.0344,OUT019,1985,Small,Tier 1,Grocery Store,892.172


In [44]:
# Spot-check one item that has missing weights (DRI11) to see whether its
# other rows carry a weight that could be reused.
df1.loc[df1['item_identifier'] == 'DRI11']

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
18,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
1366,DRI11,,Low Fat,0.060237,Hard Drinks,115.9834,OUT019,1985,Small,Tier 1,Grocery Store,345.5502
3127,DRI11,8.26,Low Fat,0.034458,Hard Drinks,113.3834,OUT049,1999,Medium,Tier 1,Supermarket Type1,2073.3012
5310,DRI11,8.26,Low Fat,0.057586,Hard Drinks,113.7834,OUT010,1998,,Tier 3,Grocery Store,115.1834
7022,DRI11,8.26,Low Fat,0.034398,Hard Drinks,115.7834,OUT035,2004,Small,Tier 2,Supermarket Type1,2073.3012
7192,DRI11,8.26,Low Fat,0.034544,Hard Drinks,116.0834,OUT018,2009,Medium,Tier 3,Supermarket Type2,1267.0174
8483,DRI11,8.26,Low Fat,0.034474,Hard Drinks,117.0834,OUT045,2002,,Tier 2,Supermarket Type1,1612.5676


In [46]:
# Spot-check one outlet (OUT017) whose rows appear among those missing
# outlet_size. head() bounds the output because display.max_rows is None and
# a single outlet can match many rows.
df1.loc[df1['outlet_identifier'] == 'OUT017'].head(10)

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535
25,NCD06,13.0,Low Fat,0.099887,Household,45.906,OUT017,2007,,Tier 2,Supermarket Type1,838.908
53,FDA43,10.895,Low Fat,0.065042,Fruits and Vegetables,196.3794,OUT017,2007,,Tier 2,Supermarket Type1,3121.2704
54,NCP18,12.15,Low Fat,0.02876,Household,151.4708,OUT017,2007,,Tier 2,Supermarket Type1,4815.0656
73,FDG02,7.855,Low Fat,0.011325,Canned,189.6188,OUT017,2007,,Tier 2,Supermarket Type1,2285.0256
78,FDL04,19.0,Low Fat,0.112557,Frozen Foods,104.9622,OUT017,2007,,Tier 2,Supermarket Type1,1587.933
85,FDG20,15.5,Regular,0.126399,Fruits and Vegetables,177.0028,OUT017,2007,,Tier 2,Supermarket Type1,2479.4392
111,FDS52,8.89,low fat,0.005505,Frozen Foods,102.4016,OUT017,2007,,Tier 2,Supermarket Type1,2732.4432
116,FDZ16,16.85,Regular,0.16076,Frozen Foods,192.4478,OUT017,2007,,Tier 2,Supermarket Type1,4843.695
118,DRA12,11.6,Low Fat,0.041178,Soft Drinks,140.3154,OUT017,2007,,Tier 2,Supermarket Type1,2552.6772


In [33]:
# Random preview of the frame. A fixed random_state makes the same ten rows
# appear on every Restart & Run All.
df1.sample(10, random_state=42)

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
3958,FDB08,6.055,Low Fat,0.031152,Fruits and Vegetables,158.7578,OUT049,1999,Medium,Tier 1,Supermarket Type1,2085.9514
4127,FDE14,13.65,Regular,0.031573,Canned,99.47,OUT018,2009,Medium,Tier 3,Supermarket Type2,299.61
7028,FDT49,7.0,Low Fat,0.151406,Canned,107.728,OUT046,1997,Small,Tier 1,Supermarket Type1,2024.032
3015,NCX54,9.195,Low Fat,0.04806,Household,105.8622,OUT046,1997,Small,Tier 1,Supermarket Type1,2223.1062
7801,DRH01,17.5,Low Fat,0.098458,Soft Drinks,171.7738,OUT017,2007,,Tier 2,Supermarket Type1,2432.8332
5845,NCQ02,12.6,Low Fat,0.007468,Household,186.9556,OUT049,1999,Medium,Tier 1,Supermarket Type1,3379.6008
5694,FDR44,6.11,Regular,0.102901,Fruits and Vegetables,128.4968,OUT035,2004,Small,Tier 2,Supermarket Type1,1435.4648
7196,NCS17,18.6,Low Fat,0.080434,Health and Hygiene,93.4436,OUT013,1987,High,Tier 3,Supermarket Type1,1701.7848
8384,FDU10,10.1,Regular,0.045654,Snack Foods,35.6848,OUT013,1987,High,Tier 3,Supermarket Type1,633.8416
2940,FDZ20,16.1,Low Fat,0.057423,Fruits and Vegetables,253.7356,OUT010,1998,,Tier 3,Grocery Store,508.6712
