In [1]:
#Importing necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os

%matplotlib inline

In [2]:

# setup logger# setup l 
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)
    
# get logger
logger = logging.getLogger(__name__)

In [3]:

# Check if interactive plot is enabled, if not enable it.# Check i 

logger.info('Checking Interactive session')
    
if plt.isinteractive():
    print("Plotting is not interactive, Setting it to interactive mode ...")
    plt.interactive(True)
else:
    print("Plotting is already interactive")

2019-01-25 14:09:34,641 - __main__ - INFO - Checking Interactive session


Plotting is not interactive, Setting it to interactive mode ...


## Data Variables

Variable Description <br>
Item_Identifier : Unique product ID <br>
Item_Weight : Weight of product <br> 
Item_Fat_Content : Whether the product is low fat or not <br>
Item_Visibility : The % of total display area of all products in a store allocated to the particular product <br>
Item_Type : The category to which the product belongs <br>
Item_MRP : Maximum Retail Price (list price) of the product <br>
Outlet_Identifier : Unique store ID <br>
Outlet_Establishment_Year : The year in which store was established <br>
Outlet_Size : The size of the store in terms of ground area covered <br>
Outlet_Location_Type : The type of city in which the store is located <br>
Outlet_Type : Whether the outlet is just a grocery store or some sort of supermarket <br>
Item_Outlet_Sales : Sales of the product in the particulat store. This is the outcome variable to be predicted. <br>

In [4]:
#Read DataSets, the files are read from disk in to memory:

logger.info('Reading the Data Mart datasets')


train_dataset = pd.read_csv(r"C:\Users\divyakamat\data\DataSets\mart\train.csv")
test_dataset = pd.read_csv(r"C:\Users\divyakamat\data\DataSets\mart\test.csv")

2019-01-25 14:09:34,837 - __main__ - INFO - Reading the Data Mart datasets


In [5]:
train_dataset.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [9]:
columns = train_dataset.columns
index = train_dataset.index
data = train_dataset.values


In [16]:
#RangeIndex is special type of index object analogous to Range object in Python, the values are not loaded into memory
#until its necessary
index 

RangeIndex(start=0, stop=8523, step=1)

In [8]:
columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [10]:
data

array([['FDA15', 9.3, 'Low Fat', ..., 'Tier 1', 'Supermarket Type1',
        3735.138],
       ['DRC01', 5.92, 'Regular', ..., 'Tier 3', 'Supermarket Type2',
        443.4228],
       ['FDN15', 17.5, 'Low Fat', ..., 'Tier 1', 'Supermarket Type1',
        2097.27],
       ..., 
       ['NCJ29', 10.6, 'Low Fat', ..., 'Tier 2', 'Supermarket Type1',
        1193.1136],
       ['FDN46', 7.21, 'Regular', ..., 'Tier 3', 'Supermarket Type2',
        1845.5976],
       ['DRG01', 14.8, 'Low Fat', ..., 'Tier 1', 'Supermarket Type1',
        765.67]], dtype=object)

In [15]:
#RangeIndex are subclass of Index and they are very similar
print(type(index))   #Row Index
print(type(columns)) #Column Index
print(type(data))

<class 'pandas.core.indexes.range.RangeIndex'>
<class 'pandas.core.indexes.base.Index'>
<class 'numpy.ndarray'>


## Understanding data types

In broad terms, data may be classified as Continous or Categorical.
Continuous Data 
- Numeric and represents measurement (height, weight etc)
- Takes an infinite number of possibilities

Categorical Data
- Represents discrete values
- Represents finite values


In [17]:
train_dataset.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

Pandas defaults core numeric types - integers and floats to 64 bits regardless of size for the data to fit in memory.
A column with integer value 0 will also have data type as int64

In [21]:
#Get the count of columns for each type
train_dataset.get_dtype_counts()

float64    4
int64      1
object     7
dtype: int64

## Object Data Types

- Object datatype column may contain values of any valid Python objects, they need not be necessarily strings
- These may possibly contain a mixture of Integers, Booleans, strings, lists or dictionaries
- Its a catch-all for columns, for which Pandas is not able to identify the data type.

### Selecting a single column of data from Series

- Indexing operation
- Dot notation

Series 
- is a single column of data from a DataFrame
- A single dimension of data, composed of just an index and the data

In [25]:
train_dataset['Outlet_Size'].head()

0    Medium
1    Medium
2    Medium
3       NaN
4      High
Name: Outlet_Size, dtype: object

In [None]:
train_dataset.Outlet_Size

Columns of data may be accessed using dot notation (as above), however its not recommended to use dot to access the data with production code for following reasons:
- Not a best practice and is prone to error and misuse
- Column names with spaces/special characters cannot be accessed
- Columns names that collide with DataFrame methods also fail
- Assigning new values or deleting columns might give unexpected results.

In [26]:
#convert series to dataframe using to_frame() method, it uses Series name as the new column name
df = train_dataset['Outlet_Size']
df.to_frame().head()

Unnamed: 0,Outlet_Size
0,Medium
1,Medium
2,Medium
3,
4,High


### Calling Series Methods

In [31]:
#Number of attributes and methods in Series
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

439

In [32]:
#Number of attributes and method in DataFrame
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

444

In [33]:
#Attributes and methods common to both - Series and DataFrames

len(s_attr_methods & df_attr_methods)

373

In [35]:
size = train_dataset['Outlet_Size']
item = train_dataset['Item_Visibility']

Useful method for objects datatype - value_counts()

In [36]:
item.count() #Gives total not null values

8523

In [38]:
#SummaryStatictics

item.describe()

count    8523.000000
mean        0.066132
std         0.051598
min         0.000000
25%         0.026989
50%         0.053931
75%         0.094585
max         0.328391
Name: Item_Visibility, dtype: float64

In [40]:
item.quantile(.1) # Gives the quantile values

0.012041769800000001

In [44]:
item.quantile([.1,.2,.3,.4,.5,.6,.7,.8,.9]) # pass the list of quantile values it resturns a series

0.1    0.012042
0.2    0.022558
0.3    0.031956
0.4    0.041754
0.5    0.053931
0.6    0.067958
0.7    0.083456
0.8    0.106924
0.9    0.139514
Name: Item_Visibility, dtype: float64

In [47]:
#Following operations can be done to the series.

# item + 1
# item // 7
# item ** 2


## Chaining Series Methods together

Sequential invocation of methods using dot notation is method chaining

In [49]:
size.isnull().sum()

2410

- Index provides label for each of the rows
- If no index is explicitly provided, then by default a RangeIndex is created with labels as inteers from 0 to n-1
- by default, set_index will drop the index column from the dataframe.
- This can be avoided by setting the drop parameter to false.

In [55]:
train_dataset=train_dataset.set_index('Item_Identifier')

#Reset index, the column will be the first column after reset
#train_dataset.reset_index()

### Renaming rows and columns names

In [57]:
#Renames index and column names

idx_rename = {'FDA15':'fda15','DRC01':'drc01'}
col_rename = {'Item_Weight':'itemweight','Item_Type':'itemtype'}

train_dataset.rename(index=idx_rename,columns=col_rename).head()

Unnamed: 0_level_0,itemweight,Item_Fat_Content,Item_Visibility,itemtype,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fda15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
drc01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [61]:
#There are two ways to select columns

train_dataset[['Item_Weight','Item_Type']].head()

cols = ['Item_Weight','Item_Type']
train_dataset.head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## Select columns with methods

In [62]:
train_dataset.get_dtype_counts()

float64    4
int64      1
object     6
dtype: int64

In [65]:
train_dataset.select_dtypes(include=['float']).head(2)

Unnamed: 0_level_0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FDA15,9.3,0.016047,249.8092,3735.138
DRC01,5.92,0.019278,48.2692,443.4228


In [66]:
train_dataset.select_dtypes(include=['number']).head(2)

Unnamed: 0_level_0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
FDA15,9.3,0.016047,249.8092,1999,3735.138
DRC01,5.92,0.019278,48.2692,2009,443.4228


In [67]:
#Filter all columns with a particular column
train_dataset.filter(like='Item').head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Item_Outlet_Sales
Item_Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,3735.138
DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,443.4228
FDN15,17.5,Low Fat,0.01676,Meat,141.618,2097.27
FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,732.38
NCD19,8.93,Low Fat,0.0,Household,53.8614,994.7052


## Ordering column names

Import dataset and analayse the order of the column, below are few guidelines
- classify each column as either discrete or continuous
- Group common columns with discrete and continuous columns
- Place the most important groups of columns first with categorical before discrete

In [68]:
train_dataset.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type',
       'Item_Outlet_Sales'],
      dtype='object')

disc_core / disc_people / disc_other / cont_fb + cont_others