# Modifying Data

## Reading and loading data

In [20]:
# Standard-library import first, third-party import after it
import warnings

# import the pandas library
import pandas as pd

# Suppress every warning for the rest of the session
warnings.filterwarnings(action='ignore')

# Show which pandas version produced the outputs in this notebook
print(pd.__version__)

2.1.1


In [21]:
# Load the sales dataset from disk into a dataframe
data = pd.read_csv(filepath_or_buffer='datasets/big_mart_sales.csv')
# Preview the first five rows
data.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### How to impute the missing values using loc in any column?

First, we will check the number of missing values in each column using **`isna().sum()`**.

In [22]:
# Number of missing entries in every column
data.isna().sum(axis=0)

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

#### Select the rows where Item_Weight has missing values using loc

In [23]:
# Keep only the rows whose Item_Weight is missing and preview them
# (isna() already returns a boolean mask, so it can be passed to loc directly)
data.loc[data['Item_Weight'].isna()].head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
18,DRI11,,Low Fat,0.034238,Hard Drinks,113.2834,OUT027,1985,Medium,Tier 3,Supermarket Type3,2303.668
21,FDW12,,Regular,0.0354,Baking Goods,144.5444,OUT027,1985,Medium,Tier 3,Supermarket Type3,4064.0432
23,FDC37,,Low Fat,0.057557,Baking Goods,107.6938,OUT019,1985,Small,Tier 1,Grocery Store,214.3876
29,FDC14,,Regular,0.072222,Canned,43.6454,OUT019,1985,Small,Tier 1,Grocery Store,125.8362


In [24]:
# Impute the missing Item_Weight entries with the mean of the column
weight_is_missing = data['Item_Weight'].isna()
mean_weight = data['Item_Weight'].mean()
data.loc[weight_is_missing, 'Item_Weight'] = mean_weight
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [25]:
# Recount the missing entries per column after imputing Item_Weight
data.isna().sum(axis=0)

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

#### Select the rows where Outlet_Size has missing values using loc

In [26]:
# rows with null values in the Outlet_Size
# (isna() already returns a boolean mask, so no comparison with True is needed)
data.loc[data['Outlet_Size'].isna()].head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535
25,NCD06,13.0,Low Fat,0.099887,Household,45.906,OUT017,2007,,Tier 2,Supermarket Type1,838.908
28,FDE51,5.925,Regular,0.161467,Dairy,45.5086,OUT010,1998,,Tier 3,Grocery Store,178.4344


In [27]:
# How often does each Outlet_Size category occur? (missing values are not counted)
data['Outlet_Size'].value_counts(dropna=True)

Outlet_Size
Medium    2793
Small     2388
High       932
Name: count, dtype: int64

In [28]:
# fill the null values in Outlet_Size by mode
# mode() ignores missing values by default and returns the most frequent
# value(s); taking the first one derives the fill value from the data
# instead of hard-coding it.
outlet_size_mode = data['Outlet_Size'].mode().iloc[0]
data.loc[data['Outlet_Size'].isna(), 'Outlet_Size'] = outlet_size_mode
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


#### Use the fillna function to impute the missing values

The **`fillna`** function is another way to impute missing values. Pass **`inplace=True`** to modify the dataframe directly instead of getting a new one back.

In [29]:
# fill the null values in Outlet Size by the most frequent value: "Medium"
# fillna is called on the dataframe itself with a {column: value} dict.
# `data['Outlet_Size'].fillna(..., inplace=True)` would act on an intermediate
# Series: newer pandas versions warn about that pattern and, with
# Copy-on-Write enabled, it no longer updates `data` at all.
data.fillna({'Outlet_Size': 'Medium'}, inplace=True)
data.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

### How to update the values of a column?

Let's have a look at the count of each category in the column `Item_Fat_Content`. We will use the `value_counts()` function to do that.

In [30]:
# Count how many rows fall into each Item_Fat_Content label
data.loc[:, 'Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

- We can see that the categories `Low Fat`, `LF` and `low fat` are the same, and that `Regular` and `reg` are the same as well.
- So, to keep the data clean, we will map all of these to just two categories, `LF` and `R`, using the map function.

In [31]:
# Create a new mapping
mapping = {
    'Low Fat' : 'LF',
    'Regular' : 'R',
    'LF' : 'LF',
    'reg': 'R',
    'low fat' : 'LF'
}

# Map the mappings
# Passing the dict itself to map() turns every label that is not a key into
# NaN, so running this cell a second time would wipe out all 'R' values.
# Falling back to the original label keeps the cell safe to re-run and
# leaves any label missing from the mapping untouched.
data['Item_Fat_Content'] = data['Item_Fat_Content'].map(lambda label: mapping.get(label, label))

# Checking the value_counts()
data['Item_Fat_Content'].value_counts()

Item_Fat_Content
LF    5517
R     3006
Name: count, dtype: int64

### How to create a new column by modifying the existing column?

We will work on **Item_MRP** column for this.

In [33]:
# Preview the first five values of the Item_MRP column
data['Item_MRP'].head(n=5)

0    249.8092
1     48.2692
2    141.6180
3    182.0950
4     53.8614
Name: Item_MRP, dtype: float64

#### APPLY function

Create a new column **`Item_MRP_USD`** by dividing each value in the column **`Item_MRP`** by 90 using the apply function.

In [34]:
# Derive Item_MRP_USD by running every Item_MRP value through apply
# NOTE(review): 90 is presumably the conversion rate to USD; confirm the value
data['Item_MRP_USD'] = data['Item_MRP'].apply(lambda mrp: mrp / 90)
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_MRP_USD
0,FDA15,9.3,LF,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,2.775658
1,DRC01,5.92,R,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,0.536324
2,FDN15,17.5,LF,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,1.573533
3,FDX07,19.2,R,0.0,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38,2.023278
4,NCD19,8.93,LF,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,0.59846


In [35]:
# Show the original and the derived price columns side by side
price_columns = ['Item_MRP', 'Item_MRP_USD']
data[price_columns].head()

Unnamed: 0,Item_MRP,Item_MRP_USD
0,249.8092,2.775658
1,48.2692,0.536324
2,141.618,1.573533
3,182.095,2.023278
4,53.8614,0.59846


### How to convert categorical variables into numerical?

- Most of the machine learning algorithms do not take categorical variables so we need to convert them into numerical ones.
- In pandas, we have one such function **`get_dummies`** which will help us in doing such tasks. It will create a binary column for each of the categories. 
- This is also known as **`One Hot Encoding`**.

In [38]:
# Before encoding the frame has 13 columns
current_shape = data.shape
print('Shape:', current_shape)

Shape: (8523, 13)


In [39]:
# One-hot encode the categorical columns; numeric columns are kept as they are.
# Item_Identifier is encoded as well, one indicator column per identifier,
# which is where most of the new columns come from.
data = pd.get_dummies(data=data)
data.head(n=5)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_MRP_USD,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,1999,3735.138,2.775658,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
1,5.92,0.019278,48.2692,2009,443.4228,0.536324,False,False,False,False,...,False,True,False,False,False,True,False,False,True,False
2,17.5,0.01676,141.618,1999,2097.27,1.573533,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
3,19.2,0.0,182.095,1998,732.38,2.023278,False,False,False,False,...,False,True,False,False,False,True,True,False,False,False
4,8.93,0.0,53.8614,1987,994.7052,0.59846,False,False,False,False,...,True,False,False,False,False,True,False,True,False,False


In [40]:
# After encoding the frame has grown to 1603 columns
encoded_shape = data.shape
print('Updated Shape:', encoded_shape)

Updated Shape: (8523, 1603)
