# Deriving New Columns:

1. Importing libraries and dataset
2. Creating column for price range using .loc()
3. Creating a column to categorise each day's busyness level over the week using a for loop with if/else conditions
4. Creating a column to categorise each hour's busyness level over the day using a for loop with if/else conditions
5. Exporting the dataset with newly derived columns

## 1. Importing libraries and dataset

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Accessing EnvFile for path
# Runs the separate EnvFile notebook so that `path` (the project root used by the
# read/write cells below) is available in this kernel.
# NOTE(review): EnvFile.ipynb is not visible here — presumably it defines and
# %store's `path` (the cell output reports "Stored 'path' (str)"); confirm.

%run EnvFile.ipynb

Stored 'path' (str)


In [3]:
# Importing orders_products_dept.pkl (the prepared orders/products data that
# already carries the department column) from <path>/02 Data/Prepared Data
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project's own notebooks.

df_ords_prods_dept = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_dept.pkl'))

In [4]:
# Checking the columns, dtypes and memory footprint of the imported dataframe
# before any new columns are derived
df_ords_prods_dept.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404289 entries, 0 to 32404288
Data columns (total 14 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
 6   product_id              int32  
 7   add_to_cart_order       int32  
 8   reordered               int8   
 9   product_name            object 
 10  aisle_id                int8   
 11  department_id           int8   
 12  prices                  float64
 13  department              object 
dtypes: float16(1), float64(1), int32(4), int8(6), object(2)
memory usage: 1.7+ GB


## 2. Creating column for price range using .loc()

In [5]:
# Products priced strictly above 15 are labelled high-range; rows not matched
# here are filled in by the mid-range and low-range assignments that follow
df_ords_prods_dept.loc[df_ords_prods_dept['prices'].gt(15), 'price_range'] = 'High-range product'

In [6]:
# Products priced above 5 and up to and including 15 are labelled mid-range
df_ords_prods_dept.loc[df_ords_prods_dept['prices'].gt(5) & df_ords_prods_dept['prices'].le(15), 'price_range'] = 'Mid-range product'

In [7]:
# Products priced at 5 or below are labelled low-range
df_ords_prods_dept.loc[df_ords_prods_dept['prices'].le(5), 'price_range'] = 'Low-range product'

In [8]:
# Frequency of each price_range label; dropna = False would surface any rows
# that none of the three .loc conditions labelled
df_ords_prods_dept['price_range'].value_counts(dropna = False)

Mid-range product     21865979
Low-range product     10125759
High-range product      412551
Name: price_range, dtype: int64

## 3. Creating column to categorise day busyness level over the week

In [9]:
# Row counts per day of the week, used to decide which days count as
# most / least busy
df_ords_prods_dept['order_day_of_week'].value_counts(dropna = False)

0    6204040
1    5660135
6    4496403
2    4213760
5    4205721
3    3840476
4    3783754
Name: order_day_of_week, dtype: int64

#### 0 (Saturday) and 1 (Sunday) are the busiest days, 3 (Tuesday) and 4 (Wednesday) are the least busy days, and the remaining days have average busyness.

In [10]:
# Label every row's order_day_of_week as 'Most Busy Day', 'Least Busy Day' or
# 'Average Busy Day'.
# The for / if-else below runs once per distinct day value rather than once per
# row (the dataframe has ~32 million rows); the resulting lookup is then applied
# to the whole column with a single .map() call. Every value present in the
# column gets a key, so the labels match a row-by-row loop exactly.
day_label_by_value = {}
for value in df_ords_prods_dept['order_day_of_week'].unique().tolist():
    if (value == 0) or (value == 1):
        # 0 and 1 have the highest counts in the frequency table above
        day_label_by_value[value] = 'Most Busy Day'
    elif (value == 4) or (value == 3):
        # 3 and 4 have the lowest counts in the frequency table above
        day_label_by_value[value] = 'Least Busy Day'
    else:
        day_label_by_value[value] = 'Average Busy Day'

# Kept as a plain list in row order so it can be assigned to the dataframe
# positionally, exactly as a list built by appending row by row would be
day_busyness_level = df_ords_prods_dept['order_day_of_week'].map(day_label_by_value).tolist()

In [11]:
# Assigning the day_busyness_level list built in the previous cell to a new
# column in df (one label per row, in row order)

df_ords_prods_dept['day_busyness_level'] = day_busyness_level

In [12]:
# Finding the frequency of each label in the new day_busyness_level column
# (dropna = False would also show any rows left unlabelled)

df_ords_prods_dept['day_busyness_level'].value_counts(dropna = False)

Average Busy Day    12915884
Most Busy Day       11864175
Least Busy Day       7624230
Name: day_busyness_level, dtype: int64

## 4. Creating column to categorise hour busyness level over the day

In [13]:
# Row counts per hour of the day, used to decide which hours count as
# most / least busy
df_ords_prods_dept['order_hour_of_day'].value_counts(dropna = False)

10    2761710
11    2736075
14    2689086
15    2662094
13    2660900
12    2618481
16    2535154
9     2454165
17    2087609
8     1718100
18    1636469
19    1258290
20     976145
7      891040
21     795628
22     634216
23     402315
6      290492
0      218766
1      115699
5       87959
2       69374
4       53241
3       51281
Name: order_hour_of_day, dtype: int64

In [14]:
# Label every row's order_hour_of_day as 'Most Busy Hour', 'Least Busy Hour' or
# 'Average Busy Hour'.
# The for / if-else below runs once per distinct hour value rather than once per
# row (the dataframe has ~32 million rows); the resulting lookup is then applied
# to the whole column with a single .map() call. Every value present in the
# column gets a key, so the labels match a row-by-row loop exactly.
hour_label_by_value = {}
for value in df_ords_prods_dept['order_hour_of_day'].unique().tolist():
    if (value == 10) or (value == 11):
        # 10 and 11 are the two highest-count hours in the frequency table above
        hour_label_by_value[value] = 'Most Busy Hour'
    elif (value == 3) or (value == 4):
        # 3 and 4 are the two lowest-count hours in the frequency table above
        hour_label_by_value[value] = 'Least Busy Hour'
    else:
        hour_label_by_value[value] = 'Average Busy Hour'

# Kept as a plain list in row order so it can be assigned to the dataframe
# positionally, exactly as a list built by appending row by row would be
hour_busyness_level = df_ords_prods_dept['order_hour_of_day'].map(hour_label_by_value).tolist()

In [15]:
# Assigning the hour_busyness_level list built in the previous cell to a new
# column in df (one label per row, in row order)

df_ords_prods_dept['hour_busyness_level'] = hour_busyness_level

In [16]:
# Finding the frequency of each label in the newly created hour_busyness_level
# column (dropna = False would also show any rows left unlabelled)

df_ords_prods_dept['hour_busyness_level'].value_counts(dropna = False)

Average Busy Hour    26801982
Most Busy Hour        5497785
Least Busy Hour        104522
Name: hour_busyness_level, dtype: int64

In [17]:
# Checking the first rows of the updated df with the 3 newly created columns
# (price_range, day_busyness_level, hour_busyness_level)

df_ords_prods_dept.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,department,price_range,day_busyness_level,hour_busyness_level
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,beverages,Mid-range product,Average Busy Day,Average Busy Hour
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,beverages,Mid-range product,Least Busy Day,Average Busy Hour
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,beverages,Mid-range product,Least Busy Day,Average Busy Hour
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,beverages,Mid-range product,Least Busy Day,Average Busy Hour
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,beverages,Mid-range product,Least Busy Day,Average Busy Hour


In [18]:
# Re-checking columns, dtypes and memory usage now that the three derived
# columns have been added
df_ords_prods_dept.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404289 entries, 0 to 32404288
Data columns (total 17 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_day_of_week       int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
 6   product_id              int32  
 7   add_to_cart_order       int32  
 8   reordered               int8   
 9   product_name            object 
 10  aisle_id                int8   
 11  department_id           int8   
 12  prices                  float64
 13  department              object 
 14  price_range             object 
 15  day_busyness_level      object 
 16  hour_busyness_level     object 
dtypes: float16(1), float64(1), int32(4), int8(6), object(5)
memory usage: 2.4+ GB


## 5. Exporting the dataset with newly derived columns

In [19]:
# Exporting the dataframe with the new variables (price_range,
# day_busyness_level, hour_busyness_level) as a pickle in
# <path>/02 Data/Prepared Data

df_ords_prods_dept.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_dept_newcolumns.pkl'))