# 4.7 Deriving new variables

### This script contains the following points:

#### 1. Create price range flags
#### 2. Rename day of week column
#### 3. Create a busyness flag for the newly named column
#### 4. Steps for Exercise 4.7

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Determine the path
path = r'/Users/nekow/Documents/Instacart Basket Analysis'

In [3]:
ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))

In [4]:
# Create a subset

df = ords_prods_merged[:1000000]

In [5]:
df.shape

(1000000, 16)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,prior,28,6,11,3.0,5,0,both
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,prior,30,6,17,20.0,1,1,both
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,prior,2,0,21,6.0,20,0,both
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,prior,1,3,13,,10,0,both
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,prior,3,4,17,9.0,11,1,both


## 1. Create price range flags

In [7]:
# Defining the price labels

def price_label(row):
    
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High range'
    else: return np.nan

In [8]:
# Apply the function to entire dataframe

ords_prods_merged['price_range'] = df.apply(price_label, axis=1)

In [9]:
# Checking the new column

ords_prods_merged['price_range'].value_counts(dropna = False)

price_range
NaN                  31404859
Mid-range product      652638
Low-range product      338018
High range               9344
Name: count, dtype: int64

In [10]:
ords_prods_merged['prices'].max()

99999.0

In [11]:
# Creating the high range user defined function


ords_prods_merged.loc[ords_prods_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

  ords_prods_merged.loc[ords_prods_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [12]:
# Creating the mid range user defined function

ords_prods_merged.loc[(ords_prods_merged['prices'] <= 15) & (ords_prods_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [13]:
# Creating the low range user defined function

ords_prods_merged.loc[ords_prods_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [14]:
# Checking the filter values

ords_prods_merged['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

## 2. Rename day of week column

In [15]:
# Renaming the order days to a more intuitive title

ords_prods_merged.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [16]:
# Counting values for renamed column

ords_prods_merged['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

## 3. Create a busyness flag from the newly named column

In [17]:
# Creating an if-statement

result = []

for value in ords_prods_merged["orders_day_of_week"]:
    if value == 0:
        result.append("Busiest day")
    elif value == 4:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

In [18]:
# Viewing the result for the coumn if-statement

result

['Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Bus

In [19]:
# Creating a new column to receieve result

ords_prods_merged['busiest_day'] = result

In [20]:
# Checking if result is received

ords_prods_merged['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

## 4. Steps for Exercise 4.7

### Step 1 of Exercise 4.7

In [21]:
# Changing name of busiest day

ords_prods_merged.rename(columns = {'busiest_day' : 'Busiest_days'}, inplace = True)

In [22]:
# Result of days of the week busyness

ords_prods_merged['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

### Step 2 of Exercise 4.7

In [23]:
# Busiest days and slowest days organization


result = []

for value in ords_prods_merged["orders_day_of_week"]:
    if value == 0 or value == 1:
        result.append("Busiest days")
    elif value == 4 or value == 3:
        result.append("Slowest days")
    else:
        result.append("Average Days")

In [24]:
# Column check

result

['Average Days',
 'Average Days',
 'Busiest days',
 'Slowest days',
 'Slowest days',
 'Busiest days',
 'Average Days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Average Days',
 'Slowest days',
 'Slowest days',
 'Average Days',
 'Slowest days',
 'Average Days',
 'Average Days',
 'Average Days',
 'Busiest days',
 'Busiest days',
 'Average Days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Average Days',
 'Busiest days',
 'Busiest days',
 'Average Days',
 'Average Days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Average Days',
 'Average Days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Average Days',
 'Busiest days',
 'Average Days',
 'Busiest days',
 'Busiest days',
 'Average Days',
 'Busiest days

### Step 3 of Exercise 4.7

In [25]:
# Inputting the results to new column name
ords_prods_merged['Busiest_days'] = result

In [26]:
# Checking the input

ords_prods_merged['Busiest_days'].value_counts(dropna = False)

Busiest_days
Average Days    12916111
Busiest days    11864412
Slowest days     7624336
Name: count, dtype: int64

### Step 4 of Exercise 4.7

In [27]:
# Result of the days of week busyness

ords_prods_merged['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [28]:
# Busiest hour column creation

result = []

for value in ords_prods_merged["order_hour_of_day"]:
    if value in (6, 0, 1, 5, 2, 4, 3, 23):
        result.append("Fewest orders")
    elif value in (16, 9, 12, 13, 15, 14, 11, 10):
        result.append("Most orders")
    else:
        result.append("Average orders")

In [29]:
# Creating a new column to receieve result

ords_prods_merged['busiest_period_of_day'] = result

In [30]:
# Checking the frequency

ords_prods_merged['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

In [31]:
# Dropping the unnamed column

ords_prods_merged.drop(columns = ['Unnamed: 0'])

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range,price_range_loc,Busiest_days,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,prior,28,6,11,3.0,5,0,both,Mid-range product,Mid-range product,Average Days,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,prior,30,6,17,20.0,1,1,both,Mid-range product,Mid-range product,Average Days,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,prior,2,0,21,6.0,20,0,both,Mid-range product,Mid-range product,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,prior,1,3,13,,10,0,both,Mid-range product,Mid-range product,Slowest days,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,prior,3,4,17,9.0,11,1,both,Mid-range product,Mid-range product,Slowest days,Average orders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,49688,Fresh Foaming Cleanser,73,11,13.5,1788356,200215,prior,2,0,9,5.0,27,0,both,,Mid-range product,Busiest days,Most orders
32404855,49688,Fresh Foaming Cleanser,73,11,13.5,3401313,200377,prior,1,4,11,,5,0,both,,Mid-range product,Slowest days,Most orders
32404856,49688,Fresh Foaming Cleanser,73,11,13.5,809510,200873,prior,5,3,8,15.0,12,0,both,,Mid-range product,Slowest days,Average orders
32404857,49688,Fresh Foaming Cleanser,73,11,13.5,2359893,200873,prior,9,3,15,5.0,11,1,both,,Mid-range product,Slowest days,Most orders


In [32]:
# Dropping the mergd flag column

ords_prods_merged.drop(columns = ['_merge'])

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,price_range,price_range_loc,Busiest_days,busiest_period_of_day
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,prior,28,6,11,3.0,5,0,Mid-range product,Mid-range product,Average Days,Most orders
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,prior,30,6,17,20.0,1,1,Mid-range product,Mid-range product,Average Days,Average orders
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,prior,2,0,21,6.0,20,0,Mid-range product,Mid-range product,Busiest days,Average orders
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,prior,1,3,13,,10,0,Mid-range product,Mid-range product,Slowest days,Most orders
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,prior,3,4,17,9.0,11,1,Mid-range product,Mid-range product,Slowest days,Average orders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,49692,49688,Fresh Foaming Cleanser,73,11,13.5,1788356,200215,prior,2,0,9,5.0,27,0,,Mid-range product,Busiest days,Most orders
32404855,49692,49688,Fresh Foaming Cleanser,73,11,13.5,3401313,200377,prior,1,4,11,,5,0,,Mid-range product,Slowest days,Most orders
32404856,49692,49688,Fresh Foaming Cleanser,73,11,13.5,809510,200873,prior,5,3,8,15.0,12,0,,Mid-range product,Slowest days,Average orders
32404857,49692,49688,Fresh Foaming Cleanser,73,11,13.5,2359893,200873,prior,9,3,15,5.0,11,1,,Mid-range product,Slowest days,Most orders


In [33]:
ords_prods_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_analysis.pkl'))