# Deriving New Variables

### Contents:
#### Create product price flag
#### Create flag for busiest days
#### Create busiest hours flags

## 1.0 Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## 2.0 Import Data

In [2]:
#Create path for importing data
project_path = r'C:\Users\Owner\Documents\Career Foundry\Instacart Basket Analysis\02 Data'
#Import data
df_orders_products = pd.read_pickle(os.path.join(project_path, '02 02 Prepared Data', 'orders_products_combined.pkl'),)


In [3]:
#Subset of orders due to size
df_subset = df_orders_products[:1000000]

In [4]:
df_subset.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


## 3.0 Practice work

### 3.1 Create flag with If Else statement

In [5]:
#Creating price_range flag
def price_label(row):
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High-range product'
    else:
        return 'Price needed'
        

In [6]:
#Using price_label function
df_subset['price_range'] = df_subset.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['price_range'] = df_subset.apply(price_label, axis=1)


In [7]:
#Frequency of each range
df_subset['price_range'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

### 3.2 Create flag with .loc() function

In [8]:
df_subset.loc[df_subset['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset.loc[df_subset['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [9]:
df_subset.loc[(df_subset['prices'] <= 15) & (df_subset['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [10]:
df_subset.loc[df_subset['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [11]:
df_subset['price_range_loc'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

### 3.3 Apply flag to full dataframe

In [12]:
df_orders_products.loc[df_orders_products['prices'] > 15, 'price_range'] = 'High-range product'

In [13]:
df_orders_products.loc[(df_orders_products['prices'] <= 15) & (df_orders_products['prices'] > 5), 'price_range'] = 'Mid-range product'

In [14]:
df_orders_products.loc[df_orders_products['prices'] <= 5, 'price_range'] = 'Low-range product'

In [15]:
#Frequency of each flag
df_orders_products['price_range'].value_counts(dropna = False)

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range, dtype: int64

### 3.4 If loops

In [16]:
#Find busiest day of week for sales
# 0 = Saturday, 1 = Sunday, 2 = Monday, 3 = Tuesday, 4 = Wednesday, 5 = Thursday, 6 = Friday
df_orders_products['order_day_of_week'].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: order_day_of_week, dtype: int64

In [17]:
#Assign create a list for day label based on day of the week
result = []

for value in df_orders_products['order_day_of_week']:
    if value == 0:
        result.append("Busiest day")
    elif value == 4:
        result.append("Least busy")
    else:
        result.append("Regularly busy")
        

In [18]:
#Add result to dataframe
df_orders_products['busiest_day'] = result

In [19]:
#Frequency of busiest day
df_orders_products['busiest_day'].value_counts(dropna = False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

## 4.0 Task 

### 4.1 Create Groups for Order Days

In [20]:
#Create a list for days scale
grouped_result = []

for value in df_orders_products['order_day_of_week']:
    if value == 0 or value == 1:
        grouped_result.append('Busiest Day')
    elif value == 4 or value == 3:
        grouped_result.append('Least Busy Day')
    else: 
        grouped_result.append('Regularly Busy Day')
        

In [21]:
#Add results to dataframe
df_orders_products['grouped_days'] = grouped_result

In [22]:
#Frequency of groupings
df_orders_products['grouped_days'].value_counts(dropna = False)

Regularly Busy Day    12916111
Busiest Day           11864412
Least Busy Day         7624336
Name: grouped_days, dtype: int64

Of note in the results is that the two busiest days of the week, Saturday and Sunday, are within 10% of the three days of the week that are regularly busy.  Also, we know from this information that 40% of the Instacart orders are placed on Saturday and Sunday.  Conversely, the least popular days of the week, Tuesday and Wednesday, account for less than 24% of orders.

### 4.2 Creating Busiest Hours of Day

In [23]:
#Determine which hours are busiest
df_orders_products['order_hour_of_day'].value_counts(dropna = False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

In [24]:
#Create list for busiest hours
hours_result = []

for value in df_orders_products['order_hour_of_day']:
    if value == 10:
        hours_result.append('Most Orders')
    elif value == 3:
        hours_result.append('Fewest Orders')
    else:
        hours_result.append('Average Orders')

In [25]:
#Add results to dataframe
df_orders_products['busiest_period_of_day'] = hours_result

In [26]:
#Frequency of busiest hours
df_orders_products['busiest_period_of_day'].value_counts(dropna = False)

Average Orders    29591818
Most Orders        2761760
Fewest Orders        51281
Name: busiest_period_of_day, dtype: int64

## 5.0 Export Data

In [27]:
#Export updated dataframe as a pickle
df_orders_products.to_pickle(os.path.join(project_path,'02 02 Prepared Data', 'orders_products_grouped.pkl'))