# TOC
1. Prep & Import   
2. Create price buckets   
3. Create busiest days flag   
4. Create busies hours flag   
5. Export pickle   

## Prep

In [1]:
# Importing libraries
import pandas as pd
import numpy as nm
import os

In [2]:
# Importing data
path = r'C:\Users\Ryzen RGB Madness!!!\Instacart Basket Analysis'

In [3]:
df_ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'Cleaned', 'orders_products_merged.pkl'))

In [4]:
df_ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0


## Exercise

#### Setting up price buckets

In [5]:
# defining df for if-then
df = df_ords_prods[:1000000]

In [6]:
# Creating price range function
def price_label(row):
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High range'
    else: return 'Not enough data'

In [7]:
# Applying the price range function
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [8]:
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    756450
Low-range product    243550
Name: count, dtype: int64

In [9]:
df['prices'].max()

14.8

In [10]:
# Running price range function on whole dataframe via loc()
df_ords_prods.loc[df_ords_prods['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [11]:
df_ords_prods.loc[(df_ords_prods['prices'] <= 15) & (df_ords_prods['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [12]:
df_ords_prods.loc[df_ords_prods['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [13]:
df_ords_prods['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

In [14]:
#Creating busiest_day for-loop
df_ords_prods['order_day_of_week'].value_counts(dropna = False)

order_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [15]:
result = []

for value in df_ords_prods['order_day_of_week']:
    if value == 0:
        result.append("Busiest day")
    elif value == 4:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

In [16]:
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Reg

In [17]:
df_ords_prods['busiest_day'] = result

In [18]:
df_ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy


In [19]:
df_ords_prods['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

## Task

#### 4.7.2 - Creating 'Busiest Days' Column

In [20]:
# Updating busiest_day column to busiest days
df_ords_prods['order_day_of_week'].value_counts(dropna = False)

order_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [21]:
result = []

for value in df_ords_prods['order_day_of_week']:
    if value == 0 or value == 1:
        result.append("Busiest day")
    elif value == 4 or value == 3:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

In [22]:
df_ords_prods['busiest_days'] = result

#### 4.7.3 - Checking 'Busiest Days' for Consistency

In [23]:
df_ords_prods['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12916111
Busiest day       11864412
Least busy         7624336
Name: count, dtype: int64

Verifying here that the regularly busy label has fewer occurrences, and busiest/least busiest both have more (ROUGHLY twice as many, since we're adding another value to both). 

In [24]:
df_ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Least busy
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Least busy


#### 4.7.4 - Identifying the busiest hours of the day

In [25]:
# Finding busiest times of day
df_ords_prods['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [26]:
# Aggregating busiest period data
resulthr = []

for value in df_ords_prods['order_hour_of_day']:
    if value == 10 or value == 11 or value == 14:
        resulthr.append("Most orders")
    elif value == 3 or value == 4 or value == 2:
        resulthr.append("Fewest orders")
    else:
        resulthr.append("Average orders")

In [27]:
# Creating column for busiest hours
df_ords_prods['busiest_period_of_day'] = resulthr

#### 4.7.5 - Frequency for Busiest Period of Day column

In [28]:
# Checking frequency for busiest_period_of_day
df_ords_prods['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Average orders    24043925
Most orders        8187036
Fewest orders       173898
Name: count, dtype: int64

In [29]:
df_ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Average orders
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Least busy,Average orders
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders


In [30]:
# Removing extraneous columns
df_ords_prods.drop(columns=['_merge', 'busiest_day'])

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Average orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Average orders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,1320836,202557,17,2,15,1.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range product,Regularly busy,Average orders
32404855,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,Low-range product,Regularly busy,Most orders
32404856,758936,203436,1,2,7,,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,Mid-range product,Regularly busy,Average orders
32404857,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,Mid-range product,Least busy,Average orders


In [31]:
# Saving pickle
df_ords_prods.to_pickle(os.path.join(path, '02 Data', 'Cleaned', 'orders_products_busiest.pkl'))

In [33]:
df_ords_prods['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64