### Contents:
    01 Importing libraries and data
    02 Define df and functions
    03 Derive new variables
        a practice with a subset
        b using whole dataframe
    04 Deriving more variables
        a derive using a function
        b value_counts from derivation
        c derive again
        d value_counts post derivation
    05 Exporting

# Exercise 4.7

## 01 Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
path = r'/Users/Emily/Documents/CF Data Analysis Program/Immersion 4/Instacart Basket Analysis'

In [3]:
ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'prepared data', 'orders_products_merged.pkl'))

## 02 Define dfs and functions

In [5]:
df = ords_prods[:1000000]

In [6]:
# user-defined function to identify ranges of prices per product

def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

## 03 Derive new variables!

### a) first, with the subset of 1M rows

In [7]:
# apply the user-defined function across all rows (axis = 1)
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [8]:
# check to see what values it added to the new column
df['price_range'].value_counts(dropna = False)

Mid-range    756450
Low-range    243550
Name: price_range, dtype: int64

In [9]:
# no high-range priced products in the subset
df['prices'].max()

14.8

In [10]:
# messing around with .loc function
df.loc[df['price_range'] == 'Mid-range',['prices']]

Unnamed: 0,prices
0,9.0
1,9.0
2,9.0
3,9.0
4,9.0
...,...
999995,7.5
999996,7.5
999997,7.5
999998,7.5


In [11]:
# still getting an error message because this condition doesn't exist?
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range'


In [12]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range' 

In [13]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range'

### b) now, with the whole dataframe

In [14]:
# no error this time, which means there should be some products with $15+ prices
ords_prods.loc[ords_prods['prices'] > 15, 'price_range_loc'] = 'High-range'

In [16]:
ords_prods.loc[(ords_prods['prices'] <= 15) & (ords_prods['prices'] > 5), 'price_range_loc'] = 'Mid-range'

In [17]:
ords_prods.loc[ords_prods['prices'] <= 5, 'price_range_loc'] = 'Lo-range'

In [18]:
# check for frequencies
ords_prods['price_range_loc'].value_counts(dropna = False)

Mid-range     21860868
Lo-range      10126324
High-range      417678
NaN              30200
Name: price_range_loc, dtype: int64

In [22]:
# create an empty list, then step through the orders_day_of_week column to identify saturday (0) and wednesday (4)
result = []

for value in ords_prods["order_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [23]:
# set the result equal to a new column
ords_prods['busiest_day'] = result

In [24]:
# check to see if the value counts match the order_day_of_week column
ords_prods['busiest_day'].value_counts(dropna = False)

Regularly busy    22437999
Busiest day        6209808
Least busy         3787263
Name: busiest_day, dtype: int64

In [25]:
ords_prods['order_day_of_week'].value_counts(dropna = False)

0.0    6209808
1.0    5665951
6.0    4500391
2.0    4217868
5.0    4209603
3.0    3844175
4.0    3787263
NaN         11
Name: order_day_of_week, dtype: int64

In [49]:
# rename price_range column
ords_prods.rename(columns={'price_range_loc':'price_range'}, inplace = True)

## 04 Task 4.7 - Deriving more variables

### a) Step 2 - using a function

In [27]:
# redefine result as an empty list, step through order_day_of_week and identify 2 busiest and least busy days
result = []

for value in ords_prods["order_day_of_week"]:
  if value == 0 or value == 1:
    result.append("Busiest days")
  elif value == 4 or value == 3:
    result.append("Least busy days")
  else:
    result.append("Regularly busy")

In [28]:
ords_prods['busiest_days'] = result

### b) Step 3 - value counts from derivation

In [29]:
# values check
ords_prods['busiest_days'].value_counts(dropna = False)

Regularly busy     12927873
Busiest days       11875759
Least busy days     7631438
Name: busiest_days, dtype: int64

In [30]:
# busiest days should be Saturday + Monday
6209808 + 5665951

11875759

In [31]:
# least busy days should be Tuesday + Wednesday
3844175 + 3787263

7631438

### c) Step 4 - More variable derivation

In [32]:
ords_prods.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', '_merge', 'price_range_loc',
       'busiest_day', 'busiest_days'],
      dtype='object')

In [33]:
# frequency check of orders during all hours of the day
ords_prods['order_hour_of_day'].value_counts(dropna = False)

10.0    2764476
11.0    2738647
14.0    2691598
15.0    2664583
13.0    2663346
12.0    2620898
16.0    2537506
9.0     2456751
17.0    2089510
8.0     1719991
18.0    1637956
19.0    1259416
20.0     977049
7.0      891951
21.0     796379
22.0     634743
23.0     402621
6.0      290796
0.0      218951
1.0      115787
5.0       88064
2.0       69435
4.0       53284
3.0       51321
NaN          11
Name: order_hour_of_day, dtype: int64

In [42]:
# reset result to empty list, step through order_hour_of_day and estabish periods of busy-ness
result = []

for value in ords_prods["order_hour_of_day"]:
  if value >= 10 and value < 16:
    result.append("most orders")
  elif value >= 16 and value < 21:
    result.append("average orders")
  elif value >= 7 and value < 10:
    result.append("average orders")
  elif value >= 0 and value < 7:
    result.append("fewest orders")
  elif value >= 21 and value < 24:
    result.append("fewest orders")  
  elif pd.isna(value):
    result.append(np.nan)  
  else:
    result.append('huh')

In [43]:
ords_prods['busiest_period_of_day'] = result

### d) Step 5 - value counts from derivation

In [45]:
# check frequency
ords_prods['busiest_period_of_day'].value_counts(dropna = False)

most orders       16143548
average orders    13570130
fewest orders      2721381
NaN                     11
Name: busiest_period_of_day, dtype: int64

In [46]:
# most orders should be the summation of hours 10 through 15
2764476 + 2738647 + 2691598 + 2664583 + 2663346 + 2620898

16143548

In [47]:
ords_prods.shape

(32435070, 19)

In [50]:
ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day
0,2539329.0,1.0,1.0,2.0,8.0,,True,196,1.0,0.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Regularly busy,average orders
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,average orders
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,most orders
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,average orders
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,most orders


## 05 Exporting 

In [51]:
ords_prods.to_pickle(os.path.join(path, '02 Data', 'prepared data', 'orders_products_merged_2.pkl'))