# 4.7 Deriving New Variables

### 01. Import libraries and files

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# shortcut for importing dataframes: base data folder that every input/output file path is built from
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it before running elsewhere
path = r"C:\Users\Asus\Documents\DA CareerFoundry\Part II - Data Immersion\Python - Anaconda\August 2025 Instacart Basket Analysis\02 Data"

In [3]:
# importing pkl file (note that this function is different from the one used for csv files);
# the file path is assembled with os.path.join instead of hard-coded backslashes
ords_prods_merge = pd.read_pickle(os.path.join(path, 'Prepared Data', 'ords_prods_merge.pkl'))

### 02. Defining Conditions with if-else statements

In [9]:
# Work on a sub set made of the first million rows.
# Taking a .copy() makes df its own dataframe, which avoids warnings when columns are added later.
df = ords_prods_merge.iloc[:1_000_000].copy()

In [10]:
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,match
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,both
1,2539329,1,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both
2,2539329,1,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,both
3,2539329,1,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,both
4,2539329,1,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both


In [11]:
df.shape

(1000000, 15)

In [37]:
# defining conditions with if-else for the new function:

def price_label(row):
    """Return the price-range label for one row.

    Parameters
    ----------
    row : mapping-like
        Anything that supports ``row['prices']`` (e.g. a dataframe row).

    Returns
    -------
    str
        'Low-range product' for prices up to and including 5,
        'Mid-range product' for prices above 5 up to and including 15,
        'High range' for prices above 15,
        'Not enough data' when none of the comparisons hold (e.g. a NaN price).
    """
    price = row['prices']

    if price <= 5:
        return 'Low-range product'
    # A price that reaches this point is not <= 5, so only the upper bound needs checking.
    if price <= 15:
        return 'Mid-range product'
    if price > 15:
        return 'High range'
    # Every comparison against NaN is False, so missing prices end up here.
    return 'Not enough data'

In [13]:
# df['price_range'] creates a new column within the dataframe (df).
# axis=1 makes apply() pass each row to the function; axis=0 would pass each column instead.
# apply(price_label) therefore runs the price_label function once for every observation (row).
# let's do it:

df['price_range'] = df.apply(price_label, axis=1)

In [14]:
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    673183
Low-range product    314392
High range            12425
Name: count, dtype: int64

In [15]:
df.shape

(1000000, 16)

In [16]:
# checking the most expensive product in the subset
df['prices'].max()

99999.0

### 03. Loc Function with several conditions

In [17]:
# creating conditions with loc function: df.loc[row filter, column] = value
# (each filter below selects one price band, so every row receives exactly one label)

df.loc[df['prices'].gt(15), 'price_range_loc'] = 'High-range product'
df.loc[df['prices'].gt(5) & df['prices'].le(15), 'price_range_loc'] = 'Mid-range product'
df.loc[df['prices'].le(5), 'price_range_loc'] = 'Low-range product'

When entering this code into Jupyter, put each condition in its own cell; otherwise an error will occur.

Alternatively, create the subset with .copy() (as done above); then there is no need to write each condition in a different cell.

In [18]:
# prices above 15
df.loc[df['prices'].gt(15), 'price_range_loc'] = 'High-range product'

In [19]:
# prices above 5, up to and including 15
df.loc[df['prices'].gt(5) & df['prices'].le(15), 'price_range_loc'] = 'Mid-range product'

In [20]:
# prices up to and including 5
df.loc[df['prices'].le(5), 'price_range_loc'] = 'Low-range product'

In [21]:
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     673183
Low-range product     314392
High-range product     12425
Name: count, dtype: int64

### Reasons to use loc function:
Using loc() won’t result in a warning message. While this won’t actually interfere with your work, it’s still a sign that, for whatever reason, Python thinks you should be doing something different. 

Second, the loc() method runs much faster: it applies the conditional filters before searching through the dataframe, while your user-defined function searches through the entire dataframe and then determines where to set the filters (remember axis = 1?).

Third, the loc() function allows you to filter the entire dataframe rather than just a subset.

In [22]:
# now let's apply it to the entire dataframe, starting with prices above 15:

ords_prods_merge.loc[ords_prods_merge['prices'].gt(15), 'price_range_loc'] = 'High-range product'

In [23]:
# prices above 5, up to and including 15
ords_prods_merge.loc[ords_prods_merge['prices'].gt(5) & ords_prods_merge['prices'].le(15), 'price_range_loc'] = 'Mid-range product'

In [24]:
# prices up to and including 5
ords_prods_merge.loc[ords_prods_merge['prices'].le(5), 'price_range_loc'] = 'Low-range product'

In [25]:
ords_prods_merge['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

Thanks to loc(), you can now filter the entire dataframe rather than just a subset. If you’d tried to do the same thing with your user-defined function, you would likely have received a memory error, but not so with loc(). That’s the power of Python!

### 04. For-Loops

For-loops, as their name implies, are loops for running the same block of code multiple times. They’re used to perform the same function on multiple elements, for instance, by running through an entire dataframe and performing a function on each row within that dataframe.

In [26]:
# example: print one line for every age from 30 up to (but not including) 45
for age in range(30, 45):
    print(f'My age is {age}')

My age is 30
My age is 31
My age is 32
My age is 33
My age is 34
My age is 35
My age is 36
My age is 37
My age is 38
My age is 39
My age is 40
My age is 41
My age is 42
My age is 43
My age is 44


To start, you need to know on which day most orders take place. You can find this out by printing the frequency of the “orders_day_of_week” column, which would look like this:

In [27]:
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

Now, you have some numbers. But what do these numbers mean? In your project brief, you can see that the value 0 means Saturday. This value has the highest frequency, meaning Saturday is the busiest day. Meanwhile, the 4 value has the lowest frequency. A value of 4, here, refers to Wednesday, meaning Wednesday is the slowest day for Instacart app orders.

You want to use this information to create a new column, “busiest day,” that will contain one of three different values: “Busiest day,” “Least busy,” and “Regularly busy.” This can be done using a for-loop. The loop will run through every row in the “orders_day_of_week” column, compare its value with what you know are the busiest and slowest days, and assign it the corresponding string value.

### 04.1: The if-statements with For-Loops

In [28]:
# the code looks like this:
# weekday 0 gets the "Busiest day" label, weekday 4 gets "Least busy",
# and every other value falls back to "Regularly busy".

day_labels = {0: "Busiest day", 4: "Least busy"}

result = []

for value in ords_prods_merge["orders_day_of_week"]:
    result.append(day_labels.get(value, "Regularly busy"))

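`result` starts as an empty list that collects the label produced for each row.
The for-loop then goes through every value in the "orders_day_of_week" column and appends the matching label to the list.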

In [None]:
# preview only the first few labels: `result` holds one entry per row of the dataframe,
# which is far too many to display in full
result[:10]

In [30]:
# let's now combine this list with the dataframe:
# `result` holds one label per row, in the same order as the rows, so it can be assigned directly as a new column

ords_prods_merge['busiest_day'] = result

In [31]:
ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

## Task Submission

#### Question:
Suppose your clients have changed their minds about the labels you created in your “busiest_day” column. Now, they want “Busiest day” to become “Busiest days” (plural). This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they’d also like to know the two slowest days. Create a new column for this using a suitable method.

#### Answer:

In [32]:
# checking the affluence of orders throughout the week:
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In the project brief, the value 0 means Saturday. This value has the highest frequency, meaning Saturday is the busiest day, followed by Sunday. Meanwhile, the 4 value has the lowest frequency. A value of 4, here, refers to Wednesday, meaning Wednesday is the slowest day for Instacart app orders. The other slowest day is Tuesday.

#### Approach with loc function and its conditions

In [38]:
# the code looks like this (one .loc assignment per group of weekdays):

# two busiest days: 0 and 1
ords_prods_merge.loc[ords_prods_merge['orders_day_of_week'].eq(0) | ords_prods_merge['orders_day_of_week'].eq(1), 'orders_affluence'] = 'Busiest days'

# two slowest days: 4 and 3
ords_prods_merge.loc[ords_prods_merge['orders_day_of_week'].eq(4) | ords_prods_merge['orders_day_of_week'].eq(3), 'orders_affluence'] = 'Least busy days'

# remaining days: 6, 2 and 5
ords_prods_merge.loc[ords_prods_merge['orders_day_of_week'].eq(6) | ords_prods_merge['orders_day_of_week'].eq(2) | ords_prods_merge['orders_day_of_week'].eq(5), 'orders_affluence'] = 'Regularly busy'

#### Approach with loc function, its conditions and .isin function

This would be an easier way to write it:

.isin([]) saves a lot of typing, because it checks each value against a whole list at once.

```python
ords_prods_merge.loc[ords_prods_merge['orders_day_of_week'].isin([0, 1]), 'orders_affluence'] = 'Busiest days'
ords_prods_merge.loc[ords_prods_merge['orders_day_of_week'].isin([3, 4]), 'orders_affluence'] = 'Least busy days'
ords_prods_merge.loc[ords_prods_merge['orders_day_of_week'].isin([2, 5, 6]), 'orders_affluence'] = 'Regularly busy'
```


#### Approach with For Loops and if-else statements

This was learned on the course.
```python
results = []

for value in ords_prods_merge["orders_day_of_week"]:
    if value in (0, 1):
        results.append("Busiest days")
    elif value in (3, 4):
        results.append("Slowest days")
    else:
        results.append("Regular days")
```

Then, assigning the resulting list to the dataframe as a new column:
```python
ords_prods_merge['orders_affluence'] = results
```

##### Problems with this approach:
It goes row by row, meaning it's much slower when there's a huge dataframe, which is the case.

It uses more RAM, it's less efficient. 

So, avoid it.

#### Approach with .map function (Easy and most RAM efficient)

This was not learned in the course.

this would be the most efficient way in terms of RAM usage:
```python
day_map = {
    0: 'Busiest days', 
    1: 'Busiest days',
    3: 'Least busy days',
    4: 'Least busy days',
    2: 'Regularly busy',
    5: 'Regularly busy',
    6: 'Regularly busy'
}

ords_prods_merge['orders_affluence'] = ords_prods_merge['orders_day_of_week'].map(day_map)
```

In [42]:
# checking values:
ords_prods_merge['orders_affluence'].value_counts(dropna = False)

orders_affluence
Regularly busy     12916111
Busiest days       11864412
Least busy days     7624336
Name: count, dtype: int64

In [43]:
ords_prods_merge.shape

(32404859, 18)

#### Question:
When too many users make Instacart orders at the same time, the app freezes. The senior technical officer at Instacart wants you to identify the busiest hours of the day. Rather than by hour, they want periods of time labeled “Most orders,” “Average orders,” and “Fewest orders.” Create a new column containing these labels called “busiest_period_of_day.”

#### Answer:

In [44]:
# checking frequencies:
ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [48]:
# Defining periods of time: the 24 hours are split into three groups of 8 (24 / 3 = 8),
# following the frequency ranking above — the 8 busiest hours, the next 8, and the 8 quietest.
most_orders = [10, 11, 14, 15, 13, 12, 16, 9]
average_orders = [17, 8, 18, 19, 20, 7, 21, 22]
fewest_orders = [23, 6, 0, 1, 5, 2, 4, 3]

Corrected version from tutor Saurabh:
```python
result3 = []

for value in ords_prods_merge['order_hour_of_day']:
  if value >= 9 and value <= 16:
    result3.append("Most Orders")
  elif value >= 0 and value <=6:
    result3.append("Fewest Orders")
  else:
    result3.append("Average Orders")
```
Saurabh/Daniela

**Note:** What I've done is not wrong, but I could have taken this approach.

In [49]:
# let's try the loc function: one assignment per group of hours,
# pairing each hour list with the label its rows should receive
for hours, label in (
    (most_orders, 'Most orders'),
    (average_orders, 'Average orders'),
    (fewest_orders, 'Fewest orders'),
):
    ords_prods_merge.loc[ords_prods_merge['order_hour_of_day'].isin(hours), 'busiest_period_of_day'] = label

#### Approach with .map

```python
hour_map = {hour: 'Most orders' for hour in most_orders}
hour_map.update({hour: 'Average orders' for hour in average_orders})
hour_map.update({hour: 'Fewest orders' for hour in fewest_orders})
```

And then applying it:

```python
ords_prods_merge['busiest_period_of_day'] = ords_prods_merge['order_hour_of_day'].map(hour_map)
```

#### Question:
Print the frequency for this new column.

#### Answer:

In [50]:
# verifying the results; dropna = False also counts missing labels,
# so any hour that was not assigned to a period would show up here
ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

In [51]:
ords_prods_merge.shape

(32404859, 19)

#### Exporting new data frames

In [52]:
# Export the dataframe, together with the newly derived columns, to a pkl file

export_path = os.path.join(path, 'Prepared Data', 'ords_prods_merge_variables.pkl')
ords_prods_merge.to_pickle(export_path)

In [55]:
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', '_merge', 'product_name', 'aisle_id',
       'department_id', 'prices', 'match', 'price_range_loc', 'busiest_day',
       'orders_affluence', 'busiest_period_of_day'],
      dtype='object')

In [54]:
ords_prods_merge.shape

(32404859, 19)