# Import libraries, merged dataset, and subset the dataframe df

In [1]:
import pandas as pd
import numpy as np 
import os 

path = r'C:\Users\ctede\OneDrive\Desktop\Instacart Basket Analysis'
df_ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))
df = df_ords_prods_merged[:1000000]

# If-Statements using loc()

In [2]:
#High-range product
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range-product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range-product'


In [3]:
#Mid-range product
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [4]:
#Low-range product 
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [5]:
#Show frequency of each price range high, mid, low
df['price_range_loc'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

### loc() function locates a particular column in the dataframe it has been assigned to. The logical operators (<, >, =) is used to create a condition. There is no explicit "if" in the statements bc the "if" is implied. 
### The loc() function is being called on the df dataframe and you are comparing the values in the 'prices' column to the condition (i.e. > 15) = "if the values in the 'prices' column of the df dataframe are greater than 15". The ',' indicates 'then'. After the ',' we are creating the 'price_range_loc' column and assigning a value of 'high', 'mid', or 'low' depending on if the condition is met. 

### The loc() function runs faster than the user-defined bc it applies the conditional filters before searching through the dataframe. The user-defined function searches first, then applies the filter. 

# Apply the loc() function to the entire ords_prods_merged dataframe

In [6]:
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices
0,2539329,1,prior,1,2,8,0.0,196,1,0,both,Soda,77,7,9.0
1,2398795,1,prior,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0
2,473747,1,prior,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0
3,2254736,1,prior,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0
4,431534,1,prior,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0


In [7]:
#High-range products
df_ords_prods_merged.loc[df_ords_prods_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [8]:
#Mid-range products
df_ords_prods_merged.loc[(df_ords_prods_merged['prices'] <= 15) & (df_ords_prods_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [9]:
#Low-range products
df_ords_prods_merged.loc[df_ords_prods_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [10]:
#Show frequency of each price range
df_ords_prods_merged['price_range_loc'].value_counts(dropna = False)

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range_loc, dtype: int64

### If tried using the user-defined function method, may have ran into a memory error. The loc() function is more efficient. 

# If-Statements using For-Loops

### For-loop example

In [11]:
for x in range (30, 45): 
    print("My age is %d" % (x))

My age is 30
My age is 31
My age is 32
My age is 33
My age is 34
My age is 35
My age is 36
My age is 37
My age is 38
My age is 39
My age is 40
My age is 41
My age is 42
My age is 43
My age is 44


In [12]:
#Print the frequency of orders_day_of_the_week column/variable
df_ords_prods_merged["order_dow"].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: order_dow, dtype: int64

### Create  a "busiest day" column using a for loop that runs through the "orders_dow" column 

In [13]:
result = [] #acts as an empty shell to place the results from the for-loop
for value in df_ords_prods_merged["order_dow"]:
    if value ==0: #Saturday is the busiest day. "value" is a placeholder 
        result.append("Busiest day")
    elif value ==4: #Wednesday is the least busiest day
        result.append("Least busy")
    else: 
        result.append("Regularly busy") #If neither of the above conditions are met, "Regularly busy" is returned

#For-loop --> telling Python that for each value within the order_dow column, you want to do something. 

### Benefits of using a for-loop vs. user-defined function: for-loop only looping through one column of the df = greatly speeds up performance. The user-defined function goes through the entire dataframe. 

In [14]:
#Print the "result"
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Reg

### "Result" is a long list of an entry for every single row within the dataframe. Need to combine it with the df_ords_prods_merged dataframe

In [15]:
#Crete a new column called "busiest day" and set it equal to "result" 
df_ords_prods_merged['busiest_day'] = result 

In [16]:
#Show frequency of the values in the "busiest_day" column 
df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

In [17]:
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day
0,2539329,1,prior,1,2,8,0.0,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy
1,2398795,1,prior,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy
2,473747,1,prior,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy
3,2254736,1,prior,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy
4,431534,1,prior,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy


# Task 4.7

## 2 Clients changed their minds about the labels you created in your "busiest day" column. They want it to become "Busiest days" plural. This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they'd also like to know the two slowest days. Create a new column for this using a suitable method. 

In [18]:
result2 = []
for value in df_ords_prods_merged["order_dow"]:
    if value ==0 or value ==1: 
        result2.append("Busiest days")
    elif value ==4 or value ==3: 
        result2.append("Slowest days")
    else: 
        result2.append("Regularly busy")

In [19]:
df_ords_prods_merged['new_busiest_days'] = result2

## 3 Check the values of this new column (new_busiest_days) for accuracy. Note any observations in markdown format. 

In [20]:
df_ords_prods_merged['new_busiest_days'].value_counts(dropna = False)

Regularly busy    12916111
Busiest days      11864412
Slowest days       7624336
Name: new_busiest_days, dtype: int64

In [21]:
df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

In [22]:
df_ords_prods_merged['order_dow'].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: order_dow, dtype: int64

### The numbers from the order_dow, busiest_day, and new_busiest_days columns correspond with one another. There are more regularly busy days than busiest days. The regularly busy days are composed of dow values 2, 5, and 6 (Monday, Thursday, and Friday). 

## 4 The senior technical officer at Instacart wants you to identify the busiest hours of the day. Rather than by hour, they want periods of time labeled "Most orders", "Average orders", and "Fewest orders". Create a new column containing these labels called "busiest_period_of_day".

In [23]:
df_ords_prods_merged['order_hour_of_day'].value_counts(dropna = False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

### The busiest hour of day is 10am, the slowest hour of the day is 3am. 

In [24]:
#Create conditions for "Most", "Average", and "Fewest" orders
result = []
for value in df_ords_prods_merged["order_hour_of_day"]:
    
    #Most orders (hours 10,11,14,15,13,12,16,9,17)
    if value >=10 and value <=17: #To include the top hours with most orders.
        result.append("Most orders")
    
    #Average orders (hours 8,18,19,20,7,21,22,23)
    elif value ==7 or value ==8 or value >=18 and value <=23: #To include the orders in the middle 
        result.append("Average orders")
        
    else: 
        result.append("Fewest orders") #To include the remaining hours of the day (hours 6,0,1,5,2,4,3)

In [25]:
#Create the new column name and set equal to result
df_ords_prods_merged['busiest_period_of_day'] = result

In [26]:
df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,new_busiest_days,busiest_period_of_day
0,2539329,1,prior,1,2,8,0.0,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders
1,2398795,1,prior,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Average orders
2,473747,1,prior,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Most orders
3,2254736,1,prior,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Average orders
4,431534,1,prior,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most orders


## 5 Print the frequency for this new column busiest_period_of_day

In [27]:
df_ords_prods_merged['busiest_period_of_day'].value_counts(dropna = False)

Most orders       20751522
Average orders     8312313
Fewest orders      3341024
Name: busiest_period_of_day, dtype: int64

### Most orders are between 10am and 5pm. The average orders are around 7 and 8am, and between 6pm and 11pm. The fewest orders are midnight to 6am. 

## 7 Export as a pickle file

In [29]:
df_ords_prods_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_new_columns.pkl'))