# Importing the Libraries

In [1]:
#Importing pandas 
import pandas as pd

In [2]:
#Importing NumPy
import numpy as np 

In [3]:
#Importing os 
import os

# Instacart Basket Analysis Path

In [7]:
#Project root folder. The location is machine-specific, so it can be overridden
#by setting the INSTACART_PATH environment variable before starting the kernel;
#when the variable is not set, the original hard-coded folder is used.
path = os.environ.get(
    'INSTACART_PATH',
    r'C:\Users\ctede\OneDrive\Desktop\Instacart Basket Analysis',
)

In [8]:
#Echo the project folder to confirm the value stored in "path"
path

'C:\\Users\\ctede\\OneDrive\\Desktop\\Instacart Basket Analysis'

### Products.csv Dataframe

In [10]:
#Read products.csv from the "Original Data" folder under the project path.
#nrows limits the import to the first 1000 rows of the file.
df_prods = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    nrows=1000,
)

### Orders.csv Dataframe

In [11]:
#Read orders.csv from the "Original Data" folder under the project path.
#nrows limits the import to the first 1000 rows of the file.
df_ords = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    nrows=1000,
)

In [12]:
#Preview the orders dataframe without the "eval_set" column.
#drop() returns a new dataframe and the result is not assigned here,
#so df_ords itself still contains "eval_set" after this cell.
df_ords.drop(columns='eval_set')

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
995,2302602,64,7,5,20,9.0
996,3215917,64,8,6,17,15.0
997,2754807,64,9,6,15,7.0
998,3129760,64,10,1,10,9.0


# Dropping columns

In [13]:
#Frequency of every value in "days_since_prior_order";
#dropna=False keeps missing values (NaN) as their own row in the counts
df_ords.loc[:, 'days_since_prior_order'].value_counts(dropna=False)

7.0     90
30.0    88
6.0     74
4.0     65
NaN     64
5.0     61
8.0     60
1.0     47
3.0     44
2.0     38
14.0    38
9.0     37
10.0    34
11.0    30
13.0    29
15.0    23
12.0    23
0.0     19
28.0    18
21.0    15
20.0    14
17.0    13
16.0    12
27.0    12
22.0    11
18.0     9
19.0     9
24.0     7
26.0     6
29.0     5
25.0     3
23.0     2
Name: days_since_prior_order, dtype: int64

# Renaming columns

In [17]:
#Rename "order_dow" to "orders_day_of_week".
#The renamed dataframe is assigned back to df_ords instead of using inplace=True:
#the result is the same for the following cells, and the cell stays safe to re-run.
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_week'})

In [18]:
#Show the first five rows to check the renamed column
df_ords.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


# Changing data types

In [19]:
#Convert the "order_id" column to the string data type and store it back in df_ords
df_ords['order_id'] = df_ords['order_id'].astype(str)

In [20]:
#Check the data type now stored for the "order_id" column
df_ords.dtypes['order_id']

dtype('O')

# Transposing Data

### Importing the Departments Dataset

In [21]:
#Read the complete departments.csv from the "Original Data" folder under the project path
df_dep = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'departments.csv'),
)

In [22]:
#View the first rows (at most five) of the departments dataframe
df_dep.head(n=5)

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


### Transposing the dataframe

In [24]:
#Transpose the departments dataframe (rows become columns and vice versa).
#The result is stored in a new variable, df_dep_T; df_dep is not overwritten.
df_dep_T = df_dep.transpose()

In [28]:
#Display the transposed departments dataframe
df_dep_T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


### Adding an index

In [29]:
#Preview df_dep_T with a fresh numeric index; the old index becomes a column named "index".
#reset_index() returns a new dataframe and the result is not assigned here,
#so df_dep_T keeps its original index after this cell.
df_dep_T.reset_index(drop=False)

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


### New Header

In [30]:
#Take the values in the first row (position 0) of df_dep_T and store them
#in the variable "new_header"
new_header = df_dep_T.iloc[0, :]

In [31]:
#Display the row that will be used as the new header
new_header

0    department
Name: department_id, dtype: object

### Copy the dataframe

In [32]:
#Build df_dep_T_new from df_dep_T, skipping the first row (position 0)
#and keeping every row from position 1 onward
df_dep_T_new = df_dep_T.iloc[1:]

In [33]:
#Display the dataframe without its former first row
df_dep_T_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


### Add the new_header variable as the new header for the dataframe

In [34]:
#Replace the column labels of df_dep_T_new with the values held in new_header
#(the single column label 0 becomes "department")
df_dep_T_new.columns = new_header

In [35]:
#Display the dataframe to check the new header
df_dep_T_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# Creating a data dictionary 

In [36]:
#Turn the df_dep_T_new dataframe into a data dictionary.
#orient='index' produces one entry per row: {index label: {column name: value}}
data_dict = df_dep_T_new.to_dict(orient='index')

In [37]:
#Display the data dictionary
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

### View the products dataframe 

In [38]:
#Show the first five rows of the products dataframe
df_prods.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [39]:
#What does the department id '19' stand for?
#The keys of data_dict are strings, so the id is looked up as '19' and not as the number 19;
#get() returns None instead of raising an error when the key is absent
print(data_dict.get('19'))

{'department': 'snacks'}


# Subsetting

### Searching for specific values (i.e. filtering)

In [40]:
#Test every row for a "department_id" of 19.
#The result is a boolean Series: True where the row meets the criterion, False otherwise
df_prods['department_id'].eq(19)

0       True
1      False
2      False
3      False
4      False
       ...  
995    False
996     True
997    False
998    False
999    False
Name: department_id, Length: 1000, dtype: bool

### Return only department ids = 19

In [41]:
#Use the boolean test as a filter: only rows with a "department_id" of 19 are returned
df_prods[df_prods['department_id'].eq(19)]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
974,974,Organic 54% Cocoa Dark Chocolate Bar,45,19,4.4
983,983,Strawberry Pomegranate,3,19,3.0
985,985,Peanut Butter Bars,3,19,6.4
988,988,Cranberrry Almond Soft & Chewy Granola Bar,3,19,6.3


### Create a subset that only contains data from the snacks dept. 

In [42]:
#The snacks department has a department id of 19, which is why the filter uses 19
df_snacks = df_prods[df_prods['department_id'].eq(19)]

In [43]:
#Show the first five rows of the snacks subset
df_snacks.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


### Using the loc indexer to filter 

In [44]:
#Same subset as above, built with loc: select the rows where "department_id" is 19
#and keep all columns
df_snacks_2 = df_prods.loc[df_prods['department_id'].eq(19), :]

In [45]:
#Show the first five rows of the loc-based snacks subset
df_snacks_2.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# Exporting the wrangled dataframe 

In [48]:
#Export of the orders dataframe (currently disabled: the to_csv call below is commented out, so no file is written)
#df_ords.to_csv(os.path.join(path,'02 Data', 'Prepared Data',
                          #  'orders_wrangled.csv'))