# Import libraries and data sets

In [5]:
import pandas as pd
import numpy as np
import os 

# Project root folder. The default is the original author's local folder; set the
# INSTACART_PROJECT_DIR environment variable to run the notebook on another machine
# without editing this cell.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\ctede\OneDrive\Desktop\Instacart Basket Analysis')

# Load the products table from 'Original Data' and the wrangled orders table from
# 'Prepared Data'. index_col=False stops pandas from using the first column as the row index.
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df_ords
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


# Create a test dataframe

In [9]:
# Start from an empty dataframe; the mixed-type column is added to it in the next cell
df_test = pd.DataFrame()

In [11]:
# Add a column that deliberately mixes Python types: two strings, an integer and a boolean
df_test['mix'] = ['a', 'b', 1, True]
df_test

Unnamed: 0,mix
0,a
1,b
2,1
3,True


# Check for mixed data types 

In [12]:
# Print the name of every column whose values are not all of one Python type.
# Series.map(type) is used instead of DataFrame.applymap(type), which is deprecated
# since pandas 2.1; counting distinct types also avoids indexing the first row, so an
# empty dataframe simply prints nothing.
for col in df_test.columns.tolist():
    # More than one distinct Python type among the values means the column is mixed
    if df_test[col].map(type).nunique() > 1:
        print(col)

mix


### Change the mixed-type column to string data type

In [13]:
# Cast every value in the 'mix' column to a string so the column holds a single type
df_test['mix'] = df_test['mix'].astype('str')

In [14]:
# Confirm the conversion; the recorded output is dtype('O') (object), the dtype shown for the string column
df_test['mix'].dtype

dtype('O')

# Finding missing values 

In [15]:
# Count the missing (null) values in each column of the products dataframe
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

### View the missing values

In [16]:
# Subset of df_prods holding only the rows whose product_name is missing.
# isnull() already returns a boolean mask, so it selects the rows directly.
df_nan = df_prods.loc[df_prods['product_name'].isnull()]

In [18]:
# Display the product rows whose product_name is missing
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Create a new/clean dataframe that excludes the missing values

In [19]:
# Keep only the rows that have a product_name: ~ inverts the isnull() mask
df_prods_clean = df_prods.loc[~df_prods['product_name'].isnull()]
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [20]:
# Row and column counts of df_prods_clean, to check the result of removing rows with a missing product_name
df_prods_clean.shape

(49677, 5)

# Finding duplicates

In [21]:
# Collect the product rows that exactly repeat an earlier row in every column
# (keep='first' is the pandas default: the first occurrence is not flagged)
df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


# Dropping full duplicates

In [22]:
# Drop rows that are full duplicates of an earlier row and rebind df_prods_clean
# to the de-duplicated result (drop_duplicates returns a new dataframe)
df_prods_clean = df_prods_clean.drop_duplicates()
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


# Task 4.5 Consistency Checks on Orders dataframe

### 2 Run the df.describe() function on df_ords. Share whether anything about the data looks off or should be investigated further. 

In [23]:
# Summary statistics of the numeric columns of df_ords, used for the consistency check below
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### The order id and user id statistics are not meaningful on their own because these variables are identifiers (unique values) rather than measurements, so they are expected to look like arbitrary numbers. None of the columns contains negative numbers. The order dow (day of the week) statistics make sense because the min (0) and max (6) correspond with the key listed in the project brief (i.e. Saturday and Friday). The order hour of day values are all below 24: the min of 0 corresponds to 12 am (midnight) and the max of 23 corresponds to 11 pm. The days since prior order statistics also make sense (a value of 0 occurs when a customer places another order on the same day). There do not seem to be any outliers or extreme values that distort the averages of each column. The one item to investigate further is that the count for days since prior order (3,214,874) is lower than the count for the other columns (3,421,083), which indicates missing values.

### 3 Check for mixed-type data in df_ords.

In [24]:
# Same mixed-type check as in the df_test example, applied to the orders dataframe:
# print the name of every column whose values are not all of one Python type.
# Series.map(type) is used instead of DataFrame.applymap(type), which is deprecated
# since pandas 2.1; counting distinct types also avoids indexing the first row, so an
# empty dataframe simply prints nothing.
for col in df_ords.columns.tolist():
    # More than one distinct Python type among the values means the column is mixed
    if df_ords[col].map(type).nunique() > 1:
        print(col)

#### After running the above code on the orders dataframe, there was no output / no columns were printed. This means that there are no mixed data types in the columns for this dataframe and that each column contains consistent data types. 

In [25]:
# Show the data type pandas assigned to each column of df_ords
df_ords.dtypes

order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

### 5 Run a check for missing values in df_ords. Report your findings and propose an explanation for any missing values found. 

In [26]:
# Count the missing (null) values in each column of the orders dataframe
df_ords.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [27]:
# Rows of the orders dataframe where days_since_prior_order is missing.
# isnull() already returns a boolean mask, so it selects the rows directly.
df_nan = df_ords.loc[df_ords['days_since_prior_order'].isnull()]
df_nan

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


In [28]:
# Total number of rows and columns in df_ords, for comparison with the number of missing values
df_ords.shape

(3421083, 7)

#### The "days since prior order" column has 206209 missing values (NaN). This column describes the number of days between the current order and the customer's previous order. If a customer places two orders on the same day, then the expected value is 0. Every row with a missing value has the same "order number" (1), and the "user id" values of these rows appear in numerical order. 

In [29]:
# Inspect all orders of one customer (user_id 3) to see which order carries the missing value
df_ords[df_ords['user_id'] == 3]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
26,1374495,3,prior,1,1,14,
27,444309,3,prior,2,3,19,9.0
28,3002854,3,prior,3,3,16,21.0
29,2037211,3,prior,4,2,18,20.0
30,2710558,3,prior,5,0,17,12.0
31,1972919,3,prior,6,0,16,7.0
32,1839752,3,prior,7,0,15,7.0
33,3225766,3,prior,8,0,17,7.0
34,3160850,3,prior,9,0,16,7.0
35,676467,3,prior,10,3,16,17.0


In [31]:
# Inspect a second customer (user_id 11) to check whether the same pattern holds
df_ords[df_ords['user_id'] == 11]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
89,2411567,11,prior,1,0,11,
90,1170872,11,prior,2,5,11,12.0
91,2840752,11,prior,3,5,10,14.0
92,1611810,11,prior,4,5,13,30.0
93,2921164,11,prior,5,5,11,30.0
94,790927,11,prior,6,5,11,7.0
95,1468214,11,prior,7,5,9,30.0
96,1376945,11,test,8,6,11,8.0


#### The first order number for each user id will have a missing value in the days since prior order column because there are no orders that precede the very first order. We should expect to see a "NaN" value for the first order for each user id. 

### 6 Solution: Impute a "0" to represent the first order (that there were zero days since the previous order)

In [32]:
#Impute "0" for the days_since_prior_order column for the first order_number of each customer
#To represent there were zero days since prior order
# The filled column is assigned back rather than calling fillna(0, inplace=True) on
# df_ords['days_since_prior_order']: an in-place call on a selected column is chained
# assignment, which raises a FutureWarning in pandas 2.x and leaves df_ords unchanged
# once Copy-on-Write is in effect.
# NOTE: after this step a 0 can mean either "first order" or "another order on the same day".
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)
# Display one customer's orders to confirm the first order now shows 0.0
df_ords[df_ords['user_id']==1]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,0.0
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
5,3367565,1,prior,6,2,7,19.0
6,550135,1,prior,7,1,9,20.0
7,3108588,1,prior,8,1,14,14.0
8,2295261,1,prior,9,1,16,0.0
9,2550362,1,prior,10,4,8,30.0


In [33]:
# Recompute the summary statistics now that the missing values have been filled with 0
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,10.44488
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.308727
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


### 7 Run a check for duplicate values in df_ords

In [34]:
# Collect the order rows that exactly repeat an earlier row in every column
# (keep='first' is the pandas default: the first occurrence is not flagged)
df_dups = df_ords.loc[df_ords.duplicated(keep='first')]

In [35]:
# Display the duplicate rows; an empty result means no full duplicates were found
df_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


#### There are no full duplicates in the orders dataframe. 

### Rename df_ords to a "clean" version name

In [38]:
# Bind a second name to the same DataFrame object. No copy is made, so an in-place
# change made through either name is visible through both.
df_ords_full_clean = df_ords

In [39]:
# Bind a second name to the same DataFrame object as df_prods_clean (no copy is made)
df_prods_full_clean = df_prods_clean

# Export dataframes 

In [40]:
#Export the CLEAN orders dataframe
# NOTE(review): to_csv writes the row index as an extra unnamed first column by default;
# pass index=False if downstream notebooks should not receive it — confirm before changing.
# NOTE(review): the file name uses 'order_' while the input file was 'orders_wrangled.csv' — confirm this is intended.
df_ords_full_clean.to_csv(os.path.join(path,'02 Data', 'Prepared Data', 'order_full_clean.csv'))

In [41]:
#Export the CLEAN products dataframe
# NOTE(review): to_csv writes the row index as an extra unnamed first column by default;
# pass index=False if downstream notebooks should not receive it — confirm before changing.
df_prods_full_clean.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_full_clean.csv'))