# TOC
1. Preparation & Data import
2. Exercise
3. View descriptive stats
4. Removing unnecessary column
5. Checking for mixed type columns
6. Finding missing values and analyzing them
7. Checking for duplicates
8. Exporting dataframes

### Prep

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Importing data
path = r'C:\Users\Ryzen RGB Madness!!!\Instacart Basket Analysis'

In [3]:
path

'C:\\Users\\Ryzen RGB Madness!!!\\Instacart Basket Analysis'

In [4]:
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original', 'products.csv'), index_col = False)

In [5]:
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Cleaned', 'old', 'orders_wrangled.csv'), index_col = False)

### Exercise

In [6]:
# Create a dataframe
df_test =  pd.DataFrame()

In [7]:
# Create a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [8]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [9]:
# Finding missing values
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [10]:
# Isolating null values
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [11]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [12]:
# Creating df_prods with no null values
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [13]:
df_prods.shape

(49693, 5)

In [14]:
df_prods_clean.shape

(49677, 5)

In [15]:
# Searching for duplicates
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [16]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [17]:
df_prods_clean_no_dupes = df_prods_clean.drop_duplicates()

In [18]:
df_prods_clean_no_dupes.shape

(49672, 5)

In [19]:
df_prods_clean_no_dupes.to_csv(os.path.join(path, '02 Data', 'Cleaned', 'products_checked.csv'))

### Task

#### Step 2

In [20]:
# Viewing descriptive statistics for df_ords

In [21]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,10.44488
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.308727
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [22]:
# Re-viewing df_ords due to new column appearing
df_ords

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,0.0
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,10,5,18,29.0
3421079,3421079,1854736,206209,11,4,10,30.0
3421080,3421080,626363,206209,12,1,12,18.0
3421081,3421081,2977660,206209,13,1,12,7.0


In [23]:
# Dropping generated repeated column Unnamed: 0 and re-importing
vars_list = ['order_id', 'user_id', 'order_number', 'order_day_of_week', 'order_hour_of_day', 'days_since_prior_order']

In [24]:
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Cleaned', 'old', 'orders_wrangled.csv'), usecols = vars_list)

In [25]:
df_ords

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,0.0
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [26]:
# Setting column types to exclude from descriptive statistics
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [27]:
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [28]:
df_ords.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0
mean,17.15486,2.776219,13.45202,10.44488
std,17.73316,2.046829,4.226088,9.308727
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [29]:
# Checking that zero is an applicable number for order_hour_of_day (ie, in place of midnight)
for col in df_ords:
  print(df_ords['order_hour_of_day'].unique())

[ 8  7 12 15  9 14 16 11 10 19 18 17 13 20  0 21 22  5 23  4  6  1  2  3]
[ 8  7 12 15  9 14 16 11 10 19 18 17 13 20  0 21 22  5 23  4  6  1  2  3]
[ 8  7 12 15  9 14 16 11 10 19 18 17 13 20  0 21 22  5 23  4  6  1  2  3]
[ 8  7 12 15  9 14 16 11 10 19 18 17 13 20  0 21 22  5 23  4  6  1  2  3]
[ 8  7 12 15  9 14 16 11 10 19 18 17 13 20  0 21 22  5 23  4  6  1  2  3]
[ 8  7 12 15  9 14 16 11 10 19 18 17 13 20  0 21 22  5 23  4  6  1  2  3]


I may just be reading the numbers incorrectly, but the max on days_since_prior_order only showing a 3 seems to be very very low. All of those numbers seem to be low, given that we looked at the days_since_prior_order for user_id 1 and they had double digit days. 

#### Step 3

In [30]:
# Checking for mixed-types columns
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

#### Step 5

In [31]:
# Checking for missing values within df_ords
df_ords.isnull().sum()

order_id                  0
user_id                   0
order_number              0
order_day_of_week         0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

days_since_prior_order has NaN for a number of values. My theory is that this is what generates on the first order for each customer. 

#### Step 6

In [32]:
# Testing my theory about missing values in days_since_prior_order
df_ords['user_id'].nunique()

206209

REVISED: Because in this case the null value works as a flag to show that this is the customer's first order, it serves a specific purpose and should not be revised out. Thus, the "missing values" in this case are not really "missing" values and do not need to be further addressed. 

#### Step 7

In [33]:
# Pulling duplicates into new dataframe
df_ords_dupes = df_ords[df_ords.duplicated()]

In [34]:
# Checking new duplicates dataframe
df_ords_dupes.shape

(0, 6)

#### Step 8

Pulling the duplicates out of df_ords yielded no full duplicates

#### Step 9

In [35]:
# Exporting final cleaned data
df_ords.to_csv(os.path.join(path, '02 Data', 'Cleaned', 'orders_cleaned.csv'))

In [36]:
df_prods_clean_no_dupes.to_csv(os.path.join(path, '02 Data', 'Cleaned', 'products_cleaned.csv'))