# Data Checks on: orders.csv

#### Contents. 
- Missing Data Check
- Duplicate Check
- Mixed-type Data Check
- Other inconsistency checks, if needed
- Summary of changes to the original dataframe

## Importing libraries and data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Load the orders dataset from the project's original-data folder.
path = r'C:\Users\chris\Documents\Instacart Basket Analysis'
orders_csv = os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
df_ords = pd.read_csv(orders_csv)

## Viewing the dataframe

In [3]:
# Dimensions of the dataframe as (rows, columns):
df_ords.shape

(3421083, 7)

In [4]:
# Column labels of the dataframe:
df_ords.columns

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [5]:
# Display the dataframe itself to inspect what its rows look like:
df_ords

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [6]:
# Data type of each column:
df_ords.dtypes

order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [7]:
# Descriptive statistics of the numeric columns, rounded to one decimal:
df_ords.describe().round(1)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.2,2.8,13.5,11.1
std,987581.7,59533.7,17.7,2.0,4.2,9.2
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


##### checking on the column 'eval_set' more closely:

In [8]:
# Which values occur in 'eval_set' and how often
# (dropna = False: missing values would be listed as well):
df_ords['eval_set'].value_counts(dropna = False)

prior    3214874
train     131209
test       75000
Name: eval_set, dtype: int64

In [9]:
# Count every combination of 'days_since_prior_order' and 'eval_set'
# (missing values included) to see whether the two columns make sense together:
df_ords[['days_since_prior_order', 'eval_set']].value_counts(dropna=False)

days_since_prior_order  eval_set
7.0                     prior       306181
30.0                    prior       306137
6.0                     prior       230245
4.0                     prior       214488
3.0                     prior       210665
                                     ...  
23.0                    test           782
29.0                    test           773
24.0                    test           704
26.0                    test           678
25.0                    test           670
Length: 94, dtype: int64

### Missing Data Check

In [10]:
# Number of missing values in each column:
df_ords.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

##### 206,209 cases of missing data - these need to be examined.

In [11]:
# Create a subset containing only the rows whose 'days_since_prior_order'
# value is missing. isnull() already returns a boolean mask, so it is used
# directly as the row filter.
df_nan = df_ords[df_ords['days_since_prior_order'].isnull()]
df_nan

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


In [12]:
# Which order numbers do the rows with missing values belong to?
# (they all seem to be a user's first order)
df_nan['order_number'].value_counts(dropna=False)

1    206209
Name: order_number, dtype: int64

#### Checking further details with a crosstab:

In [23]:
# Cross-tabulate 'days_since_prior_order' (rows, first argument) against
# 'order_number' (columns, second argument) and store the result in `crosstab`:
crosstab = pd.crosstab(df_ords['days_since_prior_order'], df_ords['order_number'], dropna = False)

In [25]:
# Copy the crosstab to the system clipboard so it can be pasted into an Excel spreadsheet:
crosstab.to_clipboard()

In [26]:
# Alternatively, display the crosstab directly in the notebook:
crosstab

order_number,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
days_since_prior_order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0,2899,2915,2964,2634,2443,2236,2101,1972,1782,...,156,154,125,151,151,153,120,144,134,137
1.0,0,4822,4613,4690,4423,4047,4000,3744,3503,3358,...,397,396,373,359,366,330,356,360,339,321
2.0,0,6203,6209,6231,5789,5679,5274,5090,4914,4705,...,416,405,407,389,359,351,359,346,318,329
3.0,0,7411,7577,7414,7097,6770,6424,6078,5750,5663,...,345,287,330,294,291,292,254,221,240,223
4.0,0,8415,8476,8469,8029,7608,7092,6735,6534,6395,...,221,238,225,191,193,185,183,170,170,153
5.0,0,9216,9329,9220,8600,8228,7703,7270,6828,6563,...,142,128,105,119,119,120,100,95,82,81
6.0,0,11419,11701,11601,10771,10054,9521,8842,8527,7727,...,77,91,86,82,67,59,58,50,58,44
7.0,0,16788,16609,16341,15135,13834,13268,12452,11612,10980,...,60,56,48,56,36,37,37,41,32,35
8.0,0,10422,10304,10148,9378,8738,8068,7594,6894,6511,...,29,22,24,26,23,30,18,16,12,13
9.0,0,7053,7160,7013,6610,5993,5497,5082,4760,4404,...,12,8,14,12,8,6,7,8,7,11


##### Conclusion: the missing data is logically correct and thus not really 'missing': a user's first order has no prior order, so there can be no number of days since a prior order.

### Duplicated Data Check

In [13]:
# Collect the rows that are flagged as duplicates and show them
# (an empty result means there are none).
dup_mask = df_ords.duplicated()
df_dups = df_ords.loc[dup_mask]
df_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


##### No duplicates.

### Mixed-typed Data Check

In [14]:
# Checking for mixed-type data: print the name of every column in which at
# least one value has a different Python type than the column's first value.
# Series.map is used instead of the deprecated DataFrame.applymap, and the
# comparison is done on the column alone rather than by re-indexing the frame.
for col in df_ords.columns:
    value_types = df_ords[col].map(type)
    # An empty column has no first value to compare against, so skip it.
    if value_types.empty:
        continue
    if (value_types != value_types.iloc[0]).any():
        print(col)

##### No mixed-typed data

## Summary: What needs to be addressed:

##### 1) column 'eval_set' is superfluous. It can be deleted as it only enlarges the size of the dataframe.
##### 2) column 'order_dow' can be renamed to 'order_day_of_week' for better understanding.
##### 3) column 'days_since_prior_order' can be renamed to 'days_since_last_order' for better understanding.
##### 4) column 'order_id' and 'user_id' can be changed into string variables.

## Changing and Exporting the dataframe >> orders_wrangled.pkl

In [15]:
# Remove the 'eval_set' column; the result is stored in df_ords_clean,
# while df_ords itself is left unchanged.
df_ords_clean = df_ords.drop(columns=['eval_set'])

In [16]:
# Give the two abbreviated columns more descriptive names in a single call.
new_names = {
    'order_dow': 'order_day_of_week',
    'days_since_prior_order': 'days_since_last_order',
}
df_ords_clean.rename(columns=new_names, inplace=True)

In [17]:
# Check the changes on the first rows of the cleaned dataframe:
df_ords_clean.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [18]:
# Convert 'order_id' and 'user_id' to string variables. The values are read
# from df_ords_clean itself, so the conversion does not depend on df_ords
# still being aligned with the frame that is being cleaned.
id_cols = ['order_id', 'user_id']
df_ords_clean[id_cols] = df_ords_clean[id_cols].astype('str')

In [19]:
# Check the changes: data type of each column after the conversion:
df_ords_clean.dtypes

order_id                  object
user_id                   object
order_number               int64
order_day_of_week          int64
order_hour_of_day          int64
days_since_last_order    float64
dtype: object

In [20]:
# Dimensions of the cleaned dataframe as (rows, columns):
df_ords_clean.shape

(3421083, 6)

In [21]:
# Export the cleaned dataframe as orders_wrangled.pkl into the prepared-data folder.
export_path = os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.pkl')
df_ords_clean.to_pickle(export_path)