# Task

### 3. Importing Modules

In [1]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### Importing Data

In [2]:
# Locate the project folder and load both input tables.
# The folder defaults to the original author's machine path (a raw string, so the
# backslashes are kept literally), but it can be overridden without editing the
# notebook by setting the INSTACART_DATA_PATH environment variable.

data_path = os.environ.get(
    'INSTACART_DATA_PATH',
    r'C:\Users\gerar\CareerFoundry Python\Instacart Basket Analysis',
)
# customers.csv from the "Original Data" folder -> df_customers
df_customers = pd.read_csv(os.path.join(data_path, '02 Data', 'Original Data', 'customers.csv'))
# NOTE(review): unpickling can execute arbitrary code; only load a pickle that was
# produced by this project's own notebooks.
df_orders_products_merge = pd.read_pickle(os.path.join(data_path, '02 Data', 'Prepared Data', 'orders_products_merge_group.pkl'))

In [3]:
# report the (rows, columns) size of each loaded dataframe

for frame_label, frame in (('customers', df_customers),
                           ('orders-products merge', df_orders_products_merge)):
    print(f' These are the dimensions of the {frame_label} dataframe: {frame.shape}')

 These are the dimensions of the customers dataframe: (206209, 10)
 These are the dimensions of the orders-products merge dataframe: (32404859, 24)


In [4]:
# peek at the first row to see the original column headers

df_customers.iloc[:1]

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665


### 4. Data Wrangling

In [5]:
# Rename the customer columns to consistent snake_case names.
# The new names are applied by position, so the list must follow the column
# order of the loaded file exactly.

headers_customers = ['user_id', 'first_name', 'last_name', 'gender', 'state',
                     'age', 'date_joined', 'number_of_dependants', 'marital_status', 'income']

# Enforce the length check: as a bare expression in the middle of a cell its
# result was neither displayed nor acted upon.
assert len(headers_customers) == len(df_customers.columns), (
    f'expected {len(headers_customers)} columns, found {len(df_customers.columns)}'
)

df_customers.columns = headers_customers

In [6]:
# confirm the new headers by displaying the first row
df_customers.iloc[:1]

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665


### Summary: Data Wrangling
Only the column names were changed 

### 5. Data consistency

In [7]:
# summary statistics for every column, numeric and non-numeric alike
customers_summary = df_customers.describe(include='all')
customers_summary

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income
count,206209.0,194950,206209,206209,206209,206209.0,206209,206209.0,206209,206209.0
unique,,207,1000,2,51,,1187,,4,
top,,Marilyn,Hamilton,Male,Florida,,9/17/2018,,married,
freq,,2213,252,104067,4044,,213,,144906,
mean,103105.0,,,,,49.501646,,1.499823,,94632.852548
std,59527.555167,,,,,18.480962,,1.118433,,42473.786988
min,1.0,,,,,18.0,,0.0,,25903.0
25%,51553.0,,,,,33.0,,0.0,,59874.0
50%,103105.0,,,,,49.0,,1.0,,93547.0
75%,154657.0,,,,,66.0,,3.0,,124244.0


In [8]:
# list the dtype of each customer column
customer_dtypes = df_customers.dtypes
customer_dtypes

user_id                  int64
first_name              object
last_name               object
gender                  object
state                   object
age                      int64
date_joined             object
number_of_dependants     int64
marital_status          object
income                   int64
dtype: object

In [9]:
# user_id is an identifier, not a quantity: convert it to text
df_customers = df_customers.astype({'user_id': str})

In [10]:
# count the missing (null/NaN) entries in each column
missing_per_column = df_customers.isna().sum()
missing_per_column

user_id                     0
first_name              11259
last_name                   0
gender                      0
state                       0
age                         0
date_joined                 0
number_of_dependants        0
marital_status              0
income                      0
dtype: int64

In [11]:
# set aside the customers that have no first name, as a safeguard
has_no_first_name = df_customers['first_name'].isna()
df_customers_no_name = df_customers.loc[has_no_first_name]

In [12]:
# display any rows that are exact duplicates of an earlier row
is_duplicate_row = df_customers.duplicated()
df_customers.loc[is_duplicate_row]

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income


In [13]:
# print the name of every column that holds more than one Python type
# (each value's type is compared with the type of the column's first value)
for col in df_customers.columns:
    value_types = df_customers[col].map(type)
    first_type = value_types.iloc[0]
    if any(value_type is not first_type for value_type in value_types):
        print(col)

first_name


In [14]:
# to see whether the NaN values (or something else) cause the mixed types in
# first_name, keep only the customers that do have a first name
df_customers_with_names = df_customers.dropna(subset=['first_name'])

In [15]:
# repeat the mixed-type check on the customers that have a first name:
# print every column that still holds more than one Python type
for col in df_customers_with_names.columns:
    value_types = df_customers_with_names[col].map(type)
    first_type = value_types.iloc[0]
    if any(value_type is not first_type for value_type in value_types):
        print(col)

### Summary: Data Consistency


- `user_id` is now stored as an object (string) type.

- NaN values were found in `first_name` (11,259 rows); further advice is needed on how to treat them.

- No duplicate rows were found.

- The mixed data types in `first_name` are caused by the NaN entries.


### 6. Data combination

In [25]:
# remove the cap on displayed columns so wide dataframes are shown in full
pd.set_option('display.max_columns', None)

In [26]:
# check the headers of both dataframes, starting with the customers
df_customers.iloc[:1]

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665


In [29]:
# first row of the orders-products dataframe, to see its headers
df_orders_products_merge.iloc[:1]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,customer_order_count,order_day_of_the_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_days_new,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_prior_days,frequency_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid_range_price,Regularly busy,Regularly busy,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer


In [33]:
# Drop the "_merge" indicator column that came with the loaded dataframe, so the
# merge below can add its own indicator column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df_orders_products_merge = df_orders_products_merge.drop(columns='_merge', errors='ignore')

In [18]:
# the join key must have the same dtype in both dataframes: orders side first
orders_key_dtype = df_orders_products_merge['user_id'].dtype
print(orders_key_dtype)

int64


In [19]:
# dtype of the join key on the customers side
customers_key_dtype = df_customers['user_id'].dtype
print(customers_key_dtype)

object


In [20]:
# convert the orders-side user_id to text so it matches the customers-side key
user_id_as_text = df_orders_products_merge['user_id'].astype(str)
df_orders_products_merge['user_id'] = user_id_as_text

# confirm the new dtype
print(df_orders_products_merge['user_id'].dtype)

object


In [34]:
# Merge the customer attributes onto the order-product rows via user_id.
# validate='one_to_many' makes pandas raise MergeError if a user_id occurs more
# than once in df_customers, which would otherwise silently duplicate order rows.
df_orders_products_all = pd.merge(df_customers, df_orders_products_merge, on='user_id',
                                  how='outer', indicator=True, validate='one_to_many')

# Inspect the indicator instead of leaving it unchecked: with an outer join,
# any 'left_only' / 'right_only' rows are keys that found no partner.
df_orders_products_all['_merge'].value_counts()

### Summary: Data Combination

In [36]:
# dimensions (rows, columns) of the merged dataframe
merged_shape = df_orders_products_all.shape
merged_shape

(32404859, 33)

### Data export

In [37]:
# write the merged dataframe to the "Prepared Data" folder as a pickle
export_path = os.path.join(data_path, '02 Data', 'Prepared Data', 'orders_products_all.pkl')
df_orders_products_all.to_pickle(export_path)

In [39]:
# row and column counts of the dataframe that was just exported
(len(df_orders_products_all.index), len(df_orders_products_all.columns))

(32404859, 33)