# 4.9 Intro to Data Visualization Part 1

## Instacart Grocery Basket Analysis

#### -Errol Hinkamp

##### Table of Contents

1. Import libraries
2. Import data
3. Wrangle and check data
4. Merge data
5. Export data

# 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 2. Import data

In [2]:
# Import dataframes
path=r'C:\Users\Errol\Documents\Data Analyst Work\Achievement 4\Instacart Basket Analysis'
customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))
ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_4.8.pkl'))

# 3. Wrangle and check data

In [3]:
# Disable row limits
pd.options.display.max_rows = None

In [4]:
# Quick visual check
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# Get dataframe shape
customers.shape

(206209, 10)

In [6]:
# Get column data
customers.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [7]:
# Rename "Surnam" column
customers.rename(columns = {'Surnam' : 'Last Name'}, inplace = True)

In [8]:
# Get descriptive statistics
customers.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [9]:
ords_prods_merged.dtypes

order_id                   int64
user_id                    int64
order_number               int64
orders_day_of_week         int64
order_hour_of_day          int64
days_since_last_order    float64
product_id                 int64
add_to_cart_order          int64
reordered                  int64
product_name              object
aisle_id                   int64
department_id              int64
prices                   float64
price_range_loc           object
busiest_days              object
busiest_period_of_day     object
max_order                  int64
loyalty_flag              object
avg_spent                float64
spending_flag             object
order_frequency          float64
frequency_flag            object
dtype: object

In [10]:
# Check for mixed-columns
for col in customers.columns.tolist():
   weird = (customers[[col]].applymap(type) != customers[[col]].iloc[0].apply(type)).any(axis = 1)
   if len (customers[weird]) > 0:
     print (col)

First Name


In [11]:
# Set "First Name" column to string
customers['First Name'] = customers['First Name'].astype('str')

In [12]:
# Search for missing values
customers.isnull().sum()

user_id         0
First Name      0
Last Name       0
Gender          0
STATE           0
Age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [13]:
# Search for duplicates
customers_dup = customers[customers.duplicated()]
customers_dup

Unnamed: 0,user_id,First Name,Last Name,Gender,STATE,Age,date_joined,n_dependants,fam_status,income


In [14]:
# Value count of "First Name"
customers['First Name'].value_counts(dropna = False)

nan            11259
Marilyn         2213
Barbara         2154
Todd            2113
Jeremy          2104
Cynthia         1951
Rose            1880
Kathy           1863
Steven          1844
Sarah           1840
Irene           1823
Andrea          1698
Justin          1684
Bobby           1664
Clarence        1663
Harry           1638
Alice           1629
Ruby            1622
Julie           1607
Gloria          1590
Carl            1565
Shawn           1500
Gregory         1486
Brandon         1485
Frank           1482
Thomas          1480
Marie           1467
Robin           1463
Scott           1459
Robert          1450
Russell         1448
Linda           1433
Michael         1427
Peter           1417
Harold          1409
Ruth            1399
Jerry           1393
James           1367
Brenda          1306
Stephen         1300
Ralph           1290
Henry           1272
Mary            1263
Kenneth         1262
Victor          1260
Lois            1253
Deborah         1251
Bonnie       

##### No values were blank, but there are an awful lot that were filled in as "nan." I won't take any action, as it is extremely unlikely that any further analysis will require first names in order to be relevant.

In [15]:
# Value count of "Last Name"
customers['Last Name'].value_counts(dropna = False)

Hamilton       252
Randall        248
Pennington     243
Lamb           243
Barnett        242
Ward           241
Walton         241
Kline          239
Berry          239
Berg           239
Mccoy          238
Buchanan       237
Morris         237
Jackson        236
Mitchell       236
Dickson        236
Fischer        236
Medrano        236
Johnson        235
Bass           235
Barber         235
Quinn          234
Trejo          234
Golden         234
Maynard        234
Browning       234
Cochran        233
Hubbard        233
Ware           233
Mcdaniel       233
Hill           232
Marin          232
Le             232
Gonzalez       232
Waters         232
Williamson     232
Duarte         232
Tang           232
Bean           231
Marquez        231
Robbins        231
Wise           231
May            231
Bradley        231
Koch           231
Christensen    231
Carlson        230
Hurley         230
Rodriguez      230
Garcia         229
Conway         229
Wilcox         229
Kirby       

In [16]:
# Value count of "Gender"
customers['Gender'].value_counts(dropna = False)

Male      104067
Female    102142
Name: Gender, dtype: int64

In [17]:
# Value count of "STATE"
customers['STATE'].value_counts(dropna = False)

Delaware                4044
Connecticut             4044
Arkansas                4044
Indiana                 4044
Alabama                 4044
Alaska                  4044
Hawaii                  4044
Iowa                    4044
Colorado                4044
Arizona                 4044
Georgia                 4044
California              4044
District of Columbia    4044
Idaho                   4044
Florida                 4044
Illinois                4044
Washington              4043
Utah                    4043
New York                4043
Rhode Island            4043
Wyoming                 4043
West Virginia           4043
New Mexico              4043
Oklahoma                4043
Vermont                 4043
South Carolina          4043
New Hampshire           4043
Louisiana               4043
Wisconsin               4043
North Carolina          4043
Maine                   4043
Nebraska                4043
Nevada                  4043
North Dakota            4043
Tennessee     

In [18]:
# Value count of "date_joined"
customers['date_joined'].value_counts(dropna = False)

9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
12/24/2019    209
11/19/2019    205
10/11/2017    205
1/7/2018      205
1/25/2019     205
12/6/2019     204
9/30/2019     203
3/26/2018     203
3/31/2019     203
10/18/2019    203
7/28/2018     203
3/12/2020     202
10/20/2018    202
10/17/2019    202
11/21/2018    202
3/23/2018     202
3/2/2020      202
11/6/2018     202
10/6/2018     202
3/8/2019      201
10/18/2017    201
2/1/2017      201
12/28/2018    201
5/4/2018      201
11/14/2017    200
5/11/2018     200
1/19/2017     200
7/23/2018     200
8/10/2017     199
4/13/2019     199
4/7/2018      199
1/23/2018     199
10/23/2018    198
12/25/2017    198
9/28/2019     198
3/10/2017     198
7/20/2017     198
5/13/2019     198
7/7/2018      198
9/12/2017     198
3/28/2020     197
7/21/2018     197
11/12/2018    197
8/4/2017      197
2/1/2018      197
8/13/2018     197
5/18/2019     197
1/12/2017     197
12/18/2018    196
6/4/2019      196
9/30/2017 

In [19]:
# Value count of "fam_status"
customers['fam_status'].value_counts(dropna = False)

married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: fam_status, dtype: int64

In [20]:
# Confirm shape of dataset
customers.shape

(206209, 10)

# 4. Merge data

In [21]:
# Quick visual check of "customers"
customers.head()

Unnamed: 0,user_id,First Name,Last Name,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [22]:
# Quick visual check of ords_prods_merged
ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_spent,spending_flag,order_frequency,frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,...,9.0,Mid-range product,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,9.0,Mid-range product,Slowest days,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [23]:
# Drop unnecessary columns to save memory and make merging easier
ords_prods_merged=ords_prods_merged.drop(columns=['avg_spent', 'order_frequency'])

##### My computer kept stalling and then crashing when trying to merge in the way we've been taught so far. My tutor came up with the following alternative method.

In [24]:
customers.drop_duplicates(subset ='user_id', keep = False, inplace = True) 
ords_prods_merged.drop_duplicates( keep = False, inplace = True) 

In [25]:
# creating a empty bucket to save result
df_result = pd.DataFrame(columns=(ords_prods_merged.columns.append(customers.columns)).unique())
df_result.to_csv("df3.csv",index_label=False)

# save data which only appear in df1 # sorry I was doing left join here. no need to run below two line.
# df_result = df1[df1.Colname1.isin(df2.Colname2)!=True]
# df_result.to_csv("df3.csv",index_label=False, mode="a")

# deleting df2 to save memory
del(customers)

def preprocess(x):
    df2=pd.merge(ords_prods_merged,x, left_on = "user_id", right_on = "user_id")
    df2.to_csv("df3.csv",mode="a",header=False,index=False)

reader = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), chunksize=1000) # chunksize depends with you colsize

[preprocess(r) for r in reader]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [26]:
# Load merged dataset
bigmerged = pd.read_csv(os.path.join(path, '03 Scripts', 'df3.csv'))

In [27]:
# Quick visual check of merged dataset
bigmerged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,product_name,...,frequency_flag,First Name,Last Name,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,3144955,10331,4,5,11,30.0,196,3,0,Soda,...,Regular customer,Marilyn,Olsen,Female,North Dakota,24,1/6/2017,3,married,73639
1,2300470,10331,1,2,14,,21386,2,0,Smartwater,...,Regular customer,Marilyn,Olsen,Female,North Dakota,24,1/6/2017,3,married,73639
2,1922742,10331,2,5,11,10.0,21386,2,1,Smartwater,...,Regular customer,Marilyn,Olsen,Female,North Dakota,24,1/6/2017,3,married,73639
3,2300470,10331,1,2,14,,42500,1,0,Orange & Lemon Flavor Variety Pack Sparkling F...,...,Regular customer,Marilyn,Olsen,Female,North Dakota,24,1/6/2017,3,married,73639
4,1922742,10331,2,5,11,10.0,42500,3,1,Orange & Lemon Flavor Variety Pack Sparkling F...,...,Regular customer,Marilyn,Olsen,Female,North Dakota,24,1/6/2017,3,married,73639


# 5. Export data

In [28]:
# Export dataframe
bigmerged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'bigmerged_4.9.1.pkl'))