# 4.10.1 Coding Etiquette and Excel Reporting

## This script contains the following points:

### 01. Importing Libraries
### 02. Creating Dataframes
### 03. Task 2: Security Implications
### 04. Task 3: Creating Region Column
### 05. Task 4: Low-activity Exclusion
### 06. Task 5: Customer Profiling

## 01. Importing Libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

## 02. Creating Dataframes

In [2]:
# Project root folder; every file location below is built relative to it
path = r'C:\Users\widne\Documents\CareerFoundry Exercises\Data_Immersion\Achievement 4\03-2024 Instacart Basket Analysis'

# Loading the dataframe that contains all data from the prepared-data folder
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of the notebook
all_data_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl')
all = pd.read_pickle(all_data_file)

In [3]:
# Previewing the first five rows of the full dataframe
all.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,average_spend,spender_flag,median_frequency,frequency_flag,state,age,number_of_dependents,fam_status,income,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,6.367797,Low spender,20.5,Non-frequent customer,Alabama,31,3,married,40423,both
1,2398795,1,2,3,7,15.0,False,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Alabama,31,3,married,40423,both
2,473747,1,3,3,12,21.0,False,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Alabama,31,3,married,40423,both
3,2254736,1,4,4,7,29.0,False,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Alabama,31,3,married,40423,both
4,431534,1,5,4,15,28.0,False,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Alabama,31,3,married,40423,both


## 03. Task 2: Security Implications

The original customers data had the first and last names of customers which is personally identifiable information (PII). In a working environment, I would consult with senior members or someone who works in data security for guidance on how to proceed with the data. For this specific exercise, I dropped the first and last name columns, because they are not necessary for analysis.

## 04. Task 3: Creating Region Column

In [6]:
# Flagging every row whose state belongs to the Northeast region
northeast_states = [
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island',
    'Connecticut', 'New York', 'Pennsylvania', 'New Jersey',
]
all.loc[all['state'].isin(northeast_states), 'region'] = 'Northeast'

In [7]:
# Flagging every row whose state belongs to the Midwest region
midwest_states = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota',
    'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri',
]
all.loc[all['state'].isin(midwest_states), 'region'] = 'Midwest'

In [8]:
# Flagging every row whose state belongs to the South region
south_states = [
    'Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia',
    'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky',
    'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas',
    'Louisiana',
]
all.loc[all['state'].isin(south_states), 'region'] = 'South'

In [9]:
# Flagging every row whose state belongs to the West region
west_states = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona',
    'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii',
]
all.loc[all['state'].isin(west_states), 'region'] = 'West'

In [10]:
# Counting rows per region; dropna=False keeps any unassigned (NaN) rows visible
region_counts = all['region'].value_counts(dropna=False)
region_counts

region
South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: count, dtype: int64

In [11]:
# Cross-tabulating region (rows) against spender flag (columns)
crosstab = pd.crosstab(index=all['region'], columns=all['spender_flag'], dropna=False)

In [12]:
# Copying the crosstab to the system clipboard in Excel-pasteable form
crosstab.to_clipboard(excel=True)

## 05. Task 4: Low-activity Exclusion

In [13]:
# Marking customers with a max_order of 5 or more as not low-activity
all.loc[all['max_order'].ge(5), 'low_activity'] = 'no'

In [14]:
# Marking customers with a max_order below 5 as low-activity
all.loc[all['max_order'].lt(5), 'low_activity'] = 'yes'

In [15]:
# Counting rows per low_activity value; dropna=False keeps unflagged rows visible
low_activity_counts = all['low_activity'].value_counts(dropna=False)
low_activity_counts

low_activity
no     30964564
yes     1440295
Name: count, dtype: int64

In [16]:
# Creating all_excluded dataframe that excludes all low activity customers.
# An explicit .copy() makes all_excluded an independent dataframe, so the new
# columns added to it later via .loc do not raise SettingWithCopyWarning.
# NOTE(review): the extra copy briefly increases peak memory on this large frame.
all_excluded = all[all['low_activity'] == 'no'].copy()

In [17]:
# Previewing the first five rows of the filtered dataframe
all_excluded.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,median_frequency,frequency_flag,state,age,number_of_dependents,fam_status,income,_merge,region,low_activity
0,2539329,1,1,2,8,,True,196,1,0,...,20.5,Non-frequent customer,Alabama,31,3,married,40423,both,South,no
1,2398795,1,2,3,7,15.0,False,196,1,1,...,20.5,Non-frequent customer,Alabama,31,3,married,40423,both,South,no
2,473747,1,3,3,12,21.0,False,196,1,1,...,20.5,Non-frequent customer,Alabama,31,3,married,40423,both,South,no
3,2254736,1,4,4,7,29.0,False,196,1,1,...,20.5,Non-frequent customer,Alabama,31,3,married,40423,both,South,no
4,431534,1,5,4,15,28.0,False,196,1,1,...,20.5,Non-frequent customer,Alabama,31,3,married,40423,both,South,no


In [18]:
# Checking the (rows, columns) dimensions of the filtered dataframe
all_excluded.shape

(30964564, 32)

In [19]:
# Saving the low-activity-excluded dataframe as a pickle in the prepared-data folder
excluded_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all_excluded.pkl')
all_excluded.to_pickle(excluded_file)

## 06. Task 5: Customer Profiling

In [5]:
# Loading the prepared departments file to check the department names
dept_file = os.path.join(path, '02 Data', 'Prepared Data', 'departments_prepared.csv')
dept = pd.read_csv(dept_file, index_col=False)

In [6]:
# Displaying the departments dataframe
# NOTE(review): the 'Unnamed: 0.1' / 'Unnamed: 0' columns look like index columns
# written by earlier exports - confirm and drop if not needed
dept

Unnamed: 0.1,Unnamed: 0,department_id,department
0,0,1,frozen
1,1,2,other
2,2,3,bakery
3,3,4,produce
4,4,5,alcohol
5,5,6,international
6,6,7,beverages
7,7,8,pets
8,8,9,dry goods pasta
9,9,10,bulk


In [None]:
# Profiling customers by age

In [4]:
# Summary statistics of customer age, used to choose the age_group cut-offs
age_summary = all_excluded['age'].describe()
age_summary

count    3.096456e+07
mean     4.946803e+01
std      1.848528e+01
min      1.800000e+01
25%      3.300000e+01
50%      4.900000e+01
75%      6.500000e+01
max      8.100000e+01
Name: age, dtype: float64

The minimum age is 18 and the maximum age is 81. Creating 3 values for the age_group flag (18-29 is young adult, 30-59 is middle age, and 60+ is senior).

In [7]:
# Customers younger than 30 are labelled 'Young adult'
all_excluded.loc[all_excluded['age'].lt(30), 'age_group'] = 'Young adult'

In [8]:
# Customers aged 30 up to (but not including) 60 are labelled 'Middle age'
all_excluded.loc[
    all_excluded['age'].ge(30) & all_excluded['age'].lt(60),
    'age_group',
] = 'Middle age'

In [9]:
# Customers aged 60 or older are labelled 'Senior'
all_excluded.loc[all_excluded['age'].ge(60), 'age_group'] = 'Senior'

In [10]:
# Counting rows per age_group; dropna=False keeps unassigned rows visible
age_group_counts = all_excluded['age_group'].value_counts(dropna=False)
age_group_counts

age_group
Middle age     14572457
Senior         10574504
Young adult     5817603
Name: count, dtype: int64

Profiling customers by income

In [9]:
# Summary statistics of customer income, used to choose the income_group cut-offs
income_summary = all_excluded['income'].describe()
income_summary

count    3.096456e+07
mean     9.967587e+04
std      4.314187e+04
min      2.590300e+04
25%      6.729200e+04
50%      9.676500e+04
75%      1.281020e+05
max      5.939010e+05
Name: income, dtype: float64

Creating 3 different values for the income_group flag (< 75,000 is 'low income', 75,000-119,999 is 'mid income', and >= 120,000 is 'high income').

In [12]:
# Incomes below 75,000 are labelled 'low income'
all_excluded.loc[all_excluded['income'].lt(75000), 'income_group'] = 'low income'

In [13]:
# Incomes from 75,000 up to (but not including) 120,000 are labelled 'mid income'
all_excluded.loc[
    all_excluded['income'].ge(75000) & all_excluded['income'].lt(120000),
    'income_group',
] = 'mid income'

In [14]:
# Incomes of 120,000 or more are labelled 'high income'
all_excluded.loc[all_excluded['income'].ge(120000), 'income_group'] = 'high income'

In [15]:
# Counting rows per income_group; dropna=False keeps unassigned rows visible
income_group_counts = all_excluded['income_group'].value_counts(dropna=False)
income_group_counts

income_group
mid income     11878101
low income      9906680
high income     9179783
Name: count, dtype: int64

In [95]:
# Profiling customers by combination of age, family status, and number of children
# Single young adult w/o kids: 'Single' means any fam_status other than 'married'
all_excluded.loc[
    all_excluded['age_group'].eq('Young adult')
    & all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].eq(0),
    'profile_age',
] = 'Single young adult w/o kids'

In [96]:
# Single young adult w/ kids: not married, at least one dependent
all_excluded.loc[
    all_excluded['age_group'].eq('Young adult')
    & all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].gt(0),
    'profile_age',
] = 'Single young adult w/ kids'

In [97]:
# Married young adult w/o kids: married, zero dependents
all_excluded.loc[
    all_excluded['age_group'].eq('Young adult')
    & all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].eq(0),
    'profile_age',
] = 'Married young adult w/o kids'

In [98]:
# Married young adult w/ kids: married, at least one dependent
all_excluded.loc[
    all_excluded['age_group'].eq('Young adult')
    & all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].gt(0),
    'profile_age',
] = 'Married young adult w/ kids'

In [99]:
# Single middle age w/o kids: not married, zero dependents
all_excluded.loc[
    all_excluded['age_group'].eq('Middle age')
    & all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].eq(0),
    'profile_age',
] = 'Single middle age w/o kids'

In [100]:
# Single middle age w/ kids: not married, at least one dependent
all_excluded.loc[
    all_excluded['age_group'].eq('Middle age')
    & all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].gt(0),
    'profile_age',
] = 'Single middle age w/ kids'

In [101]:
# Married middle age w/o kids: married, zero dependents
all_excluded.loc[
    all_excluded['age_group'].eq('Middle age')
    & all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].eq(0),
    'profile_age',
] = 'Married middle age w/o kids'

In [102]:
# Married middle age w/ kids: married, at least one dependent
all_excluded.loc[
    all_excluded['age_group'].eq('Middle age')
    & all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].gt(0),
    'profile_age',
] = 'Married middle age w/ kids'

In [103]:
# Single senior w/o kids: not married, zero dependents
all_excluded.loc[
    all_excluded['age_group'].eq('Senior')
    & all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].eq(0),
    'profile_age',
] = 'Single senior w/o kids'

In [104]:
# Single senior w/ kids: not married, at least one dependent
all_excluded.loc[
    all_excluded['age_group'].eq('Senior')
    & all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].gt(0),
    'profile_age',
] = 'Single senior w/ kids'

In [105]:
# Married senior w/o kids: married, zero dependents
all_excluded.loc[
    all_excluded['age_group'].eq('Senior')
    & all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].eq(0),
    'profile_age',
] = 'Married senior w/o kids'

In [106]:
# Married senior w/ kids: married, at least one dependent
all_excluded.loc[
    all_excluded['age_group'].eq('Senior')
    & all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].gt(0),
    'profile_age',
] = 'Married senior w/ kids'

In [107]:
# Counting rows per profile_age; dropna=False keeps unprofiled rows visible
profile_age_counts = all_excluded['profile_age'].value_counts(dropna=False)
profile_age_counts

profile_age
Married middle age w/ kids     10925665
Married senior w/ kids          7929233
Single middle age w/o kids      3646792
Married young adult w/ kids     2888813
Single senior w/o kids          2645271
Single young adult w/ kids      1481172
Single young adult w/o kids     1447618
Name: count, dtype: int64

In [110]:
# Profiling customers by combination of income, family status, and number of children
# Single w/o kids low income: 'Single' means any fam_status other than 'married'
all_excluded.loc[
    all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].eq(0)
    & all_excluded['income_group'].eq('low income'),
    'profile_income',
] = 'Single w/o kids low income'

In [111]:
# Single w/ kids low income: not married, at least one dependent
all_excluded.loc[
    all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].gt(0)
    & all_excluded['income_group'].eq('low income'),
    'profile_income',
] = 'Single w/ kids low income'

In [112]:
# Married w/o kids low income: married, zero dependents
all_excluded.loc[
    all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].eq(0)
    & all_excluded['income_group'].eq('low income'),
    'profile_income',
] = 'Married w/o kids low income'

In [113]:
# Married w/ kids low income: married, at least one dependent
all_excluded.loc[
    all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].gt(0)
    & all_excluded['income_group'].eq('low income'),
    'profile_income',
] = 'Married w/ kids low income'

In [115]:
# Single w/o kids mid income: not married, zero dependents
all_excluded.loc[
    all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].eq(0)
    & all_excluded['income_group'].eq('mid income'),
    'profile_income',
] = 'Single w/o kids mid income'

In [116]:
# Single w/ kids mid income: not married, at least one dependent
all_excluded.loc[
    all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].gt(0)
    & all_excluded['income_group'].eq('mid income'),
    'profile_income',
] = 'Single w/ kids mid income'

In [117]:
# Married w/o kids mid income: married, zero dependents
all_excluded.loc[
    all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].eq(0)
    & all_excluded['income_group'].eq('mid income'),
    'profile_income',
] = 'Married w/o kids mid income'

In [118]:
# Married w/ kids mid income: married, at least one dependent
all_excluded.loc[
    all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].gt(0)
    & all_excluded['income_group'].eq('mid income'),
    'profile_income',
] = 'Married w/ kids mid income'

In [119]:
# Single w/o kids high income: not married, zero dependents
all_excluded.loc[
    all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].eq(0)
    & all_excluded['income_group'].eq('high income'),
    'profile_income',
] = 'Single w/o kids high income'

In [120]:
# Single w/ kids high income: not married, at least one dependent
all_excluded.loc[
    all_excluded['fam_status'].ne('married')
    & all_excluded['number_of_dependents'].gt(0)
    & all_excluded['income_group'].eq('high income'),
    'profile_income',
] = 'Single w/ kids high income'

In [121]:
# Married w/o kids high income: married, zero dependents
all_excluded.loc[
    all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].eq(0)
    & all_excluded['income_group'].eq('high income'),
    'profile_income',
] = 'Married w/o kids high income'

In [122]:
# Married w/ kids high income: married, at least one dependent
all_excluded.loc[
    all_excluded['fam_status'].eq('married')
    & all_excluded['number_of_dependents'].gt(0)
    & all_excluded['income_group'].eq('high income'),
    'profile_income',
] = 'Married w/ kids high income'

In [123]:
# Counting rows per profile_income; dropna=False keeps unprofiled rows visible
profile_income_counts = all_excluded['profile_income'].value_counts(dropna=False)
profile_income_counts

profile_income
Married w/ kids mid income     8366651
Married w/ kids high income    6840076
Married w/ kids low income     6536984
Single w/o kids mid income     2943567
Single w/o kids low income     2474608
Single w/o kids high income    2321506
Single w/ kids low income       895088
Single w/ kids mid income       567883
Single w/ kids high income       18201
Name: count, dtype: int64

In [124]:
# Saving the profiled dataframe as a pickle in the prepared-data folder
profiles_file = os.path.join(path, '02 Data', 'Prepared Data', 'customer_profiles.pkl')
all_excluded.to_pickle(profiles_file)