## Topic: online shopping
- Population of interest: all Ironhack students
- Method: convenience sampling

In [4]:
#  Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.datasets import load_iris, fetch_california_housing
from statsmodels.stats.weightstats import DescrStatsW

# Set plot styling: load matplotlib's 'seaborn-v0_8' style sheet, then apply
# seaborn's theme with the "whitegrid" axes style.
# NOTE(review): sns.set() runs second, so it presumably overrides any
# overlapping rcParams from the style sheet — confirm which look is intended.
plt.style.use('seaborn-v0_8')
sns.set(style="whitegrid")

In [6]:
# Load the survey responses from shopping.csv (the path is relative to the
# notebook's working directory) and display the raw frame.

shopping = pd.read_csv("shopping.csv")
shopping

Unnamed: 0,Timestamp,First name,Age,"Annual Income (in euros, approximate)",Q1. Online orders in the past 30 days,Q2. Approx. total online spend in the past 30 days,Q3. What categories did you buy? (check all that apply),Q4. Which device do you use most to shop online?,Q5. What usually triggers a purchase? (choose one),Q6. How important is delivery speed when choosing a retailer?
0,10/2/2025 11:17:23,Marta,29,25000,1-2,€51-150,"[1] Food/Groceries, [4] Learning (books/course...",[2] Laptop/Desktop,[1] I needed it,[3] Moderately important
1,10/2/2025 11:18:07,Jonas,34,60000,3-5,€151-300,"[1] Food/Groceries, [2] Apparel/Accessories, [...",[1] Phone,[1] I needed it,[5] Extremely important
2,10/2/2025 11:19:54,Aisha,26,45000,6-10,€300+,"[2] Apparel/Accessories, [3] Electronics/Hardware",[1] Phone,[2] I saw a sale/discount,[4] Very important
3,10/2/2025 11:20:32,Levi,31,50000,3-5,€151-300,"[1] Food/Groceries, [5] Household/Appliances/C...",[2] Laptop/Desktop,[1] I needed it,[4] Very important
4,10/2/2025 11:21:22,Petra,22,15000,1-2,€51-150,"[1] Food/Groceries, [2] Apparel/Accessories",[1] Phone,[4] Boredom/impulse,[2] Slightly important
5,10/2/2025 11:22:05,Carlos,37,55000,1-2,€51-150,"[3] Electronics/Hardware, [4] Learning (books/...",[2] Laptop/Desktop,[2] I saw a sale/discount,[3] Moderately important
6,10/2/2025 11:22:51,Sara,28,30000,1-2,€1-50,"[2] Apparel/Accessories, [4] Learning (books/c...",[2] Laptop/Desktop,[2] I saw a sale/discount,[3] Moderately important


### EDA

In [9]:
# Map the verbose survey question headers to short snake_case column names.
# `rename` returns a new frame that is bound back to `shopping`, which avoids
# `inplace=True`. Labels that are not present in the
# frame are ignored by default, so re-running this cell is a harmless no-op.
# "Timestamp" and "Age" are intentionally left as they are.
survey_column_names = {
    "First name": "first_name",
    "Annual Income (in euros, approximate)": "income",
    "Q1. Online orders in the past 30 days": "orders_30d",
    "Q2. Approx. total online spend in the past 30 days": "spend_30d",
    "Q3. What categories did you buy? (check all that apply)": "purchase_categories",
    "Q4. Which device do you use most to shop online?": "device_used",
    "Q5. What usually triggers a purchase? (choose one)": "purchase_trigger",
    "Q6. How important is delivery speed when choosing a retailer?": "delivery_speed",
}
shopping = shopping.rename(columns=survey_column_names)

shopping

Unnamed: 0,Timestamp,first_name,Age,income,orders_30d,spend_30d,purchase_categories,device_used,purchase_trigger,delivery_speed
0,10/2/2025 11:17:23,Marta,29,25000,1-2,€51-150,"[1] Food/Groceries, [4] Learning (books/course...",[2] Laptop/Desktop,[1] I needed it,[3] Moderately important
1,10/2/2025 11:18:07,Jonas,34,60000,3-5,€151-300,"[1] Food/Groceries, [2] Apparel/Accessories, [...",[1] Phone,[1] I needed it,[5] Extremely important
2,10/2/2025 11:19:54,Aisha,26,45000,6-10,€300+,"[2] Apparel/Accessories, [3] Electronics/Hardware",[1] Phone,[2] I saw a sale/discount,[4] Very important
3,10/2/2025 11:20:32,Levi,31,50000,3-5,€151-300,"[1] Food/Groceries, [5] Household/Appliances/C...",[2] Laptop/Desktop,[1] I needed it,[4] Very important
4,10/2/2025 11:21:22,Petra,22,15000,1-2,€51-150,"[1] Food/Groceries, [2] Apparel/Accessories",[1] Phone,[4] Boredom/impulse,[2] Slightly important
5,10/2/2025 11:22:05,Carlos,37,55000,1-2,€51-150,"[3] Electronics/Hardware, [4] Learning (books/...",[2] Laptop/Desktop,[2] I saw a sale/discount,[3] Moderately important
6,10/2/2025 11:22:51,Sara,28,30000,1-2,€1-50,"[2] Apparel/Accessories, [4] Learning (books/c...",[2] Laptop/Desktop,[2] I saw a sale/discount,[3] Moderately important


In [10]:
# Check column dtypes and non-null counts after the rename.
shopping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Timestamp            7 non-null      object
 1   first_name           7 non-null      object
 2   Age                  7 non-null      int64 
 3   income               7 non-null      object
 4   orders_30d           7 non-null      object
 5   spend_30d            7 non-null      object
 6   purchase_categories  7 non-null      object
 7   device_used          7 non-null      object
 8   purchase_trigger     7 non-null      object
 9   delivery_speed       7 non-null      object
dtypes: int64(1), object(9)
memory usage: 692.0+ bytes


In [14]:
# Rebind `shopping` to a copy of itself.
# NOTE(review): presumably meant to detach the frame before the column
# assignments in later cells — confirm it is still needed.
shopping = shopping.copy()

In [15]:
# Display the frame; the values still carry the "[n] " option-code prefixes.
shopping

Unnamed: 0,Timestamp,first_name,Age,income,orders_30d,spend_30d,purchase_categories,device_used,purchase_trigger,delivery_speed
0,10/2/2025 11:17:23,Marta,29,25000,1-2,€51-150,"[1] Food/Groceries, [4] Learning (books/course...",[2] Laptop/Desktop,[1] I needed it,[3] Moderately important
1,10/2/2025 11:18:07,Jonas,34,60000,3-5,€151-300,"[1] Food/Groceries, [2] Apparel/Accessories, [...",[1] Phone,[1] I needed it,[5] Extremely important
2,10/2/2025 11:19:54,Aisha,26,45000,6-10,€300+,"[2] Apparel/Accessories, [3] Electronics/Hardware",[1] Phone,[2] I saw a sale/discount,[4] Very important
3,10/2/2025 11:20:32,Levi,31,50000,3-5,€151-300,"[1] Food/Groceries, [5] Household/Appliances/C...",[2] Laptop/Desktop,[1] I needed it,[4] Very important
4,10/2/2025 11:21:22,Petra,22,15000,1-2,€51-150,"[1] Food/Groceries, [2] Apparel/Accessories",[1] Phone,[4] Boredom/impulse,[2] Slightly important
5,10/2/2025 11:22:05,Carlos,37,55000,1-2,€51-150,"[3] Electronics/Hardware, [4] Learning (books/...",[2] Laptop/Desktop,[2] I saw a sale/discount,[3] Moderately important
6,10/2/2025 11:22:51,Sara,28,30000,1-2,€1-50,"[2] Apparel/Accessories, [4] Learning (books/c...",[2] Laptop/Desktop,[2] I saw a sale/discount,[3] Moderately important


In [44]:
# Strip the "[n] " option codes that prefix each survey choice
# (e.g. "[2] Laptop/Desktop" -> "Laptop/Desktop"). For the multi-select
# column every code inside the comma-separated answer is removed.
# The same cleanup applies to every coded column, so it lives in one loop
# instead of four copy-pasted statements.
# NOTE: astype(str) would turn a missing value into the literal string "nan";
# the earlier info() output reports no nulls in these columns.
option_code_pattern = r'\[\d+\]\s*'
coded_columns = [
    'purchase_categories',
    'device_used',
    'purchase_trigger',
    'delivery_speed',
]
for column in coded_columns:
    shopping[column] = (
        shopping[column]
        .astype(str)
        .str.replace(option_code_pattern, '', regex=True)
        .str.strip()
    )

In [46]:
# Display the frame after stripping the option codes.
# NOTE(review): the saved output lists columns (e.g. device_used_code,
# cat_list) that no cell visible above creates — verify that the notebook
# reproduces this under Restart Kernel -> Run All.
shopping

Unnamed: 0,Timestamp,first_name,Age,income,orders_30d,spend_30d,purchase_categories,device_used,purchase_trigger,delivery_speed,...,delivery_speed_code,delivery_speed_label,purchase_categories_code,purchase_categories_label,cat_Apparel/Accessories,cat_Electronics/Hardware,cat_Food/Groceries,cat_Household/Appliances/Cleaning,cat_Learning (books/courses/software),cat_list
0,10/2/2025 11:17:23,Marta,29,25000,1-2,€51-150,"Food/Groceries, Learning (books/courses/softwa...",Laptop/Desktop,I needed it,Moderately important,...,3,Moderately important,1,"Food/Groceries, Learning (books/courses/softwa...",0,0,1,1,1,"[Food/Groceries, Learning (books/courses/softw..."
1,10/2/2025 11:18:07,Jonas,34,60000,3-5,€151-300,"Food/Groceries, Apparel/Accessories, Household...",Phone,I needed it,Extremely important,...,5,Extremely important,1,"Food/Groceries, Apparel/Accessories, Household...",1,0,1,1,0,"[Food/Groceries, Apparel/Accessories, Househol..."
2,10/2/2025 11:19:54,Aisha,26,45000,6-10,€300+,"Apparel/Accessories, Electronics/Hardware",Phone,I saw a sale/discount,Very important,...,4,Very important,2,"Apparel/Accessories, Electronics/Hardware",1,1,0,0,0,"[Apparel/Accessories, Electronics/Hardware]"
3,10/2/2025 11:20:32,Levi,31,50000,3-5,€151-300,"Food/Groceries, Household/Appliances/Cleaning",Laptop/Desktop,I needed it,Very important,...,4,Very important,1,"Food/Groceries, Household/Appliances/Cleaning",0,0,1,1,0,"[Food/Groceries, Household/Appliances/Cleaning]"
4,10/2/2025 11:21:22,Petra,22,15000,1-2,€51-150,"Food/Groceries, Apparel/Accessories",Phone,Boredom/impulse,Slightly important,...,2,Slightly important,1,"Food/Groceries, Apparel/Accessories",1,0,1,0,0,"[Food/Groceries, Apparel/Accessories]"
5,10/2/2025 11:22:05,Carlos,37,55000,1-2,€51-150,"Electronics/Hardware, Learning (books/courses/...",Laptop/Desktop,I saw a sale/discount,Moderately important,...,3,Moderately important,3,"Electronics/Hardware, Learning (books/courses/...",0,1,0,0,1,"[Electronics/Hardware, Learning (books/courses..."
6,10/2/2025 11:22:51,Sara,28,30000,1-2,€1-50,"Apparel/Accessories, Learning (books/courses/s...",Laptop/Desktop,I saw a sale/discount,Moderately important,...,3,Moderately important,2,"Apparel/Accessories, Learning (books/courses/s...",1,0,0,0,1,"[Apparel/Accessories, Learning (books/courses/..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   Timestamp                              7 non-null      object
 1   first_name                             7 non-null      object
 2   Age                                    7 non-null      int64 
 3   income                                 7 non-null      object
 4   orders_30d                             7 non-null      object
 5   spend_30d                              7 non-null      object
 6   purchase_categories                    7 non-null      object
 7   device_used                            7 non-null      object
 8   purchase_trigger                       7 non-null      object
 9   delivery_speed                         7 non-null      object
 10  device_used_code                       7 non-null      int64 
 11  device_used_label      

In [66]:
# Income: keep only the digit characters of each answer, then cast to int.
# This removes thousands separators, but it would equally drop a decimal
# point or a minus sign.
income_text = shopping['income'].astype(str)
income_digits = income_text.str.replace(r'[^0-9]', '', regex=True)
shopping['income'] = income_digits.astype(int)

# Spend bracket: remove the euro symbol and surrounding whitespace. The value
# stays a text range such as "51-150" or "300+".
spend_text = shopping['spend_30d'].astype(str)
shopping['spend_30d'] = spend_text.str.replace('€', '', regex=False).str.strip()


In [None]:
# Cast `income` to an integer dtype.
# NOTE(review): the preceding cell already ends its income cleanup with
# .astype(int), so this cast looks redundant — confirm before removing.
shopping['income'] = shopping['income'].astype(int)

In [58]:
# Re-check dtypes; `income` is expected to be int64 at this point.
# NOTE(review): this cell's execution count (58) is lower than the cleaning
# cell's (66), so the cells were run out of order — re-run to refresh output.
shopping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   Timestamp                              7 non-null      object
 1   first_name                             7 non-null      object
 2   Age                                    7 non-null      int64 
 3   income                                 7 non-null      int64 
 4   orders_30d                             7 non-null      object
 5   spend_30d                              7 non-null      object
 6   purchase_categories                    7 non-null      object
 7   device_used                            7 non-null      object
 8   purchase_trigger                       7 non-null      object
 9   delivery_speed                         7 non-null      object
 10  device_used_code                       7 non-null      int64 
 11  device_used_label      

In [None]:
# Drop the `cat_list` column (presumably the intermediate list used to build
# the `cat_*` indicator columns — confirm against the cell that creates it).
# errors="ignore" keeps the cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
shopping = shopping.drop(columns="cat_list", errors="ignore")

In [67]:
# Display the frame after the income/spend cleanup and the `cat_list` drop.
shopping

Unnamed: 0,Timestamp,first_name,Age,income,orders_30d,spend_30d,purchase_categories,device_used,purchase_trigger,delivery_speed,...,purchase_trigger_label,delivery_speed_code,delivery_speed_label,purchase_categories_code,purchase_categories_label,cat_Apparel/Accessories,cat_Electronics/Hardware,cat_Food/Groceries,cat_Household/Appliances/Cleaning,cat_Learning (books/courses/software)
0,10/2/2025 11:17:23,Marta,29,25000,1-2,51-150,"Food/Groceries, Learning (books/courses/softwa...",Laptop/Desktop,I needed it,Moderately important,...,I needed it,3,Moderately important,1,"Food/Groceries, Learning (books/courses/softwa...",0,0,1,1,1
1,10/2/2025 11:18:07,Jonas,34,60000,3-5,151-300,"Food/Groceries, Apparel/Accessories, Household...",Phone,I needed it,Extremely important,...,I needed it,5,Extremely important,1,"Food/Groceries, Apparel/Accessories, Household...",1,0,1,1,0
2,10/2/2025 11:19:54,Aisha,26,45000,6-10,300+,"Apparel/Accessories, Electronics/Hardware",Phone,I saw a sale/discount,Very important,...,I saw a sale/discount,4,Very important,2,"Apparel/Accessories, Electronics/Hardware",1,1,0,0,0
3,10/2/2025 11:20:32,Levi,31,50000,3-5,151-300,"Food/Groceries, Household/Appliances/Cleaning",Laptop/Desktop,I needed it,Very important,...,I needed it,4,Very important,1,"Food/Groceries, Household/Appliances/Cleaning",0,0,1,1,0
4,10/2/2025 11:21:22,Petra,22,15000,1-2,51-150,"Food/Groceries, Apparel/Accessories",Phone,Boredom/impulse,Slightly important,...,Boredom/impulse,2,Slightly important,1,"Food/Groceries, Apparel/Accessories",1,0,1,0,0
5,10/2/2025 11:22:05,Carlos,37,55000,1-2,51-150,"Electronics/Hardware, Learning (books/courses/...",Laptop/Desktop,I saw a sale/discount,Moderately important,...,I saw a sale/discount,3,Moderately important,3,"Electronics/Hardware, Learning (books/courses/...",0,1,0,0,1
6,10/2/2025 11:22:51,Sara,28,30000,1-2,1-50,"Apparel/Accessories, Learning (books/courses/s...",Laptop/Desktop,I saw a sale/discount,Moderately important,...,I saw a sale/discount,3,Moderately important,2,"Apparel/Accessories, Learning (books/courses/s...",1,0,0,0,1
