In [1]:
import sys
from pathlib import Path
import pandas as pd

# Resolve the project root as the parent of the current working directory
# (assumes the notebook is run from a first-level subfolder of the project — TODO confirm).
projectRoot = Path().resolve().parent

# Make project-local modules importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if str(projectRoot) not in sys.path:
    sys.path.append(str(projectRoot))

In [2]:
# read the raw shopper-behaviour CSV located under <projectRoot>/data/raw
dataPath = projectRoot.joinpath(
    'data',
    'raw',
    'e_commerce_shopper_behaviour_and_lifestyle.csv',
)

df = pd.read_csv(dataPath)

# report (rows, columns), then preview the first rows as the cell output
print(f'Shape: {df.shape}')
df.head()

Shape: (1000000, 60)


Unnamed: 0,user_id,age,gender,country,urban_rural,income_level,employment_status,education_level,relationship_status,has_children,...,cart_items_average,checkout_abandonments_per_month,purchase_conversion_rate,app_usage_frequency,notification_response_rate,account_age_months,last_purchase_date,social_sharing_frequency,premium_subscription,return_rate
0,1,56,Female,Germany,Suburban,90860,Self-employed,Associate Degree,Single,0,...,10,2,62,7,74,19,2025-06-22,6,1,50
1,2,69,Male,Japan,Suburban,35423,Unemployed,Bachelor,Single,1,...,5,7,54,5,23,8,2026-07-25,3,0,37
2,3,46,Female,India,Urban,21467,Self-employed,Associate Degree,Married,1,...,3,3,33,7,12,13,2026-02-26,6,0,53
3,4,32,Male,Canada,Urban,41770,Self-employed,Bachelor,Widowed,0,...,5,9,26,4,19,9,2026-10-27,7,0,98
4,5,60,Female,Japan,Urban,183882,Employed,Associate Degree,Widowed,1,...,8,0,18,7,30,3,2026-06-23,3,0,86


In [3]:
# basic info of columns: prints each column's name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 60 columns):
 #   Column                           Non-Null Count    Dtype 
---  ------                           --------------    ----- 
 0   user_id                          1000000 non-null  int64 
 1   age                              1000000 non-null  int64 
 2   gender                           1000000 non-null  object
 3   country                          1000000 non-null  object
 4   urban_rural                      1000000 non-null  object
 5   income_level                     1000000 non-null  int64 
 6   employment_status                1000000 non-null  object
 7   education_level                  1000000 non-null  object
 8   relationship_status              1000000 non-null  object
 9   has_children                     1000000 non-null  int64 
 10  household_size                   1000000 non-null  int64 
 11  occupation                       1000000 non-null  object
 12  e

In [4]:
# summary statistics for all columns (numeric and non-numeric),
# transposed so that each dataset column becomes one row
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
user_id,1000000.0,,,,500000.5,288675.278932,1.0,250000.75,500000.5,750000.25,1000000.0
age,1000000.0,,,,49.003377,18.193959,18.0,33.0,49.0,65.0,80.0
gender,1000000.0,4.0,Male,480132.0,,,,,,,
country,1000000.0,10.0,Brazil,100476.0,,,,,,,
urban_rural,1000000.0,3.0,Urban,500226.0,,,,,,,
income_level,1000000.0,,,,104994.565463,54851.476652,10000.0,57466.0,105013.0,152497.0,200000.0
employment_status,1000000.0,5.0,Retired,200246.0,,,,,,,
education_level,1000000.0,5.0,Bachelor,299953.0,,,,,,,
relationship_status,1000000.0,5.0,Divorced,200675.0,,,,,,,
has_children,1000000.0,,,,0.399426,0.489781,0.0,0.0,0.0,1.0,1.0


In [5]:
# count missing values per column and list the columns with the most gaps first
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

user_id                            0
age                                0
impulse_buying_score               0
environmental_consciousness        0
health_conscious_shopping          0
travel_frequency                   0
hobby_count                        0
social_media_influence_score       0
reading_habits                     0
exercise_frequency                 0
stress_from_financial_decisions    0
overall_stress_level               0
sleep_quality                      0
physical_activity_level            0
mental_health_score                0
daily_session_time_minutes         0
product_views_per_day              0
ad_views_per_day                   0
ad_clicks_per_day                  0
wishlist_items_count               0
cart_items_average                 0
checkout_abandonments_per_month    0
purchase_conversion_rate           0
app_usage_frequency                0
notification_response_rate         0
account_age_months                 0
last_purchase_date                 0
s

#### Column: user_id
- Type: int
- Description: unique identifier of each customer

#### Column: age
- Type: int
- Description: age of customers
- Notes: no negative values observed

#### Column: gender
- Type: categorical
- Description: customer genders
- Notes: 4 categorical values; [Male, Female, Non-binary, Other]

#### Column: country
- Type: categorical
- Description: customer nationality
- Notes: 10 categorical values

#### Column: urban_rural
- Type: categorical
- Description: customer demography
- Notes: 3 categorical values; [Urban, Suburban, Rural]

#### Column: income_level
- Type: int
- Description: customer income level
- Notes: no negative values observed

#### Column: employment_status
- Type: categorical
- Description: employment status of customers
- Notes: 5 categorical values; [Retired, Unemployed, Employed, Student, Self-employed]

#### Column: education_level
- Type: categorical
- Description: level of education of customers
- Notes: 5 categorical values; [Bachelor, High School, Associate Degree, Master, PhD]

#### Column: relationship_status
- Type: categorical
- Description: relationship status of customers
- Notes: 5 categorical values; [Divorced, Widowed, In a relationship, Married, Single]

#### Column: has_children
- Type: int
- Description: whether customer has any children or not (Bool)
- Notes: 0 and 1 for bool representation; 0: no children, 1: any number of children

#### Column: household_size
- Type: int
- Description: household size of each customer
- Notes: a total number of 10 household size values starting from 1 to 10

#### Column: occupation
- Type: categorical
- Description: occupation of customers
- Notes: 8 categorical values; [Other, Marketing, Engineering, Finance, Retail, Education, Healthcare, IT]

#### Column: ethnicity
- Type: categorical
- Description: customer ethnicity
- Notes: 5 categorical values; [Asian, African American, Other, Hispanic, Caucasian]

#### Column: language_preference
- Type: categorical
- Description: native language
- Notes: 6 categorical values; [German, Hindi, Spanish, Mandarin, English, French]

#### Column: device_type
- Type: categorical
- Description: device that customer was using when purchasing
- Notes: 3 categorical values; [Mobile, Desktop, Tablet]

#### Column: weekly_purchases
- Type: int
- Description: the frequency of purchasing behavior over weekly aggregation 
- Notes: no negative values observed, starting from 0 to 20

#### Column: monthly_spend
- Type: int
- Description: monthly expenditure at the store in $
- Notes: no negative values observed

#### Column: cart_abandonment_rate
- Type: int
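- Description: presumably the share of initiated transactions that were not completed, i.e. 1 − (completed transactions / initiated transactions); note that completed / initiated alone would be the completion rate, not the abandonment rate (TODO: confirm against the dataset documentation)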
- Notes: no negative values observed

#### Column: review_writing_frequency
- Type: int
- Description: the frequency of reviews per customer 
- Notes: No negative values observed

#### Column: average_order_value
- Type: int
- Description: average order value of customers in $
- Notes: No negative values observed

#### Column: preferred_payment_method
- Type: categorical
- Description: payment method the customer prefers to use
- Notes: 6 categorical values; [Apple Pay, Debit Card, Bank Transfer, Google Pay, PayPal, Credit Card]

#### Column: coupon_usage_frequency
- Type: int
- Description: frequency of coupon usage per customer
- Notes: no negative values observed

#### Column: loyalty_program_member
- Type: int
- Description: if customer is part of loyalty program
- Notes: 0 and 1 representing bool values; 0: not part of loyalty program, 1: loyalty program member

#### Column: referral_count
- Type: int
- Description: number of referrals
- Notes: no negative values observed

#### Column: product_category_preference
- Type: categorical
- Description: preferred product category
- Notes: 8 categorical values; [Beauty, Sports, Electronics, Books, Groceries, Fashion, Toys, Home & Kitchen]

#### Column: shopping_time_of_day
- Type: categorical
- Description: shopping time
- Notes: 4 categorical values; [Evening, Night, Morning, Afternoon]

#### Column: weekend_shopper
- Type: int
- Description: weekday/weekend transaction
- Notes: 0 and 1 representing bool values; 0: weekday, 1: weekend

#### Column: impulse_purchases_per_month
- Type: int
- Description: number of unplanned (impulse) purchases made per month, i.e. purchases decided on just before buying
- Notes: no negative values observed

#### Column: browse_to_buy_ratio
- Type: int
- Description: number of browsing actions / number of total purchases
- Notes: no negative values observed

#### Column: return_frequency
- Type: int
- Description: number of returns made by customers
- Notes: no negative values observed

#### Column: budgeting_style
- Type: categorical
- Description: customer budget classes
- Notes: 3 categorical values; [Loose, Moderate, Strict]

#### Column: brand_loyalty_score
- Type: int
- Description: how loyal a customer is to the brands they purchase
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of being loyal to brands

#### Column: impulse_buying_score
- Type: int
- Description: impulse buying behavior
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of impulse buying activity

#### Column: environmental_consciousness
- Type: int
- Description: environmentally friendly purchasing activity
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of environmentally friendly buying activity

#### Column: health_conscious_shopping
- Type: int
- Description: health conscious purchasing activity
- Notes: 0 and 1 representing bool values; 0: False, 1: True

#### Column: travel_frequency
- Type: int
- Description: travel frequency of customers
- Notes: no negative values observed

#### Column: hobby_count
- Type: int
- Description: unique hobby count of customers
- Notes: no negative values observed

#### Column: social_media_influence_score
- Type: int
- Description: whether they are affected by social media when making a purchasing decision
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of social media impact on buying activity

#### Column: reading_habits
- Type: int
- Description: reading habits per customer
- Notes: no negative values observed

#### Column: exercise_frequency
- Type: int
- Description: exercise frequency per customer
- Notes: no negative values observed

#### Column: stress_from_financial_decisions
- Type: int
- Description: stress level derived from financial decisions
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of the impact of financial decision stress on buying activity

#### Column: overall_stress_level
- Type: int
- Description: overall stress level per customer
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of the impact of overall stress level on buying activity

#### Column: sleep_quality
- Type: int
- Description: sleep quality per customer
- Notes: no negative values observed; starting from 4 to 9, the higher ordinal value, the better the sleep quality

#### Column: physical_activity_level
- Type: int
- Description: physical activity level per customer
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of physical activity 

#### Column: mental_health_score
- Type: int
- Description: mental health per customer
- Notes: no negative values observed, starting from 0 to 10, ordinal categories with 10 representing maximum degree of mental well-being

#### Column: daily_session_time_minutes
- Type: int
- Description: time spent on online store per day per customer
- Notes: no negative values observed

#### Column: product_views_per_day
- Type: int
- Description: number of products browsed per customer per day
- Notes: no negative values observed

#### Column: ad_views_per_day
- Type: int
- Description: number of ads viewed by customers per day
- Notes: no negative values observed

#### Column: ad_clicks_per_day
- Type: int
- Description: number of ads clicked per customer per day
- Notes: no negative values observed

#### Column: wishlist_items_count
- Type: int
- Description: number of items in the wishlist per customer
- Notes: no negative values observed

#### Column: cart_items_average
- Type: int
- Description: the average of number of items added in the cart per customer
- Notes: no negative values observed

#### Column: checkout_abandonments_per_month
- Type: int
- Description: the monthly frequency of a customer beginning the purchase process but not completing it 
- Notes: no negative values observed

#### Column: purchase_conversion_rate
- Type: int
- Description: percentage of visits to the online store that end in a purchase, measured per customer
- Notes: no negative values observed

#### Column: app_usage_frequency
- Type: int
- Description: app usage frequency per customer
- Notes: no negative values observed

#### Column: notification_response_rate
- Type: int
- Description: the rate of notification responses per customer
- Notes: no negative values observed

#### Column: account_age_months
- Type: int
- Description: age of the customer account in months, i.e. time elapsed since the account was created
- Notes: no negative values observed; starting from 1 to 24 months

#### Column: last_purchase_date
- Type: object (date stored as a string, e.g. 2025-06-22)
- Description: the date of last purchase by customer
- Notes: ***-this column should be converted to datetime in data handling stage***

#### Column: social_sharing_frequency
- Type: int
- Description: how frequently each customer shares items on social media
- Notes: no negative values observed

#### Column: premium_subscription
- Type: int
- Description: whether a customer made a premium subscription or not
- Notes: 0 and 1 representing bool values; 0: not a premium subscriber, 1: premium subscriber

#### Column: return_rate
- Type: int
- Description: the rate of returns per customer
- Notes: no negative values observed

### Potential ML Targets
- Spending_Level -> regression
- Purchase_Frequency -> classification
- Customer_Segment -> clustering