### Online Retails Data with KMeans Clustering
- Data Source: https://archive.ics.uci.edu/dataset/502/online+retail+ii
- date: 2024-11-01

In [2]:
### Setup
import pandas as pd
import matplotlib.pyplot as pyplot
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 500)

### Data Exploration

In [3]:
df = pd.read_excel('datasets/online_retail_II.xlsx', sheet_name='Year 2010-2011')
df.head()


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      541910 non-null  object        
 1   StockCode    541910 non-null  object        
 2   Description  540456 non-null  object        
 3   Quantity     541910 non-null  int64         
 4   InvoiceDate  541910 non-null  datetime64[ns]
 5   Price        541910 non-null  float64       
 6   Customer ID  406830 non-null  float64       
 7   Country      541910 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [6]:
df.isna().sum()

Invoice             0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
Price               0
Customer ID    135080
Country             0
dtype: int64

In [11]:
# Get value counts and percentages for each country
country_distribution = df['Country'].value_counts()
country_percentages = df['Country'].value_counts(normalize=True) * 100

# Display both counts and percentages
country_analysis = pd.DataFrame({
    'Count': country_distribution,
    'Percentage': country_percentages
})

# Sort by count in descending order
country_analysis = country_analysis.sort_values('Count', ascending=False)

# Round percentages to 2 decimal places
country_analysis['Percentage'] = country_analysis['Percentage'].round(2)

print("Top 5 Distribution of data by country:")
print(country_analysis[:5])

Top 5 Distribution of data by country:
                 Count  Percentage
Country                           
United Kingdom  495478       91.43
Germany           9495        1.75
France            8558        1.58
EIRE              8196        1.51
Spain             2533        0.47


In [12]:
# First, let's see the invoices with missing Customer IDs
missing_customer_invoices = df[df['Customer ID'].isna()]['Invoice'].unique()

# Get count of how many times each invoice appears
invoice_analysis = df[df['Customer ID'].isna()].groupby('Invoice').agg({
    'Customer ID': 'count',  # Count of rows for each invoice
    'StockCode': 'count',    # Number of items in invoice
    'Price': 'sum'          # Total value of invoice
}).rename(columns={'Customer ID': 'Number_of_Items'})

# Check if these invoices ever appear with a Customer ID
invoices_with_customer = df[
    (df['Invoice'].isin(missing_customer_invoices)) & 
    (df['Customer ID'].notna())
]

# Summary of findings
print(f"Number of invoices with missing Customer ID: {len(missing_customer_invoices)}")
print("\nSample of invoices with missing Customer IDs:")
print(invoice_analysis.head())

if len(invoices_with_customer) > 0:
    print("\nWARNING: Found invoices that appear with both missing and non-missing Customer IDs:")
    print(invoices_with_customer[['Invoice', 'Customer ID']].head())
else:
    print("\nAll invoices with missing Customer IDs are consistently missing (good consistency)")

# Additional analysis by invoice characteristics
print("\nAnalysis of transactions with missing Customer IDs:")
missing_analysis = df[df['Customer ID'].isna()].agg({
    'Invoice': 'nunique',
    'StockCode': 'count',
    'Price': ['sum', 'mean'],
    'Quantity': ['sum', 'mean']
}).round(2)
print(missing_analysis)

# Check if there are any patterns
missing_patterns = df[df['Customer ID'].isna()].groupby(['Country', 'Invoice']).size().reset_index()
print("\nDistribution of missing Customer IDs by country:")
print(missing_patterns.groupby('Country').size().sort_values(ascending=False))

Number of invoices with missing Customer ID: 3710

Sample of invoices with missing Customer IDs:
         Number_of_Items  StockCode    Price
Invoice                                     
536414                 0          1     0.00
536544                 0        527 2,987.72
536545                 0          1     0.00
536546                 0          1     0.00
536547                 0          1     0.00

All invoices with missing Customer IDs are consistently missing (good consistency)

Analysis of transactions with missing Customer IDs:
         Invoice  StockCode        Price   Quantity
nunique 3,710.00        NaN          NaN        NaN
count        NaN 135,080.00          NaN        NaN
sum          NaN        NaN 1,090,984.01 269,562.00
mean         NaN        NaN         8.08       2.00

Distribution of missing Customer IDs by country:
Country
United Kingdom    3637
EIRE                41
Hong Kong           15
Unspecified          5
Israel               3
Switzerland       

In [5]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,Price,Customer ID
count,541910.0,541910,541910.0,406830.0
mean,9.55,2011-07-04 13:35:22.342307584,4.61,15287.68
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.08,,96.76,1713.6
