### Online Retails Data with KMeans Clustering
- Data Source: https://archive.ics.uci.edu/dataset/502/online+retail+ii
- date: 2024-11-01

In [15]:
### Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 500)

In [16]:
import warnings
warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)

### Basic Data Exploration

In [17]:
df = pd.read_excel('datasets/online_retail_II.xlsx', sheet_name='Year 2010-2011')
df.head()


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      541910 non-null  object        
 1   StockCode    541910 non-null  object        
 2   Description  540456 non-null  object        
 3   Quantity     541910 non-null  int64         
 4   InvoiceDate  541910 non-null  datetime64[ns]
 5   Price        541910 non-null  float64       
 6   Customer ID  406830 non-null  float64       
 7   Country      541910 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [None]:
# Convert to string and replace 'nan' with actual NaN (None)
df['Customer ID'] = df['Customer ID'].astype(str).replace('nan', None)


In [20]:
# Check for missing values
df.isna().sum()

Invoice           0
StockCode         0
Description    1454
Quantity          0
InvoiceDate       0
Price             0
Customer ID       0
Country           0
dtype: int64

### Data Composition

In [21]:
# Get value counts and percentages for each country
country_distribution = df['Country'].value_counts()
country_percentages = df['Country'].value_counts(normalize=True) * 100

# Display both counts and percentages
country_analysis = pd.DataFrame({
    'Count': country_distribution,
    'Percentage': country_percentages
})

# Sort by count in descending order
country_analysis = country_analysis.sort_values('Count', ascending=False)

# Round percentages to 2 decimal places
country_analysis['Percentage'] = country_analysis['Percentage'].round(2)

print("Top 5 Distribution of data by country:")
print(country_analysis[:5])

Top 5 Distribution of data by country:
                 Count  Percentage
Country                           
United Kingdom  495478       91.43
Germany           9495        1.75
France            8558        1.58
EIRE              8196        1.51
Spain             2533        0.47


In [22]:
# First, let's see the invoices with missing Customer IDs
missing_customer_invoices = df[df['Customer ID'].isna()]['Invoice'].unique()

# Get count of how many times each invoice appears
invoice_analysis = df[df['Customer ID'].isna()].groupby('Invoice').agg({
    'Customer ID': 'count',  # Count of rows for each invoice
    'StockCode': 'count',    # Number of items in invoice
    'Price': 'sum'          # Total value of invoice
}).rename(columns={'Customer ID': 'Number_of_Items'})

# Check if these invoices ever appear with a Customer ID
invoices_with_customer = df[
    (df['Invoice'].isin(missing_customer_invoices)) & 
    (df['Customer ID'].notna())
]

# Summary of findings
print(f"Number of invoices with missing Customer ID: {len(missing_customer_invoices)}")
print("\nSample of invoices with missing Customer IDs:")
print(invoice_analysis.head())

if len(invoices_with_customer) > 0:
    print("\nWARNING: Found invoices that appear with both missing and non-missing Customer IDs:")
    print(invoices_with_customer[['Invoice', 'Customer ID']].head())
else:
    print("\nAll invoices with missing Customer IDs are consistently missing (good consistency)")

# Additional analysis by invoice characteristics
print("\nAnalysis of transactions with missing Customer IDs:")
missing_analysis = df[df['Customer ID'].isna()].agg({
    'Invoice': 'nunique',
    'StockCode': 'count',
    'Price': ['sum', 'mean'],
    'Quantity': ['sum', 'mean']
}).round(2)
print(missing_analysis)

# Check if there are any patterns
missing_patterns = df[df['Customer ID'].isna()].groupby(['Country', 'Invoice']).size().reset_index()
print("\nDistribution of missing Customer IDs by country:")
print(missing_patterns.groupby('Country').size().sort_values(ascending=False))

Number of invoices with missing Customer ID: 0

Sample of invoices with missing Customer IDs:
Empty DataFrame
Columns: [Number_of_Items, StockCode, Price]
Index: []

All invoices with missing Customer IDs are consistently missing (good consistency)

Analysis of transactions with missing Customer IDs:
         Invoice  StockCode  Price  Quantity
nunique     0.00        NaN    NaN       NaN
count        NaN       0.00    NaN       NaN
sum          NaN        NaN   0.00      0.00
mean         NaN        NaN    NaN       NaN

Distribution of missing Customer IDs by country:
Series([], dtype: int64)


In [23]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,Price
count,541910.0,541910,541910.0
mean,9.55,2011-07-04 13:35:22.342307584,4.61
min,-80995.0,2010-12-01 08:26:00,-11062.06
25%,1.0,2011-03-28 11:34:00,1.25
50%,3.0,2011-07-19 17:17:00,2.08
75%,10.0,2011-10-19 11:27:00,4.13
max,80995.0,2011-12-09 12:50:00,38970.0
std,218.08,,96.76


### Data Preprocessing

In [24]:
# Check for negative values in Quantity and Price
negative_quantity = df[df['Quantity'] < 0]
negative_price = df[df['Price'] < 0]

# Summary of findings
print(f"Number of rows with negative Quantity: {len(negative_quantity)}")
print(f"Number of rows with negative Price: {len(negative_price)}")

# Additional analysis
print("\nSample of rows with negative Quantity:")
print(negative_quantity.head())

print("\nSample of rows with negative Price:")
print(negative_price.head())

# Check for negative values in Quantity with StockCode as 'D'
negative_d = df[(df['Quantity'] < 0) & (df['StockCode'] == 'D')]
print(f"Number of rows with negative Quantity and StockCode 'D': {len(negative_d)}")
negative_d.head()


Number of rows with negative Quantity: 10624
Number of rows with negative Price: 2

Sample of rows with negative Quantity:
     Invoice StockCode                       Description  Quantity  \
141  C536379         D                          Discount        -1   
154  C536383    35004C   SET OF 3 COLOURED  FLYING DUCKS        -1   
235  C536391     22556    PLASTERS IN TIN CIRCUS PARADE        -12   
236  C536391     21984  PACK OF 12 PINK PAISLEY TISSUES        -24   
237  C536391     21983  PACK OF 12 BLUE PAISLEY TISSUES        -24   

            InvoiceDate  Price Customer ID         Country  
141 2010-12-01 09:41:00  27.50     14527.0  United Kingdom  
154 2010-12-01 09:49:00   4.65     15311.0  United Kingdom  
235 2010-12-01 10:24:00   1.65     17548.0  United Kingdom  
236 2010-12-01 10:24:00   0.29     17548.0  United Kingdom  
237 2010-12-01 10:24:00   0.29     17548.0  United Kingdom  

Sample of rows with negative Price:
        Invoice StockCode      Description  Quantity 

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527.0,United Kingdom
9038,C537164,D,Discount,-1,2010-12-05 13:21:00,29.29,14527.0,United Kingdom
14498,C537597,D,Discount,-1,2010-12-07 12:34:00,281.0,15498.0,United Kingdom
19392,C537857,D,Discount,-1,2010-12-08 16:00:00,267.12,17340.0,United Kingdom
31134,C538897,D,Discount,-1,2010-12-15 09:14:00,5.76,16422.0,United Kingdom


In [25]:
# Using only the UK data with non-missing values and update the data types
df_uk = df[(df['Country'] == 'United Kingdom') & df.notna().all(axis=1)]

print("Data types after conversion:\n")
df_uk.info()

print("\nSummary statistics for UK data:")
df_uk.describe()

Data types after conversion:

<class 'pandas.core.frame.DataFrame'>
Index: 494024 entries, 0 to 541893
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      494024 non-null  object        
 1   StockCode    494024 non-null  object        
 2   Description  494024 non-null  object        
 3   Quantity     494024 non-null  int64         
 4   InvoiceDate  494024 non-null  datetime64[ns]
 5   Price        494024 non-null  float64       
 6   Customer ID  494024 non-null  object        
 7   Country      494024 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 33.9+ MB

Summary statistics for UK data:


Unnamed: 0,Quantity,InvoiceDate,Price
count,494024.0,494024,494024.0
mean,8.66,2011-07-04 08:01:31.008777472,4.55
min,-80995.0,2010-12-01 08:26:00,-11062.06
25%,1.0,2011-03-27 12:21:00,1.25
50%,3.0,2011-07-19 13:01:00,2.1
75%,10.0,2011-10-20 11:51:00,4.13
max,80995.0,2011-12-09 12:49:00,38970.0
std,227.54,,99.46
