In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

# Plot styling: seaborn white-grid theme and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

# Read the three raw CSV tables into DataFrames (paths are relative to the
# notebook's working directory).
transactions = pd.read_csv('data/raw/transactions_train.csv')
articles = pd.read_csv('data/raw/articles.csv')
customers = pd.read_csv('data/raw/customers.csv')

# Confirm the load and report (rows, columns) for each table in one call;
# sep="\n" puts every argument on its own output line.
print(
    "✓ Datasets loaded successfully!",
    f"\nTransactions shape: {transactions.shape}",
    f"Articles shape: {articles.shape}",
    f"Customers shape: {customers.shape}",
    sep="\n",
)

Libraries imported successfully!
✓ Datasets loaded successfully!

Transactions shape: (31788324, 5)
Articles shape: (105542, 25)
Customers shape: (1371980, 7)


In [2]:
# Overview of the transactions table: sample rows, column dtypes,
# per-column null counts, and the span of the t_dat column.
rule = "=" * 60
print(rule, "TRANSACTIONS DATA", rule, sep="\n")
print(transactions.head())

print("\nData types:", transactions.dtypes, sep="\n")
print("\nMissing values:", transactions.isnull().sum(), sep="\n")

# min()/max() are taken on the t_dat column exactly as it was loaded.
purchase_dates = transactions['t_dat']
print("\nDate range:")
print(f"From: {purchase_dates.min()}")
print(f"To: {purchase_dates.max()}")

TRANSACTIONS DATA
        t_dat                                        customer_id  article_id  \
0  2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1  2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   505221004   
3  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687003   
4  2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687004   

      price  sales_channel_id  
0  0.050831                 2  
1  0.030492                 2  
2  0.015237                 2  
3  0.016932                 2  
4  0.016932                 2  

Data types:
t_dat                   str
customer_id             str
article_id            int64
price               float64
sales_channel_id      int64
dtype: object

Missing values:
t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
dtype: int64

D

In [3]:
# Overview of the articles table: sample rows, column names and dtypes.
divider = "=" * 60
for banner_line in (divider, "ARTICLES DATA", divider):
    print(banner_line)
print(articles.head())

article_columns = articles.columns.tolist()
print("\nArticles columns:")
print(article_columns)

article_dtypes = articles.dtypes
print("\nData types:")
print(article_dtypes)

ARTICLES DATA
   article_id  product_code          prod_name  product_type_no  \
0   108775015        108775          Strap top              253   
1   108775044        108775          Strap top              253   
2   108775051        108775      Strap top (1)              253   
3   110065001        110065  OP T-shirt (Idro)              306   
4   110065002        110065  OP T-shirt (Idro)              306   

  product_type_name  product_group_name  graphical_appearance_no  \
0          Vest top  Garment Upper body                  1010016   
1          Vest top  Garment Upper body                  1010016   
2          Vest top  Garment Upper body                  1010017   
3               Bra           Underwear                  1010016   
4               Bra           Underwear                  1010016   

  graphical_appearance_name  colour_group_code colour_group_name  ...  \
0                     Solid                  9             Black  ...   
1                     Solid 

In [5]:
# Overview of the customers table: sample rows, column names and dtypes.
bar = '=' * 60
print('\n'.join([bar, 'CUSTOMERS DATA', bar]))
print(customers.head())

# Each header and its value go on separate lines via sep='\n'.
print('\nCustomers Columns: ', customers.columns.tolist(), sep='\n')
print('\nData Types:', customers.dtypes, sep='\n')

CUSTOMERS DATA
                                         customer_id   FN  Active  \
0  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...  NaN     NaN   
1  0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...  NaN     NaN   
2  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...  NaN     NaN   
3  00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...  NaN     NaN   
4  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...  1.0     1.0   

  club_member_status fashion_news_frequency   age  \
0             ACTIVE                   NONE  49.0   
1             ACTIVE                   NONE  25.0   
2             ACTIVE                   NONE  24.0   
3             ACTIVE                   NONE  54.0   
4             ACTIVE              Regularly  52.0   

                                         postal_code  
0  52043ee2162cf5aa7ee79974281641c6f11a68d276429a...  
1  2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...  
2  64f17e6a330a85798e4998f62d0930d14db8db1c054af6...  
3  5d36574f52495e81f019b680c843

In [13]:
# Key summary statistics across the transactions, articles and customers tables.
#
# Every aggregate that scans the full transactions frame (tens of millions of
# rows) is computed once and reused, instead of being re-evaluated inside each
# f-string. The date ranges are derived from the data rather than hard-coded,
# so the summary cannot silently drift from what was actually loaded.
TEST_WINDOW_DAYS = 7  # length of the hold-out window at the end of the history

print("=" * 60)
print("KEY STATISTICS")
print("=" * 60)

# t_dat holds zero-padded 'YYYY-MM-DD' text, so the string min/max are also the
# chronological extremes; only those two values need parsing to datetimes.
first_day = pd.to_datetime(transactions['t_dat'].min())
last_day = pd.to_datetime(transactions['t_dat'].max())
# The window is inclusive of last_day, hence the "- 1".
test_start = last_day - pd.Timedelta(days=TEST_WINDOW_DAYS - 1)
span_years = (last_day - first_day).days / 365.25

print("\n1. TEMPORAL COVERAGE:")
print(f"   Training period: {first_day:%Y-%m-%d} to {last_day:%Y-%m-%d} (~{span_years:.0f} years)")
print(f"   Test period: Last {TEST_WINDOW_DAYS} days ({test_start:%Y-%m-%d} to {last_day:%Y-%m-%d})")

# Row counts and distinct-key counts, each evaluated exactly once.
n_transactions = len(transactions)
n_customers = len(customers)
n_articles = len(articles)
n_buying_customers = transactions['customer_id'].nunique()
n_sold_articles = transactions['article_id'].nunique()

print("\n2. CUSTOMER ANALYSIS:")
print(f"   Total unique customers: {n_buying_customers:,}")
print(f"   Customers in dataset: {n_customers:,}")
# Share of rows in the customers table that appear in at least one transaction
# (assumes one row per customer in `customers` — TODO confirm uniqueness).
print(f"   Coverage: {n_buying_customers / n_customers * 100:.1f}%")

print("\n3. PRODUCT ANALYSIS:")
print(f"   Total unique articles: {n_sold_articles:,}")
print(f"   Articles in catalog: {n_articles:,}")
print(f"   Coverage: {n_sold_articles / n_articles * 100:.1f}%")

print("\n4. TRANSACTION STATS:")
print(f"   Total transactions: {n_transactions:,}")
print(f"   Avg transactions per customer: {n_transactions / n_buying_customers:.1f}")
print(f"   Avg transactions per article: {n_transactions / n_sold_articles:.1f}")

print("\n5. MISSING DATA IN CUSTOMERS:")
for flag_column in ('FN', 'Active'):
    # Count nulls once per column and reuse it for both the count and the share.
    n_missing = customers[flag_column].isnull().sum()
    print(f"   {flag_column} missing: {n_missing:,} ({n_missing / n_customers * 100:.1f}%)")

KEY STATISTICS

1. TEMPORAL COVERAGE:
   Training period: 2018-09-20 to 2020-09-22 (~2 years)
   Test period: Last 7 days (2020-09-16 to 2020-09-22)

2. CUSTOMER ANALYSIS:
   Total unique customers: 1,362,281
   Customers in dataset: 1,371,980
   Coverage: 99.3%

3. PRODUCT ANALYSIS:
   Total unique articles: 104,547
   Articles in catalog: 105,542
   Coverage: 99.1%

4. TRANSACTION STATS:
   Total transactions: 31,788,324
   Avg transactions per customer: 23.3
   Avg transactions per article: 304.1

5. MISSING DATA IN CUSTOMERS:
   FN missing: 895,050 (65.2%)
   Active missing: 907,576 (66.2%)
