# Initial Data Exploration
## 1. Load Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
# Silence every warning for the rest of the session so cell output stays
# readable. NOTE(review): this also hides deprecation and dtype warnings.
warnings.filterwarnings('ignore')

# Show every column, and up to 100 rows, when a DataFrame is displayed.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

## 2. Load Data

In [None]:
# Read the raw CSV in a single pass (low_memory=False) and convert the
# InvoiceDate column to datetimes as part of the same pipeline.
df = (
    pd.read_csv('../data/raw/online_retail.csv', low_memory=False)
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))
)

# Report (rows, columns), then preview the first rows as the cell output.
print(f"Dataset Shape: {df.shape}")
df.head()

## 3. Basic Statistics
### Q1: What is the date range of transactions?
### Q2: How many unique customers?
### Q3: How many unique products?
### Q4: How many countries are represented?

In [None]:
# Q1: earliest and latest transaction timestamps.
invoice_dates = df['InvoiceDate']
first_seen, last_seen = invoice_dates.min(), invoice_dates.max()
print(f"Date Range: {first_seen} to {last_seen}")

# Q2-Q4: distinct counts (nunique() ignores missing values by default).
n_customers = df['CustomerID'].nunique()
n_products = df['StockCode'].nunique()
n_countries = df['Country'].nunique()
print(f"Unique Customers: {n_customers}")
print(f"Unique Products: {n_products}")
print(f"Unique Countries: {n_countries}")

## 4. Data Quality Issues
### Q5: How many missing values in each column?
### Q6: What percentage of CustomerID is missing?
### Q7: Are there any duplicate rows (fully repeated invoice lines)?

In [None]:
# Build the null mask once and reuse it for both the per-column counts (Q5)
# and the CustomerID percentage (Q6).
null_mask = df.isnull()

print("Missing Values:")
print(null_mask.sum())

# The mean of a boolean column is the fraction of True values.
missing_cust_pct = null_mask['CustomerID'].mean() * 100
print(f"\nMissing CustomerID Percentage: {missing_cust_pct:.2f}%")

# Q7: rows that exactly repeat an earlier row across every column.
duplicate_row_count = df.duplicated().sum()
print(f"\nDuplicate Rows: {duplicate_row_count}")

## 5. Transaction Patterns
### Q8: What's the distribution of Quantity?
### Q9: Are there negative quantities?
### Q10: What's the price range?

In [None]:
# Q8/Q9: summary statistics for Quantity and the number of negative entries.
quantity = df['Quantity']
print(quantity.describe())
print(f"\nNegative Quantities: {quantity.lt(0).sum()}")

# Q10: summary statistics for UnitPrice (min/max give the price range).
unit_price = df['UnitPrice']
print("\nPrice Statistics:")
print(unit_price.describe())

## 6. Data Quality Summary

In [None]:
# Collect the headline data-quality figures into a JSON-serialisable dict
# and write it next to the raw data.

# Compute the null mask and the date column once instead of once per field.
null_mask = df.isnull()
customer_id_missing = null_mask['CustomerID']
invoice_dates = df['InvoiceDate']

data_quality_summary = {
    'total_rows': len(df),
    'total_columns': len(df.columns),
    # Cast each count to a built-in int so it is always written as a JSON
    # number; depending on the pandas version these may otherwise be numpy
    # integers, which `default=str` below would emit as quoted strings.
    'missing_values': {
        column: int(count) for column, count in null_mask.sum().items()
    },
    'duplicate_rows': int(df.duplicated().sum()),
    'date_range': {
        'start': str(invoice_dates.min()),
        'end': str(invoice_dates.max()),
    },
    'negative_quantities': int((df['Quantity'] < 0).sum()),
    # Counts rows whose invoice number, read as text, begins with 'C'.
    'cancelled_invoices': int(
        df['InvoiceNo'].astype(str).str.startswith('C').sum()
    ),
    'missing_customer_ids': int(customer_id_missing.sum()),
    'missing_customer_ids_percentage': float(customer_id_missing.mean() * 100),
}

# `default=str` is kept only as a fallback for any value that is still not
# natively serialisable; every value above is already a built-in type.
with open('../data/raw/data_quality_summary.json', 'w', encoding='utf-8') as f:
    json.dump(data_quality_summary, f, indent=4, default=str)

print("Data Quality Summary Saved.")