# Exploratory Data Analysis (EDA)

This notebook performs comprehensive exploratory data analysis on the pricing dataset using the project's data loading, feature engineering, and metrics modules.

## Objectives
- Load and explore the raw dataset
- Analyze feature distributions and relationships
- Identify data quality issues
- Understand temporal patterns
- Prepare data for feature engineering


In [3]:
# Install required packages if not available
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip into the interpreter running this kernel.

    The install is best-effort: any failure is reported on stdout instead of
    being raised, so the notebook keeps running.

    Args:
        package: Requirement specifier passed verbatim to ``pip install``.

    Returns:
        bool: ``True`` when pip exited successfully, ``False`` otherwise.
    """
    # Local import so this block does not depend on the cell's import lines.
    import importlib

    try:
        # sys.executable targets the kernel's own environment rather than
        # whichever ``pip`` happens to be first on PATH.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
    except Exception as e:
        print(f"⚠️  Could not install {package}: {e}")
        return False

    # The package was added while the interpreter was already running; the
    # import system's finders may hold stale directory listings, so refresh
    # them to make the new package importable without a kernel restart.
    importlib.invalidate_caches()
    print(f"✅ {package} installed successfully")
    return True

# Make sure pydantic-settings is importable; attempt an install only when the import fails
try:
    import pydantic_settings
except ImportError:
    print("Installing pydantic-settings...")
    install_package('pydantic-settings')
else:
    print("✅ pydantic-settings already available")


✅ pydantic-settings already available


In [4]:
# Import project modules
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Project modules
from pricing_rf.data import load_data, clean_data, get_time_series_cv_splits
from pricing_rf.features import create_feature_pipeline, create_all_features
from pricing_rf.metrics import wape, tail_mae, evaluate_model
from pricing_rf.config import DataConfig

# Plot defaults. Order matters: the stylesheet is applied first and the figure
# size afterwards, so the explicit size is the value that ends up in rcParams.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rc('figure', figsize=(12, 8))


## 1. Data Loading and Initial Exploration


In [5]:
# Read the raw dataset through the project's loader, then report its size and columns
print("Loading raw data...")
raw_data = load_data('../data/raw.csv')

raw_shape = raw_data.shape
raw_columns = list(raw_data.columns)
print(f"Raw data shape: {raw_shape}")
print(f"Columns: {raw_columns}")

# Preview the leading rows (rendered as the cell's output)
raw_data.head()


Loading raw data...
Raw data shape: (10000, 6)
Columns: ['timestamp', 'price', 'feature1', 'feature2', 'feature3', 'category_feature']


Unnamed: 0,timestamp,price,feature1,feature2,feature3,category_feature
0,2020-01-01,107.45,,3.17,-0.67,category_c
1,2020-01-02,98.28,1.43,2.89,0.02,category_a
2,2020-01-03,110.41,1.26,1.3,0.46,category_b
3,2020-01-04,123.89,2.1,3.84,1.11,category_a
4,2020-01-05,97.88,2.92,0.22,1.68,category_b


In [6]:
# Headline facts about the raw frame: size, memory footprint, time span, category count
print("Data Info:")
print(f"Shape: {raw_data.shape}")

memory_mb = raw_data.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

# NOTE(review): 'timestamp' is reported as object dtype below, so min()/max()
# compare strings; that equals chronological order only for zero-padded ISO dates.
timestamps = raw_data['timestamp']
first_ts, last_ts = timestamps.min(), timestamps.max()
print(f"Date range: {first_ts} to {last_ts}")

n_categories = raw_data['category_feature'].nunique()
print(f"Unique categories: {n_categories}")

# Column dtypes, printed after a blank separator line
print("\nData Types:")
print(raw_data.dtypes)


Data Info:
Shape: (10000, 6)
Memory usage: 1.43 MB
Date range: 2020-01-01 to 2047-05-18
Unique categories: 3

Data Types:
timestamp            object
price               float64
feature1            float64
feature2            float64
feature3            float64
category_feature     object
dtype: object


## 2. Data Quality Analysis


In [7]:
# Per-column null counts and their share of all rows (in percent)
missing_data = raw_data.isna().sum()
missing_percent = missing_data.div(len(raw_data)).mul(100)

# Keep only the columns that actually contain nulls, largest count first
missing_df = (
    pd.DataFrame({'Missing Count': missing_data, 'Missing %': missing_percent})
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

if missing_df.empty:
    print("No missing values found!")
else:
    print("Missing Values:")
    print(missing_df)

# Count rows that exactly repeat an earlier row
duplicates = raw_data.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")


Missing Values:
          Missing Count  Missing %
feature1            200        2.0

Duplicate rows: 0


# Exploratory Data Analysis (EDA)

This notebook performs exploratory data analysis on the pricing dataset to understand:
- Data structure and quality
- Feature distributions and relationships
- Time series patterns
- Missing values and outliers
- Target variable characteristics


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set style: apply the 'seaborn-v0_8' matplotlib stylesheet and the "husl" seaborn palette.
# NOTE(review): an earlier cell already makes these same two calls and then sets
# figure.figsize; re-applying the stylesheet here may override that size — confirm intended.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [9]:
# Read the raw CSV straight into pandas and report its size and column names
df = pd.read_csv('../data/raw.csv')

df_shape = df.shape
df_columns = list(df.columns)
print(f"Dataset shape: {df_shape}")
print(f"Columns: {df_columns}")

# Preview the first rows as the cell's output
df.head()


Dataset shape: (10000, 6)
Columns: ['timestamp', 'price', 'feature1', 'feature2', 'feature3', 'category_feature']


Unnamed: 0,timestamp,price,feature1,feature2,feature3,category_feature
0,2020-01-01,107.45,,3.17,-0.67,category_c
1,2020-01-02,98.28,1.43,2.89,0.02,category_a
2,2020-01-03,110.41,1.26,1.3,0.46,category_b
3,2020-01-04,123.89,2.1,3.84,1.11,category_a
4,2020-01-05,97.88,2.92,0.22,1.68,category_b
