# 01. Data Exploration

東証ティックデータの探索と100ティックバーへの集約

**目的**:
- CSVデータの構造確認
- トヨタ(7203)データの抽出
- 100ティックバーへの集約
- 基本統計量の確認

In [None]:
# Add src to path
import sys
sys.path.insert(0, '../src')

import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Plot defaults for the whole notebook. The font family is a fallback chain
# so that Japanese text in labels can be rendered.
plt.rcParams.update({
    'font.family': ['Hiragino Sans', 'Arial Unicode MS', 'DejaVu Sans'],
    'font.size': 12,
    'figure.figsize': (14, 6),
})

print(f"Polars version: {pl.__version__}")

## 1. Load Tick Data

In [None]:
# Project-local loader; expected to be importable through the '../src'
# entry inserted into sys.path in the setup cell.
from data.loader import load_and_preprocess

# Tick CSV for 2025-10 (per the file name); the path is relative to the
# notebook's working directory.
CSV_PATH = Path("../stock_tick_data/stock_tick_202510.csv")
# NOTE(review): the notebook text refers to Toyota as 7203, while the code
# passed to the loader is "72030" - presumably the CSV stores codes with a
# trailing 0; confirm against the loader / raw data.
STOCK_CODE = "72030"  # Toyota

# Load the ticks for STOCK_CODE and report how many rows came back.
# `df` is reused by the statistics and aggregation cells below.
print(f"Loading {STOCK_CODE} tick data...")
df = load_and_preprocess(CSV_PATH, stock_code=STOCK_CODE)
print(f"Loaded {len(df):,} ticks")

In [None]:
# Preview the first 10 tick rows to inspect the columns.
# This must stay the last expression of the cell so the notebook renders it.
df.head(10)

In [None]:
# Summary statistics of the raw ticks, one describe() table per column.
for heading, column in (
    ("Price statistics:", "price"),
    ("\nVolume statistics:", "volume"),
):
    print(heading)
    print(df.select([column]).describe())

In [None]:
# Number of ticks recorded on each trade_date, sorted by trade_date.
ticks_per_day = (
    df.group_by("trade_date")
    .agg(pl.len().alias("tick_count"))
    .sort("trade_date")
)
print("Ticks per day:", ticks_per_day, sep="\n")

## 2. Aggregate to 100-Tick Bars

In [None]:
from data.bar_aggregator import aggregate_tick_bars, save_bars_parquet

# Ticks per bar, passed to aggregate_tick_bars as `bar_size`.
BAR_SIZE = 100
bars = aggregate_tick_bars(df, bar_size=BAR_SIZE)
print(f"Created {len(bars):,} bars from {len(df):,} ticks")

# Guard the ratio: if no bars were produced, the division would raise
# ZeroDivisionError and obscure the actual problem (nothing was aggregated).
if len(bars) > 0:
    print(f"Reduction ratio: {len(df) / len(bars):.1f}x")
else:
    print("Reduction ratio: n/a (no bars were created)")

In [None]:
# Preview the first 10 aggregated bars to inspect the bar columns.
# This must stay the last expression of the cell so the notebook renders it.
bars.head(10)

In [None]:
# Summary statistics of the aggregated bars, one describe() table per column.
for heading, column in (
    ("Bar Return statistics:", "bar_return"),
    ("\nBar Volume statistics:", "volume"),
):
    print(heading)
    print(bars.select([column]).describe())

## 3. Visualization

In [None]:
# Three stacked panels sharing one x-axis (bar index): close, volume, return.
fig, axes = plt.subplots(3, 1, figsize=(14, 10), sharex=True)
ax1, ax2, ax3 = axes

# Extract the plotted columns as NumPy arrays up front.
close_prices = bars["close"].to_numpy()
volumes = bars["volume"].to_numpy()
returns = bars["bar_return"].to_numpy()  # also used by the distribution cell

# Top panel: close price
ax1.plot(close_prices, linewidth=0.5)
ax1.set(
    ylabel="Close Price (JPY)",
    title=f"Toyota (7203) - 100-Tick Bars - {len(bars):,} bars",
)
ax1.grid(True, alpha=0.3)

# Middle panel: volume per bar
ax2.bar(np.arange(len(volumes)), volumes, width=1.0, alpha=0.7)
ax2.set_ylabel("Volume (shares)")
ax2.grid(True, alpha=0.3)

# Bottom panel: bar returns with a dashed zero reference line
ax3.plot(returns, linewidth=0.5, color='green')
ax3.axhline(0, color='red', linestyle='--', alpha=0.5)
ax3.set(ylabel="Bar Return", xlabel="Bar Index")
ax3.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Return distribution: histogram, normal Q-Q plot and moment statistics.
from scipy import stats

# Drop NaN returns once so that the histogram, the Q-Q plot and every
# statistic below describe exactly the same sample. `returns` comes from
# the price-chart cell.
valid_returns = returns[~np.isnan(returns)]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Histogram with a dashed reference line at zero return
ax1.hist(valid_returns, bins=100, edgecolor='black', alpha=0.7)
ax1.axvline(x=0, color='red', linestyle='--')
ax1.set_xlabel("Bar Return")
ax1.set_ylabel("Frequency")
ax1.set_title("Return Distribution")

# Q-Q plot (vs normal); probplot draws directly onto ax2
stats.probplot(valid_returns, dist="norm", plot=ax2)
ax2.set_title("Q-Q Plot (vs Normal)")

plt.tight_layout()
plt.show()

print("Return stats:")
print(f"  Mean: {np.mean(valid_returns):.6f}")
print(f"  Std:  {np.std(valid_returns):.6f}")
print(f"  Skew: {stats.skew(valid_returns):.4f}")
# scipy's default (fisher=True) reports excess kurtosis: 0.0 for a normal.
print(f"  Kurt: {stats.kurtosis(valid_returns):.4f}")

## 4. Save Processed Data

In [None]:
# Write the bars to ../data/processed as Parquet, creating the directory
# tree if it does not exist yet, then report where the file went and its size.
output_dir = Path("../data/processed")
output_path = output_dir / "toyota_7203_100tick_bars.parquet"
output_dir.mkdir(parents=True, exist_ok=True)

save_bars_parquet(bars, str(output_path))

print(f"\nSaved to: {output_path}")
size_mb = output_path.stat().st_size / 1024 / 1024
print(f"File size: {size_mb:.2f} MB")

In [None]:
# Round-trip check: read the Parquet file back and report its row count.
# NOTE(review): only the row count is printed; the contents are not compared
# against `bars`.
bars_loaded = pl.read_parquet(output_path)
print(f"Verified: Loaded {len(bars_loaded):,} bars from parquet")
# Last expression of the cell: the notebook renders the first 5 rows.
bars_loaded.head(5)

## Summary

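- **Original ticks**: `len(df)` (printed by the aggregation cell in Section 2; Markdown cells do not evaluate `{...}` placeholders)
- **100-tick bars**: `len(bars)` (printed by the same cell)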
- **Data period**: 2025/10 (1 month)
- **Stock**: Toyota (7203)

Next: Feature Engineering (02_feature_engineering.ipynb)