In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import logging

# 로깅 설정
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 실행 시작 로깅
logger.info(f"Notebook execution started at {datetime.now()}")

# 더미 데이터 생성
np.random.seed(42)
n_samples = 1000

# 날짜 범위 생성
date_range = pd.date_range(start='2024-01-01', periods=n_samples)

# 더미 데이터프레임 생성
df = pd.DataFrame({
    'date': date_range,
    'value_1': np.random.normal(100, 15, n_samples),
    'value_2': np.random.exponential(50, n_samples),
    'category': np.random.choice(['A', 'B', 'C'], n_samples)
})

# 기본 통계 계산
stats = {
    'value_1_mean': df['value_1'].mean(),
    'value_1_std': df['value_1'].std(),
    'value_2_mean': df['value_2'].mean(),
    'value_2_std': df['value_2'].std(),
    'total_rows': len(df),
    'categories': df['category'].value_counts().to_dict()
}

INFO:__main__:Notebook execution started at 2024-10-26 03:09:43.252829


In [3]:
# 결과 로깅
logger.info("Statistical summary:")
for key, value in stats.items():
    logger.info(f"{key}: {value}")

# 일별 평균 계산
daily_stats = df.groupby('date').agg({
    'value_1': 'mean',
    'value_2': 'mean'
}).round(2)

logger.info("\nFirst 5 days of daily averages:")
logger.info(daily_stats.head())

# 결과를 반환하기 위한 dictionary
notebook_output = {
    'execution_time': datetime.now().isoformat(),
    'basic_stats': stats,
    'daily_stats_sample': daily_stats.head().to_dict(),
    'data_shape': df.shape
}


INFO:__main__:Statistical summary:
INFO:__main__:value_1_mean: 100.28998083733488
INFO:__main__:value_1_std: 14.688239072695135
INFO:__main__:value_2_mean: 50.39929668484337
INFO:__main__:value_2_std: 50.14942544926654
INFO:__main__:total_rows: 1000
INFO:__main__:categories: {'A': 357, 'C': 336, 'B': 307}
INFO:__main__:
First 5 days of daily averages:
INFO:__main__:            value_1  value_2
date                        
2024-01-01   107.45     9.17
2024-01-02    97.93     5.52
2024-01-03   109.72    50.59
2024-01-04   122.85    61.29
2024-01-05    96.49     1.60


In [4]:
# 변수로 결과 저장 (Airflow에서 받을 수 있도록)
notebook_output

{'execution_time': '2024-10-26T02:30:38.119360',
 'basic_stats': {'value_1_mean': 100.28998083733488,
  'value_1_std': 14.688239072695135,
  'value_2_mean': 50.39929668484337,
  'value_2_std': 50.14942544926654,
  'total_rows': 1000,
  'categories': {'A': 357, 'C': 336, 'B': 307}},
 'daily_stats_sample': {'value_1': {Timestamp('2024-01-01 00:00:00'): 107.45,
   Timestamp('2024-01-02 00:00:00'): 97.93,
   Timestamp('2024-01-03 00:00:00'): 109.72,
   Timestamp('2024-01-04 00:00:00'): 122.85,
   Timestamp('2024-01-05 00:00:00'): 96.49},
  'value_2': {Timestamp('2024-01-01 00:00:00'): 9.17,
   Timestamp('2024-01-02 00:00:00'): 5.52,
   Timestamp('2024-01-03 00:00:00'): 50.59,
   Timestamp('2024-01-04 00:00:00'): 61.29,
   Timestamp('2024-01-05 00:00:00'): 1.6}},
 'data_shape': (1000, 4)}