In [None]:

# Parameters
# Run-level inputs for the analysis. Each assignment keeps its trailing
# `# @param` marker on the same line.
# NOTE(review): the markers are presumably consumed by notebook tooling
# (form fields / parameter injection) — confirm before reformatting lines.

# First calendar day of the generated date range (passed to pd.date_range).
start_date = "2024-01-01"  # @param
# Number of consecutive days to generate, starting at start_date.
days = 30  # @param
# Total sample budget, split across the generated days.
sample_size = 1000  # @param
# Random seed for the run; echoed under parameters_used in the final output.
random_seed = 42  # @param
# Category label -> selection probability used when drawing the 'category'
# column; the values are passed as `p=` to the random choice call.
category_weights = {'A': 0.4, 'B': 0.3, 'C': 0.3}  # @param


In [None]:

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging

# The cells from here on are ordinary code cells.
# Configure logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info(f"Analysis started at {datetime.now()}")
logger.info(f"Parameters received: start_date={start_date}, days={days}, sample_size={sample_size}")

# Fail early with a clear message instead of an opaque error further down
# (an empty date range would otherwise surface as a pd.concat failure).
if days <= 0:
    raise ValueError(f"days must be a positive integer, got {days!r}")
if sample_size < 0:
    raise ValueError(f"sample_size must be non-negative, got {sample_size!r}")

# Seed a dedicated generator so that the same random_seed reproduces the same
# dataset, without touching NumPy's global random state.
rng = np.random.default_rng(random_seed)

# Build the date range: `days` consecutive days starting at start_date.
date_range = pd.date_range(start=start_date, periods=days)

# Split sample_size across the days. Plain integer division would silently
# drop the remainder, so the first `extra_samples` days get one extra row and
# the total row count equals sample_size exactly.
base_samples, extra_samples = divmod(sample_size, days)

# Loop-invariant inputs for the category draw.
category_labels = list(category_weights.keys())
category_probabilities = list(category_weights.values())

# Generate dummy data, one frame per day.
data = []
for day_index, date in enumerate(date_range):
    daily_samples = base_samples + (1 if day_index < extra_samples else 0)

    daily_data = pd.DataFrame({
        'date': [date] * daily_samples,
        # value_1 ~ Normal(mean=100, std=15)
        'value_1': rng.normal(100, 15, daily_samples),
        # value_2 ~ Exponential(scale=50)
        'value_2': rng.exponential(50, daily_samples),
        # category drawn according to category_weights
        'category': rng.choice(
            category_labels,
            daily_samples,
            p=category_probabilities,
        ),
    })
    data.append(daily_data)

# Combine the per-day frames into a single dataset.
df = pd.concat(data, ignore_index=True)
logger.info(f"Generated dataset shape: {df.shape}")


In [None]:

# Overall summary statistics for the generated dataset.
value_1_col = df['value_1']
value_2_col = df['value_2']
category_counts = df['category'].value_counts().to_dict()

stats = {
    'value_1_mean': value_1_col.mean(),
    'value_1_std': value_1_col.std(),
    'value_2_mean': value_2_col.mean(),
    'value_2_std': value_2_col.std(),
    'total_rows': len(df),
    'categories': category_counts,
}

# Log every summary entry on its own line.
logger.info("Statistical summary:")
for key, value in stats.items():
    logger.info(f"{key}: {value}")

# Per-day mean and standard deviation of both value columns.
spread_aggs = ['mean', 'std']
daily_grouped = df.groupby('date')
daily_stats = daily_grouped.agg({
    'value_1': spread_aggs,
    'value_2': spread_aggs,
}).round(2)

logger.info("\nFirst 5 days of daily averages:")
logger.info(daily_stats.head())

# Per-category breakdown; value_1 additionally reports the row count.
category_grouped = df.groupby('category')
category_analysis = category_grouped.agg({
    'value_1': spread_aggs + ['count'],
    'value_2': spread_aggs,
}).round(2)

logger.info("\nCategory analysis:")
logger.info(category_analysis)


# Outlier detection: rows lying more than three standard deviations
# away from the column mean.
def _three_sigma_outliers(column_name):
    """Return the rows of df whose `column_name` value is > 3 std from its mean."""
    column = df[column_name]
    distance_from_mean = (column - column.mean()).abs()
    return df[distance_from_mean > (3 * column.std())]


value1_outliers = _three_sigma_outliers('value_1')
value2_outliers = _three_sigma_outliers('value_2')

logger.info(f"\nFound {len(value1_outliers)} outliers in value_1")
logger.info(f"Found {len(value2_outliers)} outliers in value_2")


In [None]:
# Last cell of the notebook
import math

from IPython.display import JSON, display


def _json_key(key):
    """Return a JSON-compatible dictionary key for `key`.

    Tuple keys (produced by to_dict() on frames with MultiIndex columns,
    e.g. ('value_1', 'mean')) are joined with '_'; datetime-like keys become
    ISO-8601 strings; any other non-primitive key is stringified.
    """
    if isinstance(key, tuple):
        return "_".join(str(part) for part in key)
    if isinstance(key, datetime):
        return key.isoformat()
    if key is None or isinstance(key, (str, int, float, bool)):
        return key
    return str(key)


def _to_jsonable(value):
    """Recursively convert `value` into plain JSON-native Python types.

    Handles nested dicts/lists/tuples, NumPy scalars, datetime-like values
    and NaN (mapped to None, since strict JSON has no NaN literal).
    """
    if isinstance(value, dict):
        return {_json_key(k): _to_jsonable(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(item) for item in value]
    if isinstance(value, np.generic):
        # Unwrap NumPy scalars to their native Python equivalents.
        value = value.item()
    if isinstance(value, datetime):
        return value.isoformat()
    if isinstance(value, float) and math.isnan(value):
        # e.g. the std of a group that contains a single row
        return None
    return value


# Dictionary used to return the results. The raw to_dict() outputs contain
# tuple and Timestamp keys, which JSON encoding rejects, so the whole payload
# is normalised to JSON-native types before it is displayed.
notebook_output = _to_jsonable({
    'execution_info': {
        'execution_time': datetime.now().isoformat(),
        'parameters_used': {
            'start_date': start_date,
            'days': days,
            'sample_size': sample_size,
            'random_seed': random_seed,
            'category_weights': category_weights,
        },
    },
    'basic_stats': stats,
    'daily_stats_sample': daily_stats.head().to_dict(),
    'category_analysis': category_analysis.to_dict(),
    'outliers_summary': {
        'value1_outliers_count': len(value1_outliers),
        'value2_outliers_count': len(value2_outliers),
    },
    'data_shape': df.shape,
})

# Emit as JSON (per the original author's note, this output is what
# papermill captures).
display(JSON(notebook_output))