In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploratory Data Analysis for Time Series\n",
    "\n",
    "This notebook provides exploratory analysis for time series data before generating synthetic forecasts with Chronos-T5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Import required libraries\n",
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import yaml\n",
    "import h5py\n",
    "import torch\n",
    "\n",
    "# Add the project root directory to path for importing project modules\n",
    "sys.path.append('..')\n",
    "from src.data_preprocessing import DataPreprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Read the project settings from the YAML file one level above this notebook\n",
    "with open('../config/config.yaml', 'r') as config_stream:\n",
    "    config = yaml.safe_load(config_stream)\n",
    "\n",
    "# Echo the settings in block style without re-sorting the keys\n",
    "config_text = yaml.dump(config, sort_keys=False, default_flow_style=False)\n",
    "print(\"Project Configuration:\")\n",
    "print(config_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and Examine Data\n",
    "\n",
    "Next, we'll load the input data and perform exploratory analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Initialize the data preprocessor\n",
    "preprocessor = DataPreprocessor('../config/config.yaml')\n",
    "\n",
    "# Input file (adjust according to your case)\n",
    "input_file = \"your_data_file.csv\"  # or .hdf5, .xlsx, etc.\n",
    "\n",
    "try:\n",
    "    # Load data\n",
    "    df = preprocessor.load_data(input_file)\n",
    "except FileNotFoundError:\n",
    "    print(f\"File '{input_file}' not found in {preprocessor.raw_path}\")\n",
    "    print(\"Please place your data file in the 'data/raw/' folder\")\n",
    "    # Stop here: every later cell reads `df`, so continuing would either fail\n",
    "    # with a NameError on a fresh kernel or silently analyse a stale `df`\n",
    "    # left over from an earlier run.\n",
    "    raise\n",
    "else:\n",
    "    print(f\"Data loaded with shape: {df.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Display the first few rows\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# General information about the DataFrame\n",
    "print(\"\\nDataFrame Information:\")\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Descriptive statistics\n",
    "df.describe(include='all').T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HDF5 File Structure Analysis\n",
    "\n",
    "If working with HDF5 files, let's explore the internal structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def explore_hdf5_structure(file_path):\n",
    "    \"\"\"Print the groups, datasets and attributes found in an HDF5 file.\n",
    "\n",
    "    Args:\n",
    "        file_path: File name, resolved against ``preprocessor.raw_path``.\n",
    "            Names whose extension is not ``.h5``/``.hdf5`` (compared\n",
    "            case-insensitively) are reported and skipped.\n",
    "\n",
    "    Returns:\n",
    "        None. Everything is printed.\n",
    "    \"\"\"\n",
    "    # Lower-case the name first so upper-case extensions such as 'DATA.H5' are accepted\n",
    "    if not file_path.lower().endswith(('.h5', '.hdf5')):\n",
    "        print(f\"File {file_path} is not an HDF5 file.\")\n",
    "        return\n",
    "\n",
    "    full_path = os.path.join(preprocessor.raw_path, file_path)\n",
    "    if not os.path.exists(full_path):\n",
    "        print(f\"File {full_path} does not exist.\")\n",
    "        return\n",
    "\n",
    "    print(f\"Exploring structure of {full_path}\")\n",
    "\n",
    "    def print_attrs(name, obj):\n",
    "        \"\"\"Describe one visited object: path, type, data preview and attributes.\"\"\"\n",
    "        print(f\"\\nPath: {name}\")\n",
    "        print(f\"Type: {type(obj).__name__}\")\n",
    "\n",
    "        if isinstance(obj, h5py.Dataset):\n",
    "            print(f\"Shape: {obj.shape}\")\n",
    "            print(f\"Data type: {obj.dtype}\")\n",
    "            if len(obj.shape) == 0:\n",
    "                # Scalar datasets cannot be sliced; read the single value instead\n",
    "                print(f\"Data: {obj[()]}\")\n",
    "            elif obj.shape[0] < 5:\n",
    "                # Fewer than five leading entries: the slice returns all of them,\n",
    "                # without first reading the dataset through a full `obj[:]`\n",
    "                print(f\"Data: {obj[:5]}\")\n",
    "            else:\n",
    "                print(f\"First 5 elements: {obj[:5]}\")\n",
    "\n",
    "        if obj.attrs:\n",
    "            print(\"Attributes:\")\n",
    "            for key, val in obj.attrs.items():\n",
    "                print(f\"  {key}: {val}\")\n",
    "\n",
    "    with h5py.File(full_path, 'r') as f:\n",
    "        # Walk every group and dataset below the root\n",
    "        print(\"\\nHDF5 File Structure:\")\n",
    "        f.visititems(print_attrs)\n",
    "\n",
    "# Try to explore HDF5 file if it exists\n",
    "try:\n",
    "    explore_hdf5_structure(input_file)\n",
    "except Exception as e:\n",
    "    print(f\"Error exploring HDF5 file: {str(e)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Missing Value Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Null count per column and its share of all rows\n",
    "missing_values = df.isnull().sum()\n",
    "missing_percentage = (missing_values / len(df)) * 100\n",
    "\n",
    "# One row per column, ordered from most to fewest nulls\n",
    "missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})\n",
    "missing_df = missing_df.sort_values(by='Missing Values', ascending=False)\n",
    "\n",
    "# Show only the columns that contain at least one null\n",
    "has_missing = missing_df['Missing Values'] > 0\n",
    "missing_df.loc[has_missing]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Heatmap of the null mask, drawn on an explicitly created axes\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis', ax=ax)\n",
    "ax.set_title('Missing Values Heatmap')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Time Series Extraction and Analysis\n",
    "\n",
    "Let's extract and analyze potential time series from the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Columns with a numeric dtype are the candidates for time series\n",
    "numeric_cols = df.select_dtypes(include=['number']).columns.tolist()\n",
    "print(f\"Found {len(numeric_cols)} numeric columns that could contain time series data:\")\n",
    "\n",
    "# Preview at most ten names; the marker signals that the list was cut short\n",
    "truncation_marker = '...' if len(numeric_cols) > 10 else ''\n",
    "print(numeric_cols[:10], truncation_marker)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Visualize potential time series\n",
    "def visualize_time_series(df, columns, max_cols=5, samples=500):\n",
    "    \"\"\"Plot selected columns as line charts and print basic statistics for each.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame holding the candidate series.\n",
    "        columns: Column names to consider; only the first ``max_cols`` are used.\n",
    "        max_cols: Maximum number of columns to plot.\n",
    "        samples: Maximum number of points drawn per series. Longer series are\n",
    "            thinned to this many evenly spaced points.\n",
    "\n",
    "    Returns:\n",
    "        None. Figures are shown and statistics are printed.\n",
    "    \"\"\"\n",
    "    for col in columns[:max_cols]:\n",
    "        # Drop missing values once; the result feeds both the plot and the length\n",
    "        series = df[col].dropna()\n",
    "        values = series.values\n",
    "        positions = np.arange(len(values))\n",
    "\n",
    "        # Thin long series, keeping the position of every retained point so the\n",
    "        # x-axis still spans the whole series instead of 0..samples-1\n",
    "        if len(values) > samples:\n",
    "            positions = np.linspace(0, len(values) - 1, samples).astype(int)\n",
    "            values = values[positions]\n",
    "\n",
    "        plt.figure(figsize=(12, 5))\n",
    "        plt.plot(positions, values)\n",
    "        plt.title(f'Time Series: {col}')\n",
    "        plt.xlabel('Time')\n",
    "        plt.ylabel('Value')\n",
    "        plt.grid(True)\n",
    "        plt.show()\n",
    "\n",
    "        # Display basic statistics (pandas skips missing values by default)\n",
    "        print(f\"\\nStatistics for {col}:\")\n",
    "        print(f\"Length: {len(series)}\")\n",
    "        print(f\"Min: {df[col].min()}\")\n",
    "        print(f\"Max: {df[col].max()}\")\n",
    "        print(f\"Mean: {df[col].mean()}\")\n",
    "        print(f\"Std Dev: {df[col].std()}\")\n",
    "\n",
    "# Visualize first few numeric columns as potential time series\n",
    "visualize_time_series(df, numeric_cols, max_cols=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stationarity and Seasonality Analysis\n",
    "\n",
    "Let's check if these time series exhibit stationarity or seasonality."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "from statsmodels.tsa.stattools import adfuller\n",
    "from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n",
    "\n",
    "def analyze_stationarity(series, title):\n",
    "    \"\"\"Run an Augmented Dickey-Fuller test on a series and plot its ACF and PACF.\n",
    "\n",
    "    Args:\n",
    "        series: Object with a ``dropna()`` method (the notebook passes DataFrame\n",
    "            columns); missing values are removed before testing and plotting.\n",
    "        title: Label used in the printed report and in both plot titles.\n",
    "\n",
    "    Returns:\n",
    "        True when the ADF p-value is below 0.05, otherwise False.\n",
    "    \"\"\"\n",
    "    observed = series.dropna()\n",
    "\n",
    "    # Positions 0, 1 and 4 of the ADF result hold the statistic, the p-value\n",
    "    # and the critical values\n",
    "    adf_result = adfuller(observed)\n",
    "    adf_statistic = adf_result[0]\n",
    "    p_value = adf_result[1]\n",
    "    critical_values = adf_result[4]\n",
    "\n",
    "    print(f'\\nAugmented Dickey-Fuller Test for {title}:')\n",
    "    print(f'ADF Statistic: {adf_statistic:.4f}')\n",
    "    print(f'p-value: {p_value:.4f}')\n",
    "    print('Critical Values:')\n",
    "    for level, threshold in critical_values.items():\n",
    "        print(f'\\t{level}: {threshold:.4f}')\n",
    "\n",
    "    # A p-value under the 0.05 significance level is reported as stationary\n",
    "    is_stationary = p_value < 0.05\n",
    "    negation = \"\" if is_stationary else \" not\"\n",
    "    print(f'Series is{negation} stationary with 95% confidence')\n",
    "\n",
    "    # Autocorrelation on the left, partial autocorrelation on the right\n",
    "    fig, (acf_ax, pacf_ax) = plt.subplots(1, 2, figsize=(15, 5))\n",
    "    plot_acf(observed, ax=acf_ax, title=f'Autocorrelation Function for {title}')\n",
    "    plot_pacf(observed, ax=pacf_ax, title=f'Partial Autocorrelation Function for {title}')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    return is_stationary\n",
    "\n",
    "# Analyze stationarity of first few numeric columns\n",
    "for col in numeric_cols[:2]:  # Limit to 2 columns for brevity\n",
    "    try:\n",
    "        is_stationary = analyze_stationarity(df[col], col)\n",
    "    except Exception as e:\n",
    "        print(f\"Error analyzing stationarity for {col}: {str(e)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare Time Series for Chronos-T5\n",
    "\n",
    "Now, let's prepare the time series for use with Chronos-T5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Preprocess data\n",
    "output_name = \"preprocessed_from_notebook.csv\"\n",
    "\n",
    "try:\n",
    "    processed_df = preprocessor.preprocess_data(df, output_name)\n",
    "except Exception as e:\n",
    "    print(f\"Error during preprocessing: {str(e)}\")\n",
    "    # Stop here: the next cell reads `processed_df`, so continuing would either\n",
    "    # fail with a NameError on a fresh kernel or silently reuse a stale result\n",
    "    # from an earlier run.\n",
    "    raise\n",
    "else:\n",
    "    print(f\"Data preprocessed with shape: {processed_df.shape}\")\n",
    "    print(f\"Data saved to {preprocessor.processed_path}/{output_name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Extract time series for Chronos-T5\n",
    "time_series_data = preprocessor.prepare_for_chronos(processed_df)\n",
    "print(f\"Extracted {len(time_series_data)} time series for Chronos-T5\")\n",
    "\n",
    "# Summarize the first five series: number of points and value range\n",
    "for i, series in enumerate(time_series_data[:5]):\n",
    "    lowest = series.min().item()\n",
    "    highest = series.max().item()\n",
    "    print(f\"Series {i}: {len(series)} points, range: [{lowest:.2f}, {highest:.2f}]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Plot the first three extracted series, one figure each\n",
    "for i, series in enumerate(time_series_data[:3]):\n",
    "    fig, ax = plt.subplots(figsize=(12, 5))\n",
    "    ax.plot(series.numpy())\n",
    "    ax.set_title(f'Time Series {i} for Chronos-T5')\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Value')\n",
    "    ax.grid(True)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test Chronos-T5 Model\n",
    "\n",
    "Let's run a quick test with a sample series to verify that the model works."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "from chronos import ChronosPipeline\n",
    "\n",
    "def test_chronos_model(time_series, prediction_length=24):\n",
    "    \"\"\"Load Chronos-T5-Small, forecast one series and plot the result.\n",
    "\n",
    "    Args:\n",
    "        time_series: Context series; must support ``.numpy()`` (the notebook\n",
    "            passes one element of ``time_series_data``).\n",
    "        prediction_length: Number of future steps to forecast.\n",
    "\n",
    "    Returns:\n",
    "        The sampled forecasts from ``pipeline.predict`` converted with\n",
    "        ``.numpy()``, or None if any step raised an exception (the error is\n",
    "        printed instead of propagated).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        print(\"Loading Chronos-T5-Small model...\")\n",
    "        pipeline = ChronosPipeline.from_pretrained(\n",
    "            \"amazon/chronos-t5-small\",\n",
    "            device_map=\"auto\",\n",
    "            torch_dtype=torch.float32\n",
    "        )\n",
    "\n",
    "        print(f\"Generating forecast for {prediction_length} future steps...\")\n",
    "        forecast = pipeline.predict(\n",
    "            time_series,\n",
    "            prediction_length,\n",
    "            num_samples=10,  # Number of trajectories to sample\n",
    "            temperature=0.8\n",
    "        )\n",
    "\n",
    "        print(f\"Forecast generated with shape: {forecast.shape}\")\n",
    "\n",
    "        forecast_np = forecast.numpy()\n",
    "\n",
    "        # Chronos returns [num_series, num_samples, prediction_length]. Select the\n",
    "        # single series first so the statistics reduce over the samples and have\n",
    "        # length prediction_length, matching forecast_idx below. Reducing the raw\n",
    "        # 3-D array over axis 0 would collapse the series axis instead.\n",
    "        sample_paths = forecast_np[0] if forecast_np.ndim == 3 else forecast_np\n",
    "        median_forecast = np.median(sample_paths, axis=0)\n",
    "        lower_bound = np.percentile(sample_paths, 10, axis=0)\n",
    "        upper_bound = np.percentile(sample_paths, 90, axis=0)\n",
    "\n",
    "        # Visualize forecast\n",
    "        plt.figure(figsize=(12, 6))\n",
    "\n",
    "        # Historical data\n",
    "        historical = time_series.numpy()\n",
    "        plt.plot(range(len(historical)), historical, color='blue', label='Historical Data')\n",
    "\n",
    "        # Forecast, drawn directly after the last historical point\n",
    "        forecast_idx = range(len(historical), len(historical) + prediction_length)\n",
    "        plt.plot(forecast_idx, median_forecast, color='red', label='Forecast (median)')\n",
    "        plt.fill_between(forecast_idx, lower_bound, upper_bound, color='red', alpha=0.3, label='80% Prediction Interval')\n",
    "\n",
    "        plt.title('Time Series Forecast with Chronos-T5')\n",
    "        plt.grid(True)\n",
    "        plt.legend()\n",
    "        plt.show()\n",
    "\n",
    "        return forecast_np\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"Error testing Chronos model: {str(e)}\")\n",
    "        return None\n",
    "\n",
    "# Test with the first time series (if available)\n",
    "if time_series_data and len(time_series_data) > 0:\n",
    "    sample_series = time_series_data[0]\n",
    "    forecast = test_chronos_model(sample_series, prediction_length=24)\n",
    "else:\n",
    "    print(\"No time series available for testing\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusions and Next Steps\n",
    "\n",
    "In this notebook, we have:\n",
    "1. Loaded and explored the input data\n",
    "2. Analyzed potential time series for stationarity and seasonality\n",
    "3. Preprocessed the data for use with Chronos-T5\n",
    "4. Extracted time series and visualized them\n",
    "5. Tested the Chronos-T5 model on a sample time series\n",
    "\n",
    "Next steps:\n",
    "- Run the full synthetic data generation process with `main.py`\n",
    "- Evaluate the quality of the synthetic forecasts with the `model_evaluation.ipynb` notebook"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}