In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploratory Data Analysis\n",
    "\n",
    "This notebook performs an exploratory data analysis (EDA) of the input data before synthetic data is generated with Chronos-T5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Import necessary libraries\n",
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import yaml\n",
    "\n",
    "# Add the main directory to the path to import project modules\n",
    "sys.path.append('..')\n",
    "from src.data_preprocessing import DataPreprocessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Load the project configuration.\n",
    "# The encoding is pinned to UTF-8 so the YAML text is decoded the same way on\n",
    "# every platform instead of depending on the locale's default text encoding.\n",
    "with open('../config/config.yaml', 'r', encoding='utf-8') as file:\n",
    "    config = yaml.safe_load(file)\n",
    "\n",
    "# Echo the parsed configuration in block style, keeping the loaded key order\n",
    "print(\"Project configuration:\")\n",
    "print(yaml.dump(config, sort_keys=False, default_flow_style=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and Examine Data\n",
    "\n",
    "Next, we will load the input data and perform an exploratory analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Initialize the data preprocessor from the project configuration file\n",
    "preprocessor = DataPreprocessor('../config/config.yaml')\n",
    "\n",
    "# Input file (adjust as needed)\n",
    "input_file = \"your_data_file.csv\"\n",
    "\n",
    "try:\n",
    "    # Load data; only this call is guarded so unrelated errors are not masked\n",
    "    df = preprocessor.load_data(input_file)\n",
    "except FileNotFoundError:\n",
    "    print(f\"File '{input_file}' not found in {preprocessor.raw_path}\")\n",
    "    print(\"Please place your CSV file in the 'data/raw/' folder\")\n",
    "    # Re-raise so execution stops here with the real cause. Swallowing the error\n",
    "    # would leave `df` undefined and make the next cell fail with a NameError.\n",
    "    raise\n",
    "else:\n",
    "    print(f\"Data loaded with shape: {df.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Preview the top of the DataFrame (head's default of 5 rows, spelled out)\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Print a heading, then the summary produced by DataFrame.info()\n",
    "heading = \"\\nDataFrame Information:\"\n",
    "print(heading)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Descriptive statistics for every column, transposed so each column becomes a row\n",
    "df.describe(include='all').transpose()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Missing Values Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Per-column null tally, as an absolute count and as a percentage of all rows\n",
    "null_mask = df.isnull()\n",
    "missing_values = null_mask.sum()\n",
    "missing_percentage = missing_values / len(df) * 100\n",
    "\n",
    "# Combine both measures into one table, columns with the most nulls first\n",
    "missing_df = pd.DataFrame(\n",
    "    {'Missing Values': missing_values, 'Percentage': missing_percentage}\n",
    ").sort_values(by='Missing Values', ascending=False)\n",
    "\n",
    "# Show only the columns that contain at least one missing value\n",
    "has_missing = missing_df['Missing Values'] > 0\n",
    "missing_df.loc[has_missing]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Heatmap of the DataFrame's isnull() mask, drawn on an explicitly created axes\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis', ax=ax)\n",
    "ax.set_title('Missing Values Heatmap')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Distribution Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Collect the names of all numeric columns\n",
    "numeric_cols = df.select_dtypes(include=['number']).columns.tolist()\n",
    "\n",
    "# One figure per column: histogram with KDE on the left, boxplot on the right.\n",
    "# Only the first 5 numeric columns are drawn to avoid overloading the notebook.\n",
    "for col in numeric_cols[:5]:\n",
    "    # Both panels plot the column's non-null values\n",
    "    observed = df[col].dropna()\n",
    "    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "    sns.histplot(observed, kde=True, ax=hist_ax)\n",
    "    hist_ax.set_title(f'Histogram of {col}')\n",
    "\n",
    "    sns.boxplot(x=observed, ax=box_ax)\n",
    "    box_ax.set_title(f'Boxplot of {col}')\n",
    "\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Categorical Variable Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Identify categorical (or text) columns\n",
    "categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()\n",
    "\n",
    "# Chart and tabulate value frequencies for at most 5 categorical columns\n",
    "for col in categorical_cols[:5]:\n",
    "    # Only columns with fewer than 20 distinct values are shown\n",
    "    if df[col].nunique() >= 20:\n",
    "        continue\n",
    "\n",
    "    # Count each value once; the chart and the table below both reuse this Series\n",
    "    value_counts = df[col].value_counts()\n",
    "\n",
    "    # Bar chart of the frequencies\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    sns.barplot(x=value_counts.index, y=value_counts.values)\n",
    "    plt.title(f'Value Frequency in {col}')\n",
    "    plt.xticks(rotation=45, ha='right')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "    # Frequency table: absolute counts next to their share of the non-null values.\n",
    "    # counts / counts.sum() * 100 gives the same figures as\n",
    "    # value_counts(normalize=True) * 100 without re-scanning the column.\n",
    "    print(f\"\\nValue Distribution in '{col}':\")\n",
    "    percent = value_counts / value_counts.sum() * 100\n",
    "    freq_df = pd.DataFrame({'Frequency': value_counts, 'Percentage': percent})\n",
    "    display(freq_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Conclusions and Next Steps\n",
    "\n",
    "In this notebook, we have:\n",
    "1. Loaded and explored the input data\n",
    "2. Analyzed missing values, distributions, and outliers\n",
    "3. Preprocessed the data for use with Chronos-T5\n",
    "4. Split the data into training and test sets\n",
    "5. Prepared texts for synthetic data generation\n",
    "\n",
    "Next steps:\n",
    "- Run the synthetic data generation process with `main.py`\n",
    "- Evaluate the quality of the synthetic data using the `model_evaluation.ipynb` notebook"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
