# Dataset Exploration in `notebooks/`

This notebook helps you explore the `.ipynb` files stored in the `notebooks/` folder, treating each notebook as a dataset. You can list the available notebooks, load one of them, inspect its cell structure, and visualize any data embedded in its code cells.

## 1. List Available Datasets in Folder

Use Python to list all `.ipynb` files in the `notebooks/` directory.

In [None]:
import os
import glob

# Directory scanned for notebooks; resolved relative to the current working directory.
notebook_dir = 'notebooks'

# glob.glob() returns matches in an arbitrary, filesystem-dependent order.
# Sorting keeps the listing stable and makes positional selection such as
# ipynb_files[0] pick the same file on every run and every machine.
ipynb_files = sorted(glob.glob(os.path.join(notebook_dir, '*.ipynb')))

print('Available .ipynb files:')
for notebook_path in ipynb_files:
    # Show only the file name; the directory part is the same for every entry.
    print('-', os.path.basename(notebook_path))

## 2. Load Selected Dataset

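Load one of the listed `.ipynb` files with `nbformat`. By default the first file in the list is used; edit `selected_file` in the cell below to choose a different one.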

In [None]:
import nbformat

# Default to the first discovered notebook (change this line to pick another file).
# When nothing was discovered, selected_file stays None and `nb` is left undefined.
selected_file = None
if ipynb_files:
    selected_file = ipynb_files[0]

if not selected_file:
    print('No notebook files found to load.')
else:
    # Parse the file as a version-4 notebook document.
    with open(selected_file, 'r', encoding='utf-8') as f:
        nb = nbformat.read(f, as_version=4)
    print(f'Loaded notebook: {selected_file}')

## 3. Explore Dataset Structure

Parse the loaded notebook to identify code cells, markdown cells, and any embedded data.

In [None]:
if not selected_file:
    print('No notebook loaded.')
else:
    # Split the notebook's cells into the two kinds reported below.
    code_cells = [entry for entry in nb.cells if entry.cell_type == 'code']
    markdown_cells = [entry for entry in nb.cells if entry.cell_type == 'markdown']

    # Report how many cells of each kind were found.
    print(f'Code cells: {len(code_cells)}')
    print(f'Markdown cells: {len(markdown_cells)}')

    # Print the complete source of the first cell of each kind, when one exists.
    if code_cells:
        first_code = code_cells[0]
        print('First code cell:')
        print(first_code.source)
    if markdown_cells:
        first_markdown = markdown_cells[0]
        print('First markdown cell:')
        print(first_markdown.source)

## 4. Display Summary Statistics

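If the notebook contains data (e.g., pandas DataFrames), display summary statistics such as shape, columns, and basic descriptive stats. The loaded notebook's code is never executed: only DataFrames built from literal values (for example `pd.DataFrame({'a': [1, 2]})`) can be recovered.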

In [None]:
import pandas as pd
import ast

def _find_closing_bracket(source, open_idx):
    """Return the index of the bracket closing the one at ``open_idx``, or -1.

    The scan is a best-effort lexer: it tracks nesting of ``()``, ``[]`` and
    ``{}``, skips over single- and double-quoted string literals (honouring
    backslash escapes) and ignores ``#`` comments, so brackets inside strings
    or comments do not end the match early.
    """
    depth = 0
    quote = None
    i = open_idx
    while i < len(source):
        ch = source[i]
        if quote is not None:
            if ch == '\\':
                i += 1  # skip the escaped character
            elif ch == quote:
                quote = None
        elif ch in ('"', "'"):
            quote = ch
        elif ch == '#':
            # Jump to the end of the comment line.
            newline = source.find('\n', i)
            if newline == -1:
                return -1
            i = newline
        elif ch in '([{':
            depth += 1
        elif ch in ')]}':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def _dataframe_from_literal_args(arg_text):
    """Build a DataFrame from the text between a call's parentheses, or return None.

    Every positional and keyword argument must be a Python literal; anything
    else (names, calls, ``*``/``**`` unpacking of non-literals) makes the call
    unrecoverable and ``None`` is returned. Nothing is ever executed.
    """
    try:
        # Wrap the arguments in a dummy call so ``ast`` separates positional
        # arguments from keywords. The newlines keep a trailing comment in
        # ``arg_text`` from swallowing the closing parenthesis.
        call = ast.parse(f'_(\n{arg_text}\n)', mode='eval').body
        args = [ast.literal_eval(node) for node in call.args]
        kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        return None

    # An argument-less ``pd.DataFrame()`` carries no embedded data.
    if not args and 'data' not in kwargs:
        return None

    try:
        return pd.DataFrame(*args, **kwargs)
    except Exception:
        # Deliberate best-effort: literals pandas cannot turn into a frame
        # (ragged data, bad keywords, ...) are skipped rather than aborting
        # the whole exploration.
        return None


def extract_dataframes(code_cells):
    """Recover DataFrames that code cells construct from literal data.

    Each cell's source text is scanned for every ``pd.DataFrame(`` call. The
    call's arguments are located with bracket- and quote-aware matching, so
    nested tuples, lists, dicts and strings containing ``)`` are handled, and
    are then evaluated with ``ast.literal_eval`` only. The cell's code is
    never run.

    Args:
        code_cells: Iterable of objects exposing the cell text as ``.source``.

    Returns:
        A list of ``pandas.DataFrame`` objects in order of appearance. Calls
        whose arguments are not pure literals are silently skipped.
    """
    marker = 'pd.DataFrame('
    dfs = []
    for cell in code_cells:
        source = cell.source
        pos = source.find(marker)
        while pos != -1:
            open_idx = pos + len(marker) - 1
            close_idx = _find_closing_bracket(source, open_idx)
            if close_idx != -1:
                df = _dataframe_from_literal_args(source[open_idx + 1:close_idx])
                if df is not None:
                    dfs.append(df)
            # Resume right after this marker so later calls (including ones
            # nested inside this call's arguments) are still examined.
            pos = source.find(marker, pos + len(marker))
    return dfs

if not selected_file:
    print('No notebook loaded.')
else:
    # Recover every DataFrame the loaded notebook's code cells build from literals.
    dfs = extract_dataframes(code_cells)
    if not dfs:
        print('No DataFrames found in code cells.')
    # Summarise each recovered frame, numbering them from 1 (no-op when empty).
    for number, df in enumerate(dfs, start=1):
        print(f'DataFrame {number}:')
        print('Shape:', df.shape)
        print('Columns:', df.columns.tolist())
        print(df.describe(include="all"))

## 5. Visualize Dataset Features

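Generate basic visualizations for any data found within the notebook: a histogram (with a KDE overlay) for every numeric column of each recovered DataFrame, plus a scatter plot of the first two numeric columns when at least two exist.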

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

if not (selected_file and dfs):
    print('No data available for visualization.')
else:
    for number, df in enumerate(dfs, start=1):
        print(f'Visualizations for DataFrame {number}:')

        # Only numeric columns can be plotted; look them up once per frame.
        numeric_columns = df.select_dtypes(include=['number']).columns

        # One histogram (with KDE overlay) per numeric column.
        for col in numeric_columns:
            plt.figure(figsize=(6, 4))
            sns.histplot(df[col], kde=True)
            plt.title(f'Histogram of {col}')
            plt.show()

        # A scatter plot needs two numeric columns; otherwise move to the next frame.
        if len(numeric_columns) < 2:
            continue
        cols = numeric_columns[:2]
        x_name = cols[0]
        y_name = cols[1]
        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=df[x_name], y=df[y_name])
        plt.title(f'Scatter plot: {x_name} vs {y_name}')
        plt.show()