In [2]:
import pandas as pd
from pathlib import Path
import io

# --- Configuration ---
# Paths are relative to the 'notebooks' folder, where the script is expected
# to run: one level up to the project root, then down into the data folder.
BASE_PATH = Path('..') / 'data'

# Directory holding the Castilla-La Mancha community data.
CLM_DATA_PATH = BASE_PATH.joinpath('analytics', 'CCAA Castilla-La Mancha')

# Names of the analysis files to describe.
FILES_TO_ANALYZE = [
    'Nocturno_Mes_demographics_analysis.parquet',
    'Nocturno_Mes_origin_analysis.parquet',
    'Diario_Mes_origin_analysis.parquet',
    'Diario_Mes_demographics_analysis.parquet',
    'NocheEstancia_demographics_analysis.parquet',
    'NocheEstancia_origin_analysis.parquet',
]

def get_file_inspection_string(file_path, key_columns=None, max_unique=10) -> str:
    """
    Build a plain-text inspection report for a single parquet file.

    The report contains a banner with the file name, the summary written by
    ``DataFrame.info``, the first rows of the frame, and a sample of the
    distinct non-null values of each key column present in the file.

    Args:
        file_path: Location of the parquet file, as a ``pathlib.Path`` or a
            plain string.
        key_columns: Iterable of column names whose distinct values are
            listed. Columns missing from the file are skipped. Defaults to
            'provincia', 'origen', 'tipo_origen', 'grupo_edad' and 'genero'.
        max_unique: Maximum number of distinct values listed per column;
            when a column has more, the list ends with ``'...'``.

    Returns:
        The formatted report. If the file does not exist, or if reading or
        formatting it fails, a short message describing the problem is
        returned instead of raising.
    """
    # Accept plain strings as well as Path objects.
    if isinstance(file_path, str):
        file_path = Path(file_path)
    if key_columns is None:
        key_columns = ('provincia', 'origen', 'tipo_origen', 'grupo_edad', 'genero')

    if not file_path.exists():
        return f"### FILE NOT FOUND: {file_path}\n"

    # Deliberately broad: this is a best-effort inspection helper, so any
    # failure while reading or formatting is reported in the returned text
    # rather than aborting the inspection of the remaining files.
    try:
        df = pd.read_parquet(file_path)

        # Use a string buffer to capture output
        buffer = io.StringIO()

        buffer.write("="*80 + "\n")
        buffer.write(f"### FILE: {file_path.name}\n")
        buffer.write("="*80 + "\n\n")

        # --- DataFrame Info ---
        buffer.write("--- DataFrame Info ---\n")
        df.info(buf=buffer, verbose=False, memory_usage="deep")
        buffer.write("\n")

        # --- DataFrame Head ---
        buffer.write("--- DataFrame Head ---\n")
        buffer.write(df.head().to_string())
        buffer.write("\n\n")

        # --- Unique Values in Key Columns ---
        buffer.write("--- Unique Values in Key Columns ---\n")
        for col in key_columns:
            if col not in df.columns:
                continue

            # dropna() leaves a column without missing values unchanged, so
            # no separate null check is needed before calling unique().
            unique_values = df[col].dropna().unique()

            # Limit the number of unique values printed for brevity
            display_values = list(unique_values[:max_unique])
            if len(unique_values) > max_unique:
                display_values.append('...')
            buffer.write(f"Column '{col}': {display_values}\n")

        buffer.write("\n\n")
        return buffer.getvalue()

    except Exception as e:
        return f"Could not read or process file {file_path}. Error: {e}\n\n"

# --- Main Execution ---
# Gather the banner plus one inspection report per file, then join them once.
report_sections = [
    "### Starting Analysis of Representative Files from 'CCAA Castilla-La Mancha' Directory ###\n\n"
]

for file_name in FILES_TO_ANALYZE:
    full_path = CLM_DATA_PATH / file_name
    report_sections.append(get_file_inspection_string(full_path))

final_output_string = "".join(report_sections)

# Emit the consolidated report so the user can copy it in one go
print(final_output_string)

### Starting Analysis of Representative Files from 'CCAA Castilla-La Mancha' Directory ###

### FILE: Nocturno_Mes_demographics_analysis.parquet

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Columns: 15 entries, mes to volumen_genero_nr
dtypes: float64(10), int64(2), object(3)
memory usage: 78.8 KB

--- DataFrame Head ---
      mes etiquetadelmes      origen   categoriadelvisitante  volumen_total  volumen_edad_18-24  volumen_edad_25-34  volumen_edad_35-44  volumen_edad_45-54  volumen_edad_55-64  volumen_edad_65 o más  volumen_edad_<18  volumen_genero_h  volumen_genero_m  volumen_genero_nr
0  202206     2022 junio  Extranjero  Habitualmente presente          32205                 NaN                 NaN                 NaN                 NaN                 NaN                    NaN               NaN               NaN               NaN                NaN
1  202206     2022 junio  Extranjero               Residente         164493       