# Data Quality Check: INE vs. Mobile Data Comparison
**Objective:** Compare hotel occupancy data from Spain's National Statistics Institute (INE) with anonymized mobile phone data on night stays in Castilla-La Mancha, in order to validate the mobile dataset.

In [None]:
# --- Cell 2: Code ---
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import os
import sys

# Use Plotly's white-background template as the default for every figure
# created in this notebook.
pio.templates.default = "plotly_white"

# Add the scripts directory to the import path so the helper module can be
# imported. The path is relative to the notebook's working directory; adjust
# it if your project structure differs.
# The membership test keeps sys.path free of duplicate entries when this cell
# is re-executed within the same kernel session.
SCRIPTS_DIR = '../scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)
import hotel_data_helpers as helpers

# --- Configuration ---
# Forwarded to the helper functions as their `debug` flag.
# Set to False to hide detailed dataframe printouts during execution.
DEBUG = True


In [None]:
# --- Cell 3: Markdown ---
# ## 1. Load and Process Datasets
#
# We will now load the two datasets:
# 1.  **INE Data:** `INE_EOH_Viajeros_pernoctaciones_mensual.csv` - Contains monthly traveler numbers in hotels.
# 2.  **Mobile Data:** `Nocturno_Mes_demographics_analysis.parquet` - Contains monthly night-stay data from mobile devices for Castilla-La Mancha.

# --- Cell 4: Code ---
# Input locations, relative to the notebook's working directory.
# (Adjacent string literals are joined at compile time, so each path is a
# single string.)
ine_file_path = (
    '../data/raw/ine_data/'
    'INE_EOH_Viajeros_pernoctaciones_mensual.csv'
)
mobile_file_path = (
    '../data/analytics/CCAA Castilla-La Mancha/'
    'Nocturno_Mes_demographics_analysis.parquet'
)

# Read each source through its dedicated helper, INE first and mobile second.
# DEBUG is passed through as the helpers' `debug` flag, which is expected to
# make them print intermediate steps and dataframes.
df_ine = helpers.load_ine_data(
    ine_file_path,
    debug=DEBUG,
)
df_mobile = helpers.load_mobile_data(
    mobile_file_path,
    debug=DEBUG,
)

## 2. Merge Data and Extrapolate INE Figures

With both datasets loaded and cleaned, we merge them based on date and residency. We also apply an **extrapolation factor** to the INE data.

### Reasoning for Extrapolation

The INE data only counts travelers staying in hotels. The mobile data captures anyone spending the night, regardless of accommodation type (hotels, rentals, family homes, etc.). To make a fair comparison, we need to estimate the *total* number of travelers from the hotel data.

Based on regional tourism statistics ("Plan Estratégico de Turismo de Castilla-La Mancha"), travelers staying in hotels represent approximately **65%** of all travelers. Therefore, we will extrapolate the INE numbers by dividing by 0.65 (equivalently, multiplying by 1/0.65 ≈ 1.54) to estimate the total market size.

In [None]:
# --- Cell 6: Code ---
# Merge the two datasets; per the notebook text, the helper also applies the
# extrapolation factor to the INE figures.
df_comparison = helpers.merge_and_prepare_data(df_ine, df_mobile, debug=DEBUG)

# Show a short preview only when DEBUG is off (in debug mode the helper is
# expected to have printed its own output already).
# A `'df_comparison' in locals()` test would always be true right after the
# assignment above, so it is replaced by an explicit None check: if the helper
# signals failure by returning None (NOTE(review): confirm against the helper),
# calling `.head()` would otherwise raise AttributeError.
if not DEBUG and df_comparison is not None:
    print("--- Final Comparison DataFrame ---")
    display(df_comparison.head())

## 3. Visualize the Comparison

The final step is to create a professional bar chart that clearly compares the datasets. The chart will feature:

- **Stacked Bars for Mobile Data:** Showing the individual contributions of 'Tourists' and 'Frequently Present' individuals.
- **Grouped Bars:** Comparing the total mobile data against the extrapolated INE data, month by month.
- **Separate Figures:** One chart per residency category (e.g., Spanish residents and foreign residents) for clearer analysis.

In [None]:
# --- Cell 8: Code ---
def _build_comparison_figure(df_res, res):
    """Build the mobile-vs-INE bar chart for a single residency category.

    Args:
        df_res: Rows of the comparison dataframe for one ``residencia`` value.
            The columns ``fecha``, ``turistas_mobile``, ``frecuentes_mobile``
            and ``viajeros_ine_extrapolado`` are read from it.
        res: The residency label; it is interpolated into the chart title.

    Returns:
        A ``go.Figure`` with three bar traces: the two mobile-data series
        drawn as one stacked bar, and the extrapolated INE series drawn as a
        second bar next to it.
    """
    fig = go.Figure()

    # --- Mobile data (stacked) ---
    # Both mobile traces share offsetgroup 0, so they occupy the same bar slot.
    # 'Tourists' forms the bottom part of the stack.
    fig.add_trace(go.Bar(
        x=df_res['fecha'],
        y=df_res['turistas_mobile'],
        name='Mobile Data: Tourists',
        marker_color='#1f77b4',  # Blue
        offsetgroup=0,
    ))

    # 'Frequently Present' starts where the tourists bar ends (via `base`),
    # which stacks it on top of the 'turistas_mobile' bar.
    fig.add_trace(go.Bar(
        x=df_res['fecha'],
        y=df_res['frecuentes_mobile'],
        name='Mobile Data: Frequently Present',
        marker_color='#aec7e8',  # Light blue
        offsetgroup=0,
        base=df_res['turistas_mobile'],
    ))

    # --- INE data (grouped) ---
    # A different offsetgroup places this bar beside the mobile stack.
    fig.add_trace(go.Bar(
        x=df_res['fecha'],
        y=df_res['viajeros_ine_extrapolado'],
        name='INE Data (Extrapolated Total)',
        marker_color='#ff7f0e',  # Orange
        offsetgroup=1,
    ))

    # --- Layout ---
    fig.update_layout(
        title=f'<b>Monthly Travelers in Castilla-La Mancha ({res})</b><br>Mobile Data vs. Extrapolated INE Data',
        xaxis_title='Month',
        yaxis_title='Number of Travelers',
        barmode='group',  # Grouped bars for Mobile vs INE
        legend_title='Data Source',
        xaxis=dict(tickformat='%b %Y'),  # Format x-axis dates for readability
        legend=dict(
            x=1.02,
            y=1,
            traceorder='normal',
            bgcolor='rgba(255, 255, 255, 0.5)',
            bordercolor='rgba(0, 0, 0, 0.5)',
            borderwidth=1,
        ),
        yaxis_gridcolor='lightgrey',
    )
    return fig


# Only plot when the comparison dataframe exists, is not None and has rows.
# - `in locals()` covers running this cell before the merge cell has succeeded.
# - `is not None` covers a merge helper that reports failure by returning None
#   (NOTE(review): confirm against the helper); without it, `.empty` would
#   raise AttributeError instead of reaching the message below.
if (
    'df_comparison' in locals()
    and df_comparison is not None
    and not df_comparison.empty
):
    # Sort chronologically so the bars appear in date order.
    df_plot = df_comparison.sort_values('fecha')

    # One figure is produced per distinct residency value.
    residencies = df_plot['residencia'].unique()

    for res in residencies:
        df_res = df_plot[df_plot['residencia'] == res]
        fig = _build_comparison_figure(df_res, res)
        fig.show()
else:
    print("Comparison DataFrame could not be created. Please check the loading and merging steps for errors.")