# Phase 2: Code Structuring

This notebook organizes the analysis code around the actual layout of the data in `Data/Molise.dta`.

## Key Findings from Phase 1:
- **region_res**: 12 = Molise, 2 = Basilicata (need to check if Basilicata data exists)
- **type**: 1 = Private, 2 = Public, 3 = Self-employed, 4 = Non-employed
- **Years**: 1985-2019 (covers required 1997-2007 period)
- **Key variables**: id_worker, year, wage, contract_type, sector_12cat, gender, year_birth

## Tasks:
1. Map actual variable names to analysis variables
2. Create helper functions for data processing
3. Design estimation wrappers
4. Set up directory structure
5. Create reusable modules in /src/


In [None]:
import pandas as pd
import numpy as np
import pyreadstat
from pathlib import Path
import sys

# Set up paths (everything is resolved relative to the current working directory)
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "Data"
SRC_DIR = BASE_DIR / "src"
OUT_DIR = BASE_DIR / "out"
DERIVED_DIR = BASE_DIR / "data" / "derived"

# Create src directory if it doesn't exist
SRC_DIR.mkdir(parents=True, exist_ok=True)

# The generated modules are imported as a package further down
# (``from src.io import ...``) and robustness.py uses a package-relative
# import (``from .models import did``), so the *parent* of src/ has to be
# importable. SRC_DIR itself stays on the path for backward compatibility.
# BASE_DIR is inserted last so that it ends up in front of SRC_DIR.
for _import_root in (str(SRC_DIR), str(BASE_DIR)):
    if _import_root not in sys.path:
        sys.path.insert(0, _import_root)

# Set random seed
np.random.seed(42)

print(f"Base directory: {BASE_DIR}")
print(f"Source directory: {SRC_DIR}")


In [None]:
# Read the raw Stata file once so the mapping below can be checked against it
df, meta = pyreadstat.read_dta(DATA_DIR / "Molise.dta")

banner = "=" * 80
print(banner)
print("VARIABLE MAPPING")
print(banner)

# Analysis-side name -> column name in the raw file.
# A value of None marks a variable that is derived later rather than read.
VAR_MAP = dict(
    # Identifiers
    person_id='id_worker',
    year='year',
    firm_id='id_firm',

    # Demographics
    gender='gender',  # 0=Male, 1=Female
    year_birth='year_birth',
    age=None,  # Will compute from year - year_birth

    # Geography
    region_residence='region_res',  # 12=Molise, 2=Basilicata

    # Worker classification
    worker_type_raw='type',  # 1=Private, 2=Public, 3=Self, 4=Non-employed

    # Employment
    contract_type='contract_type',  # 1=Permanent, 2=Temporary, 3=Seasonal
    sector='sector_12cat',
    occupation='occupation',
    firm_size='firm_dimension',

    # Outcomes
    wage='wage',  # Monthly wage (object type, needs conversion)
    working_weeks='working_weeks',  # Weeks worked
    part_time='part_time',
    part_time_fraction='part_time_fraction',

    # Dates
    date_start='date_start',
    date_end='date_end',
)

print("Variable mapping defined:")
for analysis_name, raw_name in VAR_MAP.items():
    shown = raw_name if raw_name else 'computed'
    print(f"  {analysis_name}: {shown}")

# Check if Basilicata data exists
region_codes = df['region_res']
print("\n\nRegion distribution:")
print(region_codes.value_counts())
print(f"\nUnique regions: {region_codes.unique()}")


In [None]:
# Create the io.py module
io_code = '''"""
Data I/O functions for Molise earthquake analysis.
"""

import pandas as pd
import pyreadstat
from pathlib import Path
import numpy as np


def load_raw(data_file=None, base_dir=None):
    """
    Load the raw Stata data file.

    pyreadstat is tried first. If it fails for any reason the file is read
    with pandas instead; in that case value labels are NOT applied, so coded
    columns (e.g. ``type``, ``region_res``) keep their numeric codes, which is
    what the code maps in build.py expect.

    Parameters
    ----------
    data_file : str or Path, optional
        Path to .dta file. If None, uses Data/Molise.dta relative to base_dir.
    base_dir : str or Path, optional
        Base directory. If None, uses current working directory.

    Returns
    -------
    df : DataFrame
        Loaded data
    meta : object or None
        The metadata object returned by pyreadstat (value labels, etc.), or
        None when the pandas fallback was used.

    Raises
    ------
    FileNotFoundError
        If the resolved data file does not exist.
    """
    base_dir = Path.cwd() if base_dir is None else Path(base_dir)

    if data_file is None:
        data_file = base_dir / "Data" / "Molise.dta"
    else:
        data_file = Path(data_file)

    # Fail early with a clear message instead of reporting a reader failure
    # and then failing again inside the fallback.
    if not data_file.is_file():
        raise FileNotFoundError(f"Stata file not found: {data_file}")

    try:
        df, meta = pyreadstat.read_dta(str(data_file))
        print(f"Loaded {len(df):,} rows using pyreadstat")
    except Exception as e:
        # Deliberately broad: any reader problem should trigger the fallback.
        print(f"pyreadstat failed: {e}, trying pandas...")
        # convert_categoricals=False keeps the raw codes so both readers
        # hand the same kind of values to downstream code.
        df = pd.read_stata(data_file, convert_categoricals=False)
        meta = None
        print(f"Loaded {len(df):,} rows using pandas")

    return df, meta


def write_parquet(df, path, partition_by=None):
    """
    Write DataFrame to Parquet format.

    With ``partition_by`` set to an existing column, one file per distinct
    value is written next to ``path``, named
    ``<stem>_<partition_by>=<value><suffix>``. Rows whose partition value is
    missing are written to their own file rather than being dropped. If
    ``partition_by`` is None or not a column of ``df``, a single file is
    written to ``path``.

    Parameters
    ----------
    df : DataFrame
        Data to write
    path : str or Path
        Output path
    partition_by : str, optional
        Column name to partition by (e.g., 'year')
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    if partition_by and partition_by in df.columns:
        # dropna=False: an equality filter never matches NaN, so those rows
        # would otherwise be lost. sort=False keeps first-appearance order,
        # observed=True avoids empty files for unused categorical levels.
        groups = df.groupby(partition_by, dropna=False, sort=False, observed=True)
        for val, subset in groups:
            part_path = path.parent / f"{path.stem}_{partition_by}={val}{path.suffix}"
            subset.to_parquet(part_path, index=False)
        print(f"Wrote partitioned parquet files to {path.parent}")
    else:
        df.to_parquet(path, index=False)
        print(f"Wrote parquet file to {path}")


def read_parquet(path):
    """Load the Parquet file at ``path`` and return it as a DataFrame."""
    frame = pd.read_parquet(path)
    return frame
'''

# Write the module with an explicit encoding so the generated file does not
# depend on the platform's default locale (Python reads source as UTF-8).
with open(SRC_DIR / "io.py", "w", encoding="utf-8") as f:
    f.write(io_code)
print("Created src/io.py")


In [None]:
# Create the build.py module
build_code = '''"""
Variable construction functions for Molise earthquake analysis.
"""

import pandas as pd
import numpy as np
from scipy.stats import mstats


def construct_worker_type(df, type_col='type'):
    """
    Construct worker_type categorical variable.

    Parameters
    ----------
    df : DataFrame
        Input data
    type_col : str
        Column name for raw worker type (1=Private, 2=Public, 3=Self,
        4=Non-employed). Codes may be stored as integers, floats or numeric
        strings; anything else becomes missing.

    Returns
    -------
    pandas.Categorical
        Unordered categorical with categories 'public', 'private', 'self',
        'non_employed' (one value per row of df, NaN for unknown codes).
    """
    worker_type_map = {
        1: 'private',
        2: 'public',
        3: 'self',
        4: 'non_employed'
    }

    # Coerce first so that string-coded columns ('1', '2', ...) are mapped
    # instead of silently turning into all-missing values.
    codes = pd.to_numeric(df[type_col], errors='coerce')
    labels = codes.map(worker_type_map)

    return pd.Categorical(
        labels,
        categories=['public', 'private', 'self', 'non_employed'],
        ordered=False,
    )


def _winsorize_p1_p99(values):
    """Clip a numeric Series to its own 1st and 99th percentiles."""
    return values.clip(lower=values.quantile(0.01), upper=values.quantile(0.99))


def make_outcomes(df, wage_col='wage', contract_col='contract_type',
                  working_weeks_col='working_weeks', year_col='year'):
    """
    Construct outcome variables.

    Parameters
    ----------
    df : DataFrame
        Input data
    wage_col : str
        Column name for wage (coerced to numeric; unparseable values become NaN)
    contract_col : str
        Column name for contract type (1 = permanent). Optional in df.
    working_weeks_col : str
        Column name for working weeks. Optional in df.
    year_col : str
        Column name for year (currently unused; kept for interface stability)

    Returns
    -------
    DataFrame
        Indexed like df, with columns emp_prob, earnings_asinh, wage_asinh,
        contract_perm and contract_duration_days.
    """
    outcomes = pd.DataFrame(index=df.index)

    # Wage may be stored as text, so always coerce.
    wage_numeric = pd.to_numeric(df[wage_col], errors='coerce')

    # The weeks column is optional everywhere below, not only for earnings.
    has_weeks = working_weeks_col in df.columns
    if has_weeks:
        weeks = pd.to_numeric(df[working_weeks_col], errors='coerce')

    # Employment indicator: positive wage or positive weeks worked.
    # Comparisons with NaN are False, so missing values count as not employed.
    employed = wage_numeric > 0
    if has_weeks:
        employed = employed | (weeks > 0)
    outcomes['emp_prob'] = employed.astype(int)

    # Annualized earnings: monthly wage * 12, scaled by the share of the year
    # worked when weeks are known (missing weeks are treated as a full year).
    if has_weeks:
        earnings_annual = wage_numeric * (weeks.fillna(52) / 52) * 12
    else:
        earnings_annual = wage_numeric * 12

    # Winsorize at p1-p99, then apply the inverse hyperbolic sine.
    outcomes['earnings_asinh'] = np.arcsinh(_winsorize_p1_p99(earnings_annual))
    outcomes['wage_asinh'] = np.arcsinh(_winsorize_p1_p99(wage_numeric))

    # Permanent contract indicator (missing contract type counts as 0).
    if contract_col in df.columns:
        contract_code = pd.to_numeric(df[contract_col], errors='coerce')
        outcomes['contract_perm'] = (contract_code == 1).astype(int)
    else:
        outcomes['contract_perm'] = 0

    # Contract duration in days when both dates are available.
    # NOTE(review): spells with a missing start or end date get duration 0,
    # which is indistinguishable from a same-day spell - confirm intended.
    outcomes['contract_duration_days'] = 0
    if 'date_start' in df.columns and 'date_end' in df.columns:
        try:
            date_start = pd.to_datetime(df['date_start'], errors='coerce')
            date_end = pd.to_datetime(df['date_end'], errors='coerce')
            duration = (date_end - date_start).dt.days
        except (TypeError, ValueError, OverflowError, AttributeError):
            # Best effort: unusable date columns leave the default of 0.
            pass
        else:
            outcomes['contract_duration_days'] = duration.fillna(0)

    return outcomes


def make_flags(df, person_id_col='id_worker', year_col='year',
               region_col='region_res', municipality_col=None):
    """
    Build row-level sample flags.

    Parameters
    ----------
    df : DataFrame
        Input data (one row per person-year)
    person_id_col : str
        Person identifier column
    year_col : str
        Year column
    region_col : str
        Region column (not used by the current flags)
    municipality_col : str, optional
        Municipality column; when absent every row is flagged as a stayer

    Returns
    -------
    DataFrame
        Indexed like df with columns is_balanced_97_07, is_stayer_residence
        and is_border_municipality.
    """
    ids = df[person_id_col]
    out = pd.DataFrame(index=df.index)

    # A person is "balanced" when observed in every year 1997-2007 except 2002.
    target_years = {y for y in range(1997, 2008) if y != 2002}
    years_seen = df.groupby(person_id_col)[year_col].apply(set)
    covers_window = years_seen.map(lambda seen: target_years <= seen)
    out['is_balanced_97_07'] = covers_window.reindex(ids).values

    # A stayer is only ever observed in a single municipality.
    has_municipality = municipality_col and municipality_col in df.columns
    if not has_municipality:
        # No municipality information: treat everyone as a stayer.
        out['is_stayer_residence'] = True
    else:
        n_places = df.groupby(person_id_col)[municipality_col].nunique()
        out['is_stayer_residence'] = (n_places == 1).reindex(ids).values

    # Placeholder until real border-municipality data is wired in.
    out['is_border_municipality'] = False

    return out


def assemble_panel(df, meta=None):
    """
    Assemble analysis-ready panel from raw data.

    Steps: compute age, label worker types and drop the non-employed, build
    the treatment variables, restrict to 1997-2001 / 2003-2007 and to
    residents of Molise (12) or Basilicata (2), attach outcomes and flags,
    and finally fix each worker's region to the earliest pre-2002 residence
    observed in the restricted sample.

    Parameters
    ----------
    df : DataFrame
        Raw data
    meta : object, optional
        Metadata from the Stata file (currently unused)

    Returns
    -------
    DataFrame
        Analysis-ready panel
    """
    panel = df.copy()

    # Compute age
    if 'year_birth' in panel.columns:
        panel['age'] = panel['year'] - panel['year_birth']
        panel['age_sq'] = panel['age'] ** 2
    else:
        panel['age'] = np.nan
        panel['age_sq'] = np.nan

    # Construct worker type
    panel['worker_type'] = construct_worker_type(panel, type_col='type')

    # Filter to employed workers only (exclude type 4 = non-employed)
    panel = panel[panel['worker_type'] != 'non_employed'].copy()

    # Region codes are compared numerically so that 12, 12.0 and '12' are all
    # recognised, whichever way the reader stored the column.
    region_code = pd.to_numeric(panel['region_res'], errors='coerce')

    # Treatment variables
    panel['molise_res'] = (region_code == 12).astype(int)
    panel['post'] = (panel['year'] >= 2003).astype(int)
    panel['treat'] = panel['molise_res'] * panel['post']
    panel['event_time'] = panel['year'] - 2002

    # Analysis period 1997-2001 and 2003-2007 (2002 excluded), and only
    # residents of Molise (12) or Basilicata (2).
    in_window = panel['year'].between(1997, 2007) & (panel['year'] != 2002)
    in_regions = region_code.isin([12, 2])
    panel = panel[in_window & in_regions].copy()

    # Make outcomes
    outcomes = make_outcomes(panel)
    for col in outcomes.columns:
        panel[col] = outcomes[col]

    # Make flags
    flags = make_flags(panel)
    for col in flags.columns:
        panel[col] = flags[col]

    # Fix molise_res to the earliest pre-period (1997-2001) residence.
    # Sorting makes "first" mean the earliest year regardless of row order.
    pre_period = panel[panel['year'] < 2002].sort_values('year', kind='mergesort')
    if len(pre_period) > 0:
        pre_residence = pre_period.groupby('id_worker')['molise_res'].first()
        # map() keeps the panel's own index, so the fallback for workers
        # without pre-period rows lines up row by row.
        fixed = panel['id_worker'].map(pre_residence)
        panel['molise_res'] = fixed.fillna(panel['molise_res']).astype(int)
        # The interaction has to follow the corrected region indicator.
        panel['treat'] = panel['molise_res'] * panel['post']

    return panel
'''

# Write the module with an explicit encoding so the generated file does not
# depend on the platform's default locale (Python reads source as UTF-8).
with open(SRC_DIR / "build.py", "w", encoding="utf-8") as f:
    f.write(build_code)
print("Created src/build.py")


In [None]:
# Create the models.py module
models_code = '''"""
Estimation functions for DiD, DDD, and event-study models.
"""

import pandas as pd
import numpy as np
from linearmodels import PanelOLS
from linearmodels.panel import compare
import statsmodels.api as sm


def did(df, outcome, cluster='id_worker', entity_effects=True, time_effects=True):
    """
    Estimate difference-in-differences model.

    Model: Y = a + b * (molise_res x post) + worker FE + year FE + e

    Parameters
    ----------
    df : DataFrame
        Panel data with columns id_worker, year, molise_res, post, treat and
        the outcome
    outcome : str
        Outcome variable name
    cluster : str
        Variable to cluster standard errors by. 'id_worker' (or None)
        clusters by worker, 'year' by year, any other name by that column.
    entity_effects : bool
        Include entity fixed effects
    time_effects : bool
        Include time fixed effects

    Returns
    -------
    result : PanelOLSResults
        Estimation results
    """
    required = [outcome, 'molise_res', 'post', 'treat']
    custom_cluster = cluster not in (None, 'id_worker', 'year')
    if custom_cluster and cluster not in required:
        # Cluster labels must be present for every row that is estimated.
        required.append(cluster)

    # dropna returns a new frame, so the caller's data is left untouched.
    data = df.dropna(subset=required).set_index(['id_worker', 'year'])

    y = data[outcome]
    X = sm.add_constant(data[['treat']].copy())

    # Honour the requested clustering instead of always clustering by worker.
    if cluster == 'year':
        cov_kwargs = {'cluster_time': True}
    elif custom_cluster:
        # Integer codes so that string-valued cluster columns work as well.
        codes = data[cluster].astype('category').cat.codes.astype('int64')
        cov_kwargs = {'clusters': codes.rename(cluster)}
    else:
        cov_kwargs = {'cluster_entity': True}

    mod = PanelOLS(y, X, entity_effects=entity_effects, time_effects=time_effects)
    result = mod.fit(cov_type='clustered', **cov_kwargs)

    return result


def ddd_worker_type(df, outcome, cluster='id_worker', entity_effects=True, time_effects=True):
    """
    Estimate triple-difference model by worker type (public is the baseline).

    Model: Y = a + b * (molise_res x post)
               + t1 * (molise_res x post x private)
               + t2 * (molise_res x post x self)
               + main effects and remaining two-way interactions
               + worker FE + year FE + e

    Regressors that the requested fixed effects fully absorb (for example a
    time-invariant molise_res under worker effects, or post under year
    effects) are dropped by the estimator rather than aborting the fit.

    Parameters
    ----------
    df : DataFrame
        Panel data
    outcome : str
        Outcome variable name
    cluster : str
        Variable to cluster standard errors by. 'id_worker' (or None)
        clusters by worker, 'year' by year, any other name by that column.
    entity_effects : bool
        Include entity fixed effects
    time_effects : bool
        Include time fixed effects

    Returns
    -------
    result : PanelOLSResults
        Estimation results
    """
    required = [outcome, 'molise_res', 'post', 'treat', 'worker_type']
    custom_cluster = cluster not in (None, 'id_worker', 'year')
    if custom_cluster and cluster not in required:
        required.append(cluster)

    data = df[df['worker_type'].isin(['public', 'private', 'self'])]
    data = data.dropna(subset=required).copy()

    # Worker type dummies (public is the omitted category)
    data['private'] = (data['worker_type'] == 'private').astype(int)
    data['self'] = (data['worker_type'] == 'self').astype(int)

    # Triple interactions
    data['treat_private'] = data['treat'] * data['private']
    data['treat_self'] = data['treat'] * data['self']

    # Two-way interactions with worker type
    data['molise_private'] = data['molise_res'] * data['private']
    data['molise_self'] = data['molise_res'] * data['self']
    data['post_private'] = data['post'] * data['private']
    data['post_self'] = data['post'] * data['self']

    data = data.set_index(['id_worker', 'year'])

    y = data[outcome]

    # 'treat' (molise_res x post) is the baseline DiD term; without it the
    # triple interactions would not be differences relative to public workers.
    X = data[['treat', 'treat_private', 'treat_self',
              'molise_res', 'post', 'private', 'self',
              'molise_private', 'molise_self', 'post_private', 'post_self']].copy()
    X = sm.add_constant(X)

    # Honour the requested clustering instead of always clustering by worker.
    if cluster == 'year':
        cov_kwargs = {'cluster_time': True}
    elif custom_cluster:
        codes = data[cluster].astype('category').cat.codes.astype('int64')
        cov_kwargs = {'clusters': codes.rename(cluster)}
    else:
        cov_kwargs = {'cluster_entity': True}

    # Worker type effects enter through the dummies in X.
    mod = PanelOLS(y, X, entity_effects=entity_effects, time_effects=time_effects,
                   drop_absorbed=True)
    result = mod.fit(cov_type='clustered', **cov_kwargs)

    return result


def _event_study_rows(sample, outcome, event_times, entity_effects, time_effects, labels=None):
    """Fit one event-study regression on sample and return its coefficient rows."""
    import warnings

    if len(sample) == 0:
        return []

    labels = dict(labels or {})
    sample = sample.copy()

    # Interaction dummies 1{event_time = k} x molise_res. A dummy that is zero
    # for every row of this sample cannot be estimated, so it is left out.
    kept = []
    for k in event_times:
        col = f'event_{k}'
        sample[col] = ((sample['event_time'] == k) * sample['molise_res']).astype(int)
        if sample[col].any():
            kept.append(k)
        else:
            warnings.warn(f"event_study: no treated observations at event_time={k}; term skipped")

    sample = sample.set_index(['id_worker', 'year'])
    y = sample[outcome]
    X = sm.add_constant(sample[[f'event_{k}' for k in kept]].copy())

    mod = PanelOLS(y, X, entity_effects=entity_effects, time_effects=time_effects)
    result = mod.fit(cov_type='clustered', cluster_entity=True)

    rows = []
    for k in kept:
        coef_name = f'event_{k}'
        if coef_name in result.params.index:
            beta = result.params[coef_name]
            se = result.std_errors[coef_name]
            rows.append({
                'event_time': k,
                **labels,
                'beta': beta,
                'se': se,
                # 95% interval from the normal approximation
                'ci_low': beta - 1.96 * se,
                'ci_high': beta + 1.96 * se,
            })

    # Reference period (-1) is normalised to zero.
    rows.append({
        'event_time': -1,
        **labels,
        'beta': 0.0,
        'se': 0.0,
        'ci_low': 0.0,
        'ci_high': 0.0,
    })
    return rows


def event_study(df, outcome, by_type=False, entity_effects=True, time_effects=True):
    """
    Estimate event-study (dynamic DiD) model.

    Model: Y = a + sum over k != -1 of b_k * [1{event_time = k} x molise_res]
               + worker FE + year FE + e

    Only event times in [-5, 5] are used, event time 0 (2002) is excluded and
    -1 is the omitted reference period. Standard errors are clustered by
    worker.

    Parameters
    ----------
    df : DataFrame
        Panel data
    outcome : str
        Outcome variable name
    by_type : bool
        If True, estimate separate dynamics by worker type
        ('public', 'private', 'self'); types with no rows are skipped
    entity_effects : bool
        Include entity fixed effects
    time_effects : bool
        Include time fixed effects

    Returns
    -------
    results_df : DataFrame
        DataFrame with columns: event_time, beta, se, ci_low, ci_high
        (plus worker_type after event_time when by_type is True). The columns
        are present even when there is nothing to estimate.
    """
    data = df.dropna(subset=[outcome, 'molise_res', 'event_time'])
    data = data[data['event_time'].between(-5, 5) & (data['event_time'] != 0)].copy()

    # Event times that get their own coefficient (-1 is the reference).
    event_times = sorted(k for k in data['event_time'].unique() if k != -1)

    if by_type:
        columns = ['event_time', 'worker_type', 'beta', 'se', 'ci_low', 'ci_high']
        results_list = []
        for worker_type in ['public', 'private', 'self']:
            data_type = data[data['worker_type'] == worker_type]
            results_list.extend(
                _event_study_rows(data_type, outcome, event_times,
                                  entity_effects, time_effects,
                                  labels={'worker_type': worker_type})
            )
    else:
        columns = ['event_time', 'beta', 'se', 'ci_low', 'ci_high']
        results_list = _event_study_rows(data, outcome, event_times,
                                         entity_effects, time_effects)

    results_df = pd.DataFrame(results_list, columns=columns)
    return results_df
'''

# models_code contains non-ASCII characters in its docstrings. Python reads
# source files as UTF-8, so write UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open(SRC_DIR / "models.py", "w", encoding="utf-8") as f:
    f.write(models_code)
print("Created src/models.py")


In [None]:
# Create plots.py, export.py, robustness.py, and diagnostics.py modules
# (Creating stubs for now, will be expanded in Phase 3)

plots_code = '''"""
Plotting functions for Molise earthquake analysis.
"""

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path


def plot_event(results_df, title, outfile, by_type=False):
    """
    Draw event-study coefficients with their confidence bands and save the figure.

    Parameters
    ----------
    results_df : DataFrame
        Output of event_study() (event_time, beta, ci_low, ci_high and,
        optionally, worker_type)
    title : str
        Plot title
    outfile : str or Path
        Where to save the figure; parent directories are created
    by_type : bool
        If True and a worker_type column exists, draw one labelled line per
        worker type; otherwise draw a single blue line
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Collect (frame, line keyword arguments) pairs, then draw them uniformly.
    if by_type and 'worker_type' in results_df.columns:
        series = [
            (results_df[results_df['worker_type'] == wt], {'label': wt})
            for wt in results_df['worker_type'].unique()
        ]
    else:
        series = [(results_df, {'color': 'blue'})]

    for frame, line_kwargs in series:
        ordered = frame.sort_values('event_time')
        x = ordered['event_time']
        ax.plot(x, ordered['beta'], marker='o', **line_kwargs)
        ax.fill_between(x, ordered['ci_low'], ordered['ci_high'], alpha=0.2)

    # Zero line and the event date marker
    ax.axhline(y=0, color='black', linestyle='--', linewidth=0.5)
    ax.axvline(x=0, color='red', linestyle='--', linewidth=0.5, label='Earthquake')

    ax.set_xlabel('Event Time (Years from 2002)')
    ax.set_ylabel('Coefficient')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

    destination = Path(outfile)
    destination.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(destination, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved plot to {destination}")


def plot_three_env(tables, outfile):
    """Plot the three-environments analysis (stub: nothing is drawn yet)."""
    # Not implemented; scheduled for Phase 3.
    return None


def plot_heatmap(grid, outfile):
    """Plot a coefficient heatmap for heterogeneity analysis (stub: nothing is drawn yet)."""
    # Not implemented; scheduled for Phase 3.
    return None
'''

export_code = '''"""
Export functions for tables and summaries.
"""

import pandas as pd
from pathlib import Path


def write_table(result, outfile, format='both'):
    """
    Write estimation results to CSV and/or LaTeX.

    Parameters
    ----------
    result : PanelOLSResults or similar
        Any object exposing params, std_errors and pvalues
    outfile : str or Path
        Output file path (without extension); '.csv' / '.tex' is appended
    format : str
        'csv', 'tex', or 'both'. A string containing 'csv' and/or 'tex'
        selects those outputs.

    Raises
    ------
    ValueError
        If format selects neither CSV nor LaTeX output.
    """
    want_csv = 'csv' in format or format == 'both'
    want_tex = 'tex' in format or format == 'both'
    if not (want_csv or want_tex):
        # A mistyped format used to be accepted and silently wrote nothing.
        raise ValueError(f"Unknown format {format!r}; expected 'csv', 'tex', or 'both'")

    outfile = Path(outfile)
    outfile.parent.mkdir(parents=True, exist_ok=True)

    # Extract coefficients and standard errors
    summary = pd.DataFrame({
        'Coefficient': result.params,
        'Std_Error': result.std_errors,
        'P_Value': result.pvalues
    })

    if want_csv:
        summary.to_csv(f"{outfile}.csv")
        print(f"Saved table to {outfile}.csv")

    if want_tex:
        summary.to_latex(f"{outfile}.tex", float_format="%.4f")
        print(f"Saved table to {outfile}.tex")


def write_summary(results, outfile):
    """Write a summary of multiple results (stub: nothing is written yet)."""
    # Not implemented; scheduled for Phase 3.
    return None
'''

robustness_code = '''"""
Robustness check functions.
"""

import pandas as pd
from .models import did


def run_all(df):
    """
    Run every robustness check on the panel.

    No checks are implemented yet (planned for Phase 3), so the result is
    always empty.

    Returns
    -------
    dict
        Results keyed by robustness check name
    """
    return {}
'''

diagnostics_code = '''"""
Diagnostic functions.
"""

import pandas as pd
import numpy as np
from scipy import stats


def pretrend_test(event_df):
    """
    Test for parallel trends in pre-period.

    Tests whether the sum of the estimated pre-period coefficients is zero.
    The variance of the sum is taken as the sum of squared standard errors,
    i.e. covariances between the coefficients are ignored, so this is a
    simplified stand-in for a joint F-test on the fitted model.

    Parameters
    ----------
    event_df : DataFrame
        Event-study results with columns event_time, beta and se
        NOTE(review): results produced with by_type=True contain one row per
        worker type and event time; filter to one type before calling.

    Returns
    -------
    dict
        test_stat, p_value and n_pre_periods (number of estimated pre-period
        coefficients). test_stat and p_value are NaN when there are none.
    """
    # Pre-period rows only (event_time < 0).
    pre = event_df[event_df['event_time'] < 0]

    # The reference period is stored with beta = se = 0 by normalisation; it
    # is not an estimate and must not enter the test or the period count.
    pre = pre[pre['se'] > 0]

    if len(pre) == 0:
        return {'test_stat': np.nan, 'p_value': np.nan, 'n_pre_periods': 0}

    coefs = pre['beta'].to_numpy(dtype=float)
    ses = pre['se'].to_numpy(dtype=float)

    # Wald statistic for the single restriction sum(beta) = 0.
    test_stat = (coefs.sum() ** 2) / (ses ** 2).sum()
    # One linear restriction means one degree of freedom.
    p_value = stats.chi2.sf(test_stat, df=1)

    return {'test_stat': test_stat, 'p_value': p_value, 'n_pre_periods': len(pre)}


def cell_counts(df, keys):
    """
    Tabulate how many rows fall into each combination of the given columns.

    Parameters
    ----------
    df : DataFrame
        Data
    keys : list of str
        Column names that define a cell (e.g. region, type, year)

    Returns
    -------
    DataFrame
        One row per observed cell: the key columns plus a 'count' column
    """
    grouped = df.groupby(keys)
    sizes = grouped.size()
    return sizes.reset_index(name='count')
'''

# Write all modules
modules = {
    'plots.py': plots_code,
    'export.py': export_code,
    'robustness.py': robustness_code,
    'diagnostics.py': diagnostics_code
}

for filename, code in modules.items():
    # diagnostics_code contains a non-ASCII character in a docstring. Python
    # reads source files as UTF-8, so write UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open(SRC_DIR / filename, "w", encoding="utf-8") as f:
        f.write(code)
    print(f"Created src/{filename}")


In [None]:
# Smoke test: import the generated modules and build the panel once
rule = "=" * 80
print(rule)
print("TESTING MODULES")
print(rule)

from src.io import load_raw
from src.build import assemble_panel, construct_worker_type

# Load data
df_raw, meta = load_raw()
print(f"\nLoaded {len(df_raw):,} rows")

# Check if we have Basilicata data
print("\nRegion distribution in raw data:")
print(df_raw['region_res'].value_counts())

# Try to assemble panel (will filter to available regions); any failure,
# including one raised while summarising, is reported with a traceback.
try:
    panel = assemble_panel(df_raw, meta)
    print(f"\nAssembled panel: {len(panel):,} rows")
    panel_years = panel['year']
    print(f"Years: {panel_years.min()} - {panel_years.max()}")
    type_counts = panel['worker_type'].value_counts().to_dict()
    print(f"Worker types: {type_counts}")
    region_split = panel['molise_res'].value_counts().to_dict()
    print(f"Molise vs Basilicata: {region_split}")
except Exception as e:
    print(f"\nError assembling panel: {e}")
    import traceback
    traceback.print_exc()


## Summary

Code structure created with:
- `src/io.py`: Data loading functions
- `src/build.py`: Variable construction functions
- `src/models.py`: Estimation functions (DiD, DDD, event-study)
- `src/plots.py`: Plotting functions
- `src/export.py`: Table export functions
- `src/robustness.py`: Robustness checks
- `src/diagnostics.py`: Diagnostic functions

**Note**: If Basilicata data is not in the file, we may need to:
1. Load additional data files, or
2. Use a different control region (e.g., Puglia = 14), or
3. Use only within-Molise variation (private vs public workers)

Proceeding to Phase 3 with the current structure.
