# Phase 1: Data Analysis

This notebook analyzes the structure of `Data/Molise.dta` (and `Data/Basilicata.dta`, when that file is present) to understand:
- Available variables and their types
- Temporal coverage
- Region distribution
- Worker classification fields
- Outcome variables
- Data quality issues


In [None]:
import pandas as pd
import numpy as np
import pyreadstat
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global RNG so any later random draws are repeatable
np.random.seed(42)

# Project layout, resolved against the directory the kernel was started in
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR.joinpath("Data")
OUT_DIR = BASE_DIR.joinpath("out")
DERIVED_DIR = BASE_DIR.joinpath("data", "derived")

# Echo the resolved locations and report whether the primary input file is present
print(
    "\n".join(
        [
            f"Base directory: {BASE_DIR}",
            f"Data directory: {DATA_DIR}",
            f"Data file exists: {DATA_DIR.joinpath('Molise.dta').exists()}",
        ]
    )
)


In [None]:
# Load the Stata files (both Molise and Basilicata if available)


def _load_region(path, region_code, label):
    """Read one regional Stata file and tag every row with its region code.

    pyreadstat is tried first; if it raises, the file is re-read with
    ``pd.read_stata`` instead.

    Parameters
    ----------
    path : Path
        Location of the ``.dta`` file.
    region_code : str
        Value written to the ``region_res`` column for every row.
    label : str
        Human-readable region name used in the progress messages.

    Returns
    -------
    tuple
        ``(frame, metadata)``; ``metadata`` is the second value returned by
        ``pyreadstat.read_dta`` and is ``None`` when the pandas fallback was used.
    """
    region_meta = None
    try:
        frame, region_meta = pyreadstat.read_dta(path)
        reader = "pyreadstat"
    except Exception as e:
        # Deliberate best-effort fallback: report why pyreadstat failed, then retry.
        # NOTE(review): the two readers may not return identical dtypes for
        # labelled columns - confirm before combining files read by different readers.
        print(f"pyreadstat failed for {label}: {e}, trying pandas...")
        frame = pd.read_stata(path)
        reader = "pandas"

    frame['region_res'] = region_code  # Ensure region code
    print(f"Loaded {label}: {len(frame):,} rows using {reader}")
    return frame, region_meta


molise_file = DATA_DIR / "Molise.dta"
basilicata_file = DATA_DIR / "Basilicata.dta"

dfs = []
meta = None

# NOTE(review): confirm that '12' / '2' match the region coding scheme of the source data.

# Load Molise
if molise_file.exists():
    df_molise, meta = _load_region(molise_file, '12', "Molise")
    dfs.append(df_molise)
else:
    print(f"Warning: {molise_file} not found")

# Load Basilicata if available
if basilicata_file.exists():
    df_basilicata, meta_bas = _load_region(basilicata_file, '2', "Basilicata")
    dfs.append(df_basilicata)
else:
    print(f"Note: {basilicata_file} not found - analyzing Molise only")

# Combine if both loaded
if len(dfs) > 1:
    df = pd.concat(dfs, ignore_index=True)
    print(f"\nCombined dataset: {len(df):,} rows")
elif len(dfs) == 1:
    df = dfs[0]
    print(f"\nSingle dataset: {len(df):,} rows")
else:
    # Nothing could be loaded; every later cell needs `df`, so stop here.
    raise FileNotFoundError("No data files found")

print(f"\nDataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


In [None]:
# Basic info about the dataset: dimensions, column names and dtypes
print("=" * 80)
print("DATASET OVERVIEW")
print("=" * 80)
# The separator is the multiplication sign; it was previously stored as the
# mis-decoded two-character sequence and printed as garbage.
print(f"\nShape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)


In [None]:
# Per-column missing-value report: count and percentage, worst columns first
banner = "=" * 80
print(banner)
print("MISSING VALUES")
print(banner)

missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)

# Keep only columns that actually have gaps, ordered by how many values are missing
missing_df = (
    pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)
print(missing_df)


In [None]:
# Identify key variables - look for common patterns


def _columns_matching(columns, keywords):
    """Return the columns whose lower-cased name contains any of ``keywords``.

    Matching is plain substring containment, so it is intentionally loose:
    the result is a list of *candidates* to inspect, in the same order as
    ``columns``, not a definitive classification.
    """
    return [c for c in columns if any(k in c.lower() for k in keywords)]


print("=" * 80)
print("KEY VARIABLE IDENTIFICATION")
print("=" * 80)

# Person identifier
person_id_cols = _columns_matching(df.columns, ['id', 'person', 'individual', 'codice'])
print(f"\nPotential person ID columns: {person_id_cols}")

# Year (the first match is used by the temporal-coverage cell)
year_cols = _columns_matching(df.columns, ['year', 'anno', 'yr'])
print(f"\nPotential year columns: {year_cols}")

# Region (the first match is used by the region-distribution cell)
region_cols = _columns_matching(df.columns, ['region', 'regione', 'reg'])
print(f"\nPotential region columns: {region_cols}")

# Municipality
municipality_cols = _columns_matching(df.columns, ['municip', 'comune', 'city'])
print(f"\nPotential municipality columns: {municipality_cols}")

# Gender
gender_cols = _columns_matching(df.columns, ['gender', 'sex', 'sesso', 'genere'])
print(f"\nPotential gender columns: {gender_cols}")

# Age
age_cols = _columns_matching(df.columns, ['age', 'eta', 'anni'])
print(f"\nPotential age columns: {age_cols}")

# Employment/Earnings
employment_cols = _columns_matching(
    df.columns, ['employ', 'earn', 'wage', 'reddito', 'stipendio', 'lavoro']
)
print(f"\nPotential employment/earnings columns: {employment_cols}")

# Contract
contract_cols = _columns_matching(df.columns, ['contract', 'contratto', 'tipologia'])
print(f"\nPotential contract columns: {contract_cols}")

# Sector
sector_cols = _columns_matching(df.columns, ['sector', 'settore', 'nace', 'ateco'])
print(f"\nPotential sector columns: {sector_cols}")

# Public/Private
worker_type_cols = _columns_matching(
    df.columns,
    ['public', 'private', 'self', 'pubblico', 'privato', 'autonomo', 'dipendente'],
)
print(f"\nPotential worker type columns: {worker_type_cols}")


In [None]:
# Show the first rows of the frame with column truncation disabled
print("=" * 80)
print("SAMPLE DATA (first 5 rows)")
print("=" * 80)

# These display options are set globally, so they also apply to later cells
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 50),
):
    pd.set_option(option_name, option_value)

print(df.head(5))


In [None]:
# Temporal coverage, based on the first candidate year column found earlier
if not year_cols:
    print("No year column identified - need to inspect manually")
else:
    year_col = year_cols[0]
    years = df[year_col]
    rule = "=" * 80
    print(rule)
    print("TEMPORAL COVERAGE")
    print(rule)
    print(f"\nYear column: {year_col}")
    print(f"\nYear range: {years.min()} - {years.max()}")
    print("\nYear distribution:")
    # Row counts per distinct value, ordered by the value rather than by frequency
    print(years.value_counts().sort_index())


In [None]:
# Check region distribution, based on the first candidate region column found earlier
if region_cols:
    region_col = region_cols[0]
    region_values = df[region_col]
    print("=" * 80)
    print("REGION DISTRIBUTION")
    print("=" * 80)
    print(f"\nRegion column: {region_col}")
    print(f"\nUnique regions: {region_values.nunique()}")
    print("\nRegion distribution:")
    print(region_values.value_counts())

    # Check for Molise and Basilicata specifically.
    # Besides plain object columns, treat pandas string dtypes and Categorical
    # columns as text: pd.read_stata (the fallback reader) returns labelled
    # variables as Categorical by default, and those are exactly the columns
    # that would carry region names.
    region_dtype = region_values.dtype
    is_categorical = isinstance(region_dtype, pd.CategoricalDtype)
    is_textual = (
        region_dtype == 'object'
        or is_categorical
        or pd.api.types.is_string_dtype(region_dtype)
    )
    if is_textual:
        # Categories are not guaranteed to be strings, so stringify before using .str
        region_text = region_values.astype(str) if is_categorical else region_values
        molise_check = region_text.str.contains(
            'molise', case=False, na=False, regex=False
        ).sum()
        basilicata_check = region_text.str.contains(
            'basilicata', case=False, na=False, regex=False
        ).sum()
        print(f"\nRows containing 'Molise': {molise_check}")
        print(f"\nRows containing 'Basilicata': {basilicata_check}")
    else:
        print(f"\nRegion values (first 20): {region_values.unique()[:20]}")
else:
    print("No region column identified - need to inspect manually")


In [None]:
# Detailed variable inspection - show unique values for categorical variables
print("=" * 80)
print("DETAILED VARIABLE INSPECTION")
print("=" * 80)

n_rows = len(df)

# For each column, show basic stats
for col in df.columns:
    series = df[col]
    dtype = series.dtype
    print(f"\n{col}:")
    print(f"  Type: {dtype}")
    print(f"  Non-null: {series.notna().sum():,} / {n_rows:,}")

    # Text-like columns: object, Categorical, or a pandas string dtype
    is_text_like = (
        dtype == 'object'
        or isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_string_dtype(dtype)
    )
    if is_text_like:
        n_unique = series.nunique()
        distinct = series.unique()
        print(f"  Unique values: {n_unique}")
        if n_unique <= 20:
            print(f"  Values: {distinct}")
        else:
            print(f"  First 10 values: {distinct[:10]}")
    elif pd.api.types.is_numeric_dtype(dtype):
        # is_numeric_dtype also covers narrower or nullable numeric dtypes
        # (e.g. int32, float32), which an exact int64/float64 comparison skips.
        print(f"  Min: {series.min()}, Max: {series.max()}, Mean: {series.mean():.2f}")
        if series.nunique() <= 20:
            print(f"  Unique values: {sorted(series.dropna().unique())}")


In [None]:
# Create a codebook/data dictionary: one row per column with type, missingness,
# cardinality, numeric summary stats and a sample of values
print("=" * 80)
print("CREATING CODEBOOK")
print("=" * 80)

n_rows = len(df)
codebook = []
for col in df.columns:
    series = df[col]
    # Computed once per column; each is a full pass over the data
    n_missing = series.isna().sum()
    n_unique = series.nunique()

    entry = {
        'Variable': col,
        'Type': str(series.dtype),
        'Non_Missing': series.notna().sum(),
        'Missing': n_missing,
        'Missing_Pct': f"{(n_missing / n_rows * 100):.2f}%",
        'N_Unique': n_unique
    }

    # is_numeric_dtype also covers narrower or nullable numeric dtypes
    # (e.g. int32, float32); an exact int64/float64 comparison would give
    # those columns 'N/A' statistics.
    if pd.api.types.is_numeric_dtype(series.dtype):
        entry['Min'] = series.min()
        entry['Max'] = series.max()
        entry['Mean'] = f"{series.mean():.2f}"
        entry['Median'] = f"{series.median():.2f}"
    else:
        entry['Min'] = 'N/A'
        entry['Max'] = 'N/A'
        entry['Mean'] = 'N/A'
        entry['Median'] = 'N/A'

    # List actual values only for low-cardinality columns, capped at 20 shown
    if n_unique <= 50:
        unique_vals = series.dropna().unique()
        if len(unique_vals) <= 20:
            entry['Sample_Values'] = str(list(unique_vals))
        else:
            entry['Sample_Values'] = str(list(unique_vals[:20])) + " ..."
    else:
        entry['Sample_Values'] = f"{n_unique} unique values"

    codebook.append(entry)

codebook_df = pd.DataFrame(codebook)
print(codebook_df.to_string())

# Save codebook
OUT_DIR.mkdir(parents=True, exist_ok=True)
codebook_df.to_csv(OUT_DIR / "codebook.csv", index=False)
print(f"\n\nCodebook saved to: {OUT_DIR / 'codebook.csv'}")


In [None]:
# Descriptive statistics for every numeric column in the frame
rule = "=" * 80
print(rule)
print("SUMMARY STATISTICS")
print(rule)

numeric_cols = df.select_dtypes(include=[np.number]).columns
if numeric_cols.empty:
    print("No numeric columns found")
else:
    print(df[numeric_cols].describe())


In [None]:
# Persist a small extract of the data for quick offline inspection.
# NOTE(review): this takes the first 1000 rows, not a random draw; when two
# files were concatenated the extract may contain only the first one - confirm
# that this is intended.
DERIVED_DIR.mkdir(parents=True, exist_ok=True)
sample_path = DERIVED_DIR / "data_sample.parquet"
df_sample = df.head(1000)
df_sample.to_parquet(sample_path, index=False)
print(f"Sample data saved to: {sample_path}")
print("\nAnalysis complete! Review the codebook and variable inspection above.")
