# FAF5 Data Loading and Cleaning

## Objectives
- Load the FAF5 dataset from `data/raw/FAF5`
- Explore columns and data types
- Clean data
- Save processed data to `data/processed`

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory layout, relative to the notebook's working directory
data_dir = Path('../data')
raw_dir = data_dir.joinpath('raw')
processed_dir = data_dir.joinpath('processed')

# Raw inputs: the FAF5 CSV, its metadata workbook, and the
# CFS-area-code / FAF5-zone-id workbook
faf5_dir = raw_dir.joinpath('FAF5')
faf5_csv_path = faf5_dir.joinpath('FAF5.7.1.csv')
metadata_path = faf5_dir.joinpath('FAF5_metadata.xlsx')
cfs_area_code_path = raw_dir.joinpath('CFS-area-code-FAF5-zone-id.xlsx')

# Destination for the cleaned dataset
processed_data_path = processed_dir.joinpath('faf5_cleaned.csv')

# Sanity check: print the absolute input directory, then display the names
# of the regular files found directly inside it (cell output)
print(f'Looking for data in: {faf5_dir.resolve()}')
[entry.name for entry in faf5_dir.glob('*') if entry.is_file()]

In [None]:
# Read the FAF5 CSV into a DataFrame and report its (rows, columns) shape
print(f"Loading data from {faf5_csv_path}...")
df = pd.read_csv(filepath_or_buffer=faf5_csv_path)
print(f"Data loaded successfully! Shape: {df.shape}")
# Preview the first five rows (rendered as the cell output)
df.head(n=5)

### Initial Data Inspection
Check each column's data type and count its missing values.

In [None]:
# Print a summary of the DataFrame: column names, dtypes, and memory usage.
# NOTE(review): pandas omits the per-column non-null counts for very large
# frames unless show_counts=True is passed -- confirm the counts appear here.
df.info()

In [None]:
# Count the missing (NaN/None) entries in every column, then report only
# the columns that actually contain at least one
nan_counts = df.isna().sum()
print("Columns with missing values:")
print(nan_counts.loc[nan_counts > 0])

In [None]:
# Peek at the first five raw lines of the CSV to check that the header row
# lines up with the data rows.
# Decode explicitly as UTF-8 -- the encoding pd.read_csv uses by default --
# so this preview sees the same text pandas parsed instead of depending on
# the platform's locale encoding.
with open(faf5_csv_path, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 5:
            break  # stop early; no need to read the rest of the file
        print(line.strip())