**Goal:**
Create a filtered subset of the data to improve performance and obtain results for pathfinding. All other ETL logic is the same as in `etl_process`.


## Cell 1: Setup for filtering and Load Original Data

In [34]:
import pandas as pd
import os
import re

# --- Configuration: input file, output folder, and the filter to apply ---
# Paths are relative to the notebook's working directory.
SOURCE_FULL_DATASET_FILE  = os.path.join('..','data', 'source', 'Project2_Dataset_Corrected.csv')
OUTPUT_DIR_FILTERED = os.path.join('..','data', 'import_filtered')

# Later cells write their CSVs into this folder. DataFrame.to_csv does not
# create missing directories, so make sure it exists before anything is saved.
os.makedirs(OUTPUT_DIR_FILTERED, exist_ok=True)

FILTER_STATE_VALUE = 'NSW'
FILTER_YEAR_VALUE = 2024

# --- Load the FULL original dataset ---
# low_memory=False reads the file in one pass so column dtypes are inferred
# from the whole column rather than chunk by chunk.
df_raw = pd.read_csv(SOURCE_FULL_DATASET_FILE, low_memory=False)
print(f"Successfully loaded {len(df_raw)} rows from the full dataset: {SOURCE_FULL_DATASET_FILE}")
print("Setup Complete.")

Successfully loaded 10490 rows from the full dataset: ../data/source/Project2_Dataset_Corrected.csv
Setup Complete.


## Cell 2: Apply State & Year Filter and Clean Column Names

In [35]:
print(f"Applying filter: State == '{FILTER_STATE_VALUE}' AND Year == '{FILTER_YEAR_VALUE}'")

# Keep only the rows matching BOTH configured values. The trailing .copy()
# detaches the subset from df_raw so later edits act on an independent frame.
df_filtered = df_raw[
    (df_raw['State'] == FILTER_STATE_VALUE) &
    (df_raw['Year'] == FILTER_YEAR_VALUE)
].copy()
# Report the configured year (not a hard-coded literal) so the message stays
# correct when FILTER_YEAR_VALUE is changed.
print(f"Filtered dataset for '{FILTER_STATE_VALUE}' and Year {FILTER_YEAR_VALUE} contains {len(df_filtered)} rows.")

def clean_col_name(col_name):
    """Normalise a raw column header into a lower-case, underscore-separated name.

    Steps, in order:
      1. Coerce the header to ``str``.
      2. Turn ' (' , ') ' and any remaining single space into '_'.
      3. Drop every character that is not an ASCII letter, digit or '_'.
      4. Lower-case the result and trim leading/trailing underscores.

    Runs of underscores inside the name are kept as-is.

    Args:
        col_name: The original header (any object; it is stringified first).

    Returns:
        The cleaned header as a ``str``.
    """
    text = str(col_name)
    # Order matters: the bracket patterns must be replaced before bare spaces.
    for separator in (' (', ') ', ' '):
        text = text.replace(separator, '_')
    text = re.sub(r'[^a-zA-Z0-9_]+', '', text)
    return text.lower().strip('_')

# Rewrite every header through clean_col_name so the cells below can refer to
# lower-case, underscore-separated column names.
df_filtered.columns = list(map(clean_col_name, df_filtered.columns))
print("Column names have been cleaned for the filtered DataFrame.")

# Subsequent cells use the name 'df'; take a copy so it is independent of df_filtered.
df = df_filtered.copy()
df.info()

Applying filter: State == 'NSW' AND Year == '2024'
Filtered dataset for 'NSW' and Year 2024 contains 339 rows.
Column names have been cleaned for the filtered DataFrame.
<class 'pandas.core.frame.DataFrame'>
Index: 339 entries, 0 to 978
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   id                             339 non-null    int64 
 1   crash_id                       339 non-null    int64 
 2   state                          339 non-null    object
 3   month                          339 non-null    int64 
 4   year                           339 non-null    int64 
 5   dayweek                        339 non-null    object
 6   time                           339 non-null    object
 7   crash_type                     339 non-null    object
 8   number_fatalities              339 non-null    int64 
 9   bus_involvement                339 non-null    object
 10  heavy_rigid_truck

## ----- The cells below use the NSW 2024 filtered DataFrame (`df`) -----
## Cell 3: Create Location Nodes (State, SA4, LGA) 

In [40]:
print("--- Creating Location Node Files (from NSW2024 data) ---")

# Each node file has an ID column named 'name:ID(<Label>)' plus a ':LABEL'
# column (presumably the Neo4j bulk-import header format - confirm).

# --- State nodes: one row per distinct state value ---
states_nodes = (
    df[['state']]
    .drop_duplicates()
    .rename(columns={'state': 'name:ID(State)'})
    .assign(**{':LABEL': 'State'})
)
states_nodes.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'states.csv'), index=False, encoding='utf-8')
print(f"Saved {len(states_nodes)} State nodes.")

# --- SA4 nodes ---
# sa4_link_data keeps the distinct (SA4, state) pairs; it is reused later for
# the SA4 -> State relationship file.
sa4_link_data = df[['sa4_name_2021', 'state']].drop_duplicates()
sa4_nodes = (
    sa4_link_data[['sa4_name_2021']]
    .drop_duplicates()
    .rename(columns={'sa4_name_2021': 'name:ID(SA4)'})
    .assign(**{':LABEL': 'SA4'})
)
sa4_nodes.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'sa4s.csv'), index=False, encoding='utf-8')
print(f"Saved {len(sa4_nodes)} SA4 nodes.")

# --- LGA nodes ---
# lga_link_data keeps the distinct (LGA, SA4) pairs; nodes are the distinct LGA names.
lga_link_data = df[['national_lga_name_2024', 'sa4_name_2021']].drop_duplicates()
lga_nodes = (
    lga_link_data[['national_lga_name_2024']]
    .drop_duplicates()
    .rename(columns={'national_lga_name_2024': 'name:ID(LGA)'})
    .assign(**{':LABEL': 'LGA'})
)
lga_nodes.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'lgas.csv'), index=False, encoding='utf-8')
print(f"Saved {len(lga_nodes)} LGA nodes.")

--- Creating Location Node Files (from NSW2024 data) ---
Saved 1 State nodes.
Saved 28 SA4 nodes.
Saved 97 LGA nodes.


## Cell 4: Create Unique Crash Nodes

In [None]:
print("--- Creating Unique Crash Node File (from NSW2024 data) ---")

# df has one row per person; collapse it to one row per crash_id (first row wins)
# and use the fresh 0..n-1 index as the internal crash identifier.
crash_data = (
    df.drop_duplicates(subset=['crash_id'], keep='first')
    .reset_index(drop=True)
    .copy()
)
crash_data['internalCrashID:ID(Crash)'] = crash_data.index

# Lower-case the Yes/No flags.
# NOTE: Series.map with a dict turns any value other than 'Yes'/'No' into NaN.
yes_no_cols = ['bus_involvement', 'heavy_rigid_truck_involvement', 'articulated_truck_involvement', 'christmas_period', 'easter_period']
yes_no_lookup = {'Yes': 'yes', 'No': 'no'}
for col in yes_no_cols:
    if col not in crash_data.columns:
        continue
    crash_data[col] = crash_data[col].map(yes_no_lookup)

# Plain integer columns.
for int_col in ('year', 'month', 'number_fatalities'):
    crash_data[int_col] = crash_data[int_col].astype(int)
# speed_limit: non-numeric entries are coerced to missing, hence the nullable Int64 dtype.
crash_data['speed_limit'] = pd.to_numeric(crash_data['speed_limit'], errors='coerce').astype('Int64')

# Source column -> property name written to crashes.csv.
crash_node_cols = {
    'internalCrashID:ID(Crash)': 'internalCrashID:ID(Crash)',
    'crash_id': 'crashID_orig',
    'year': 'year',
    'month': 'month',
    'dayweek': 'dayweek',
    'time': 'time',
    'crash_type': 'crashType',
    'number_fatalities': 'numberFatalities',
    'bus_involvement': 'busInvolvement',
    'heavy_rigid_truck_involvement': 'heavyRigidTruckInvolvement',
    'articulated_truck_involvement': 'articulatedTruckInvolvement',
    'speed_limit': 'speedLimit',
    'national_road_type': 'nationalRoadType',
    'christmas_period': 'christmasPeriod',
    'easter_period': 'easterPeriod',
    'national_remoteness_areas': 'nationalRemotenessAreas',
    'day_of_week': 'dayOfWeekType',
    'time_of_day': 'timeOfDay'
}
# Only export the mapped columns that are actually present in crash_data.
existing_cols = [name for name in crash_node_cols if name in crash_data.columns]
rename_map_crash = {name: crash_node_cols[name] for name in existing_cols}
crash_nodes_df = (
    crash_data[existing_cols]
    .rename(columns=rename_map_crash)
    .assign(**{':LABEL': 'Crash'})
)
crash_nodes_df.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'crashes.csv'), index=False, encoding='utf-8')
print(f"Saved {len(crash_nodes_df)} unique Crash nodes.")

# Lookup table indexed by the original crash_id, giving the internal crash ID;
# used when building the Person -> Crash relationships.
crash_id_map = crash_data.set_index('crash_id')[['internalCrashID:ID(Crash)']].copy()
print("Created crash_id to internalCrashID map (from NSW2024 data).")

--- Creating Unique Crash Node File (from NSW2024 data) ---
Saved 307 unique Crash nodes.
Created crash_id to internalCrashID map (from NSW data).


## Cell 5: Create Person Nodes

In [41]:
print("--- Creating Person Node File (from NSW2024 data) ---")
# Store age as a plain integer (this updates the shared df in place).
df['age'] = df['age'].astype(int)

# Source column -> header written to persons.csv; 'id' becomes the Person node ID.
person_node_cols = {
    'id': 'personID:ID(Person)',
    'road_user': 'roadUser',
    'gender': 'gender',
    'age': 'age',
    'age_group': 'ageGroup'
}
# Only export the mapped columns that are actually present in df.
existing_cols = [name for name in person_node_cols if name in df.columns]
rename_map_person = {name: person_node_cols[name] for name in existing_cols}
person_nodes_df = (
    df[existing_cols]
    .rename(columns=rename_map_person)
    .assign(**{':LABEL': 'Person'})
)
person_nodes_df.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'persons.csv'), index=False, encoding='utf-8')
print(f"Saved {len(person_nodes_df)} Person nodes.")

--- Creating Person Node File (from NSW2024 data) ---
Saved 339 Person nodes.


## Cell 6: Create Relationship Files

In [42]:
print("--- Creating Relationship Files (from NSW2024 data) ---")

# Every step below is best-effort: a failure is reported and the remaining
# files are still attempted. Failed or skipped steps are collected here so the
# closing banner does not claim success when something went wrong.
failed_rel_steps = []

# --- 1. SA4 -> State ---
# Built from the distinct (SA4, state) pairs prepared with the location nodes.
try:
    sa4_state_rels = sa4_link_data.copy()
    sa4_state_rels.rename(columns={'sa4_name_2021': ':START_ID(SA4)', 'state': ':END_ID(State)'}, inplace=True)
    sa4_state_rels[':TYPE'] = 'IN_STATE'
    sa4_state_rels.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'rels_sa4_state.csv'), index=False, encoding='utf-8')
    print(f"Saved {len(sa4_state_rels)} SA4->State relationships.")
except Exception as e:
    print(f"Error in SA4->State: {e}")
    failed_rel_steps.append('SA4->State')

# --- 2. Crash -> LGA  ---
# One relationship per unique crash, using the internal crash ID as the start node.
try:
    if 'internalCrashID:ID(Crash)' in crash_data.columns and 'national_lga_name_2024' in crash_data.columns:
        crash_lga_rels = crash_data[['internalCrashID:ID(Crash)', 'national_lga_name_2024']].copy()
        crash_lga_rels.rename(columns={'internalCrashID:ID(Crash)': ':START_ID(Crash)', 'national_lga_name_2024': ':END_ID(LGA)'}, inplace=True)
        crash_lga_rels[':TYPE'] = 'OCCURRED_IN_LGA'
        crash_lga_rels.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'rels_crash_lga.csv'), index=False, encoding='utf-8')
        print(f"Saved {len(crash_lga_rels)} Crash->LGA relationships.")
    else:
        print("Error or skipping Crash->LGA: Source columns not found in crash_data.")
        failed_rel_steps.append('Crash->LGA')
except Exception as e:
    print(f"Error in Crash->LGA: {e}")
    failed_rel_steps.append('Crash->LGA')

# --- 3. Crash -> SA4  ---
try:
    if 'internalCrashID:ID(Crash)' in crash_data.columns and 'sa4_name_2021' in crash_data.columns:
        crash_sa4_rels = crash_data[['internalCrashID:ID(Crash)', 'sa4_name_2021']].copy()
        crash_sa4_rels.rename(columns={'internalCrashID:ID(Crash)': ':START_ID(Crash)', 'sa4_name_2021': ':END_ID(SA4)'}, inplace=True)
        crash_sa4_rels[':TYPE'] = 'OCCURRED_IN_SA4'
        crash_sa4_rels.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'rels_crash_sa4.csv'), index=False, encoding='utf-8')
        print(f"Saved {len(crash_sa4_rels)} Crash->SA4 relationships.")
    else:
        print("Error or skipping Crash->SA4: Source columns not found.")
        failed_rel_steps.append('Crash->SA4')
except Exception as e:
    print(f"Error in Crash->SA4: {e}")
    failed_rel_steps.append('Crash->SA4')

# --- 4. Person -> Crash ---
# Attach the internal crash ID to every person row via the crash_id lookup table.
try:
    df_merged_person = df.merge(crash_id_map, left_on='crash_id', right_index=True, how='inner')
    if 'id' in df_merged_person.columns and 'internalCrashID:ID(Crash)' in df_merged_person.columns:
        person_crash_rels = df_merged_person[['id', 'internalCrashID:ID(Crash)']].copy()
        person_crash_rels.rename(columns={'id': ':START_ID(Person)', 'internalCrashID:ID(Crash)': ':END_ID(Crash)'}, inplace=True)
        person_crash_rels[':TYPE'] = 'WAS_INVOLVED_IN'
        person_crash_rels.to_csv(os.path.join(OUTPUT_DIR_FILTERED, 'rels_person_crash.csv'), index=False, encoding='utf-8')
        print(f"Saved {len(person_crash_rels)} Person->Crash relationships.")
    else:
        print("Error or skipping Person->Crash: Required columns not found after merge.")
        failed_rel_steps.append('Person->Crash')
except Exception as e:
    print(f"Error in Person->Crash: {e}")
    failed_rel_steps.append('Person->Crash')

# Only announce completion when every relationship file was actually written.
if failed_rel_steps:
    print(f"\n--- Relationship File Generation finished WITH PROBLEMS in: {', '.join(failed_rel_steps)} ---")
else:
    print("\n--- All Relationship File Generation Complete (for NSW2024 filtered data) ---")

--- Creating Relationship Files (from NSW2024 data) ---
Saved 28 SA4->State relationships.
Saved 307 Crash->LGA relationships.
Saved 307 Crash->SA4 relationships.
Saved 339 Person->Crash relationships.

--- All Relationship File Generation Complete (for NSW2024 filtered data) ---
