**Goal:**
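 Generate clean CSVs for Nodes (Crash, Person, LGA, SA4, State) and Relationships (WAS_INVOLVED_IN, OCCURRED_IN, PART_OF, IN_STATE) whose headers precisely match those needed for LOAD CSV and that reflect the graph model design in arrows. Note: the cells below write OCCURRED_IN as two separate relationship types, OCCURRED_IN_LGA (Crash->LGA) and OCCURRED_IN_SA4 (Crash->SA4), and do not generate a PART_OF file.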


## Cell 1: Setup and Configuration

In [2]:
import pandas as pd
import os
import numpy as np
import re

# --- Configuration ---
# Both paths are relative, so they resolve against the current working directory.
SOURCE_FILE = os.path.join('..', 'data', 'source', 'Project2_Dataset_Corrected.csv')
OUTPUT_DIR = os.path.join('..', 'data', 'import_final') # Output directory

# Every later cell writes its CSV into OUTPUT_DIR, and DataFrame.to_csv does not
# create missing parent directories, so make sure the directory exists up front.
# exist_ok=True keeps re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Source file path: {SOURCE_FILE}")
print(f"Output directory path: {OUTPUT_DIR}")
print("Setup Complete. Necessary libraries imported and paths configured.")

Source file path: ../data/source/Project2_Dataset_Corrected.csv
Output directory path: ../data/import_final
Setup Complete. Necessary libraries imported and paths configured.


## Cell 2: Load and Initial Clean Data

In [None]:
# --- Load Raw Data ---
# A missing source file is reported and then re-raised, so execution stops here
# rather than carrying on without data.
try:
    df_raw = pd.read_csv(SOURCE_FILE, low_memory=False)
except FileNotFoundError:
    print(f"ERROR: Source file not found at {SOURCE_FILE}. Please check the path.")
    raise
else:
    print(f"Successfully loaded {len(df_raw)} rows from {SOURCE_FILE}")

# --- Define Column Cleaning Function ---
def clean_col_name(col_name):
    """Return a lower-case, underscore-separated version of a column label.

    The label is converted to ``str``; the sequences ``' ('`` and ``') '`` and
    any remaining spaces become underscores; every character other than ASCII
    letters, digits and underscores is removed; the result is lower-cased and
    stripped of leading/trailing underscores.

    Runs of underscores inside the name are NOT collapsed
    (``'A - B'`` becomes ``'a__b'``).
    """
    text = str(col_name)
    # Order matters: the bracket sequences must be handled before bare spaces.
    for separator in (' (', ') ', ' '):
        text = text.replace(separator, '_')
    ascii_only = re.sub(r'[^a-zA-Z0-9_]+', '', text)
    return ascii_only.lower().strip('_')

# --- Apply Cleaning and Inspect ---
# df_raw is left untouched; df is the copy that carries the cleaned column names.
original_columns = list(df_raw.columns)
cleaned_columns = [clean_col_name(label) for label in original_columns]
df = df_raw.copy()
df.columns = cleaned_columns

# Show how each header was rewritten so unexpected renames are easy to spot.
print("--- Column Name Cleaning ---")
print("Original -> Cleaned:")
for before, after in zip(original_columns, cleaned_columns):
    if before == after:
        print(f"'{before}' -> (no change)")
    else:
        print(f"'{before}' -> '{after}'")

print("\n--- DataFrame Info After Cleaning ---")
df.info()
print("\n--- First 5 Rows After Cleaning ---")
display(df.head())

Successfully loaded 10490 rows from ../data/source/Project2_Dataset_Corrected.csv
--- Column Name Cleaning ---
Original -> Cleaned:
'ID' -> 'id'
'Crash ID' -> 'crash_id'
'State' -> 'state'
'Month' -> 'month'
'Year' -> 'year'
'Dayweek' -> 'dayweek'
'Time' -> 'time'
'Crash Type' -> 'crash_type'
'Number Fatalities' -> 'number_fatalities'
'Bus Involvement' -> 'bus_involvement'
'Heavy Rigid Truck Involvement' -> 'heavy_rigid_truck_involvement'
'Articulated Truck Involvement' -> 'articulated_truck_involvement'
'Speed Limit' -> 'speed_limit'
'Road User' -> 'road_user'
'Gender' -> 'gender'
'Age' -> 'age'
'National Remoteness Areas' -> 'national_remoteness_areas'
'SA4 Name 2021' -> 'sa4_name_2021'
'National LGA Name 2024' -> 'national_lga_name_2024'
'National Road Type' -> 'national_road_type'
'Christmas Period' -> 'christmas_period'
'Easter Period' -> 'easter_period'
'Age Group' -> 'age_group'
'Day of week' -> 'day_of_week'
'Time of day' -> 'time_of_day'

--- DataFrame Info After Cleaning ---


Unnamed: 0,id,crash_id,state,month,year,dayweek,time,crash_type,number_fatalities,bus_involvement,...,age,national_remoteness_areas,sa4_name_2021,national_lga_name_2024,national_road_type,christmas_period,easter_period,age_group,day_of_week,time_of_day
0,1,20241115,NSW,12,2024,Friday,4:00,Single,1,No,...,74,Inner Regional Australia,Riverina,Wagga Wagga,Arterial Road,Yes,No,65_to_74,Weekday,Night
1,2,20241125,NSW,12,2024,Friday,6:15,Single,1,No,...,19,Inner Regional Australia,Sydney - Baulkham Hills and Hawkesbury,Hawkesbury,Local Road,No,No,17_to_25,Weekday,Day
2,3,20246013,TAS,12,2024,Friday,9:43,Single,1,No,...,33,Inner Regional Australia,Launceston and North East,Northern Midlands,Local Road,Yes,No,26_to_39,Weekday,Day
3,4,20241002,NSW,12,2024,Friday,10:35,Single,1,No,...,32,Outer Regional Australia,New England and North West,Armidale,National or State Highway,No,No,26_to_39,Weekday,Day
4,5,20243185,QLD,12,2024,Friday,13:00,Single,1,No,...,61,Inner Regional Australia,Toowoomba,Lockyer Valley,National or State Highway,No,No,40_to_64,Weekday,Day


## Cell 3: Create Location Nodes AND Extract SA4-State Link Data

In [None]:
print("--- Creating Location Node Files & Extracting Link Data---")

# Each location node file has two columns: the name, which also serves as the
# node ID within its ID space (State / SA4 / LGA), and the node label.

# --- State nodes---
# One node per distinct, non-missing state value.
states_nodes = df[['state']].drop_duplicates().dropna()
states_nodes.rename(columns={'state': 'name:ID(State)'}, inplace=True)
states_nodes[':LABEL'] = 'State'
states_path = os.path.join(OUTPUT_DIR, 'states.csv')
states_nodes.to_csv(states_path, index=False, encoding='utf-8')
print(f"Saved {len(states_nodes)} State nodes to {states_path}")
display(states_nodes.head())

# --- SA4 nodes & SA4-State Link Data  ---
# Distinct (SA4, state) pairs for the SA4-[:IN_STATE]->State relationship.
# A pair is only usable when both ends are present, hence the dropna().
sa4_link_data = df[['sa4_name_2021', 'state']].drop_duplicates().dropna()
# Build the SA4 nodes from the SA4 column itself rather than from sa4_link_data:
# the link data has already discarded rows with a missing state, so an SA4 that
# only ever appears on such rows would get no node even though crashes can
# still reference it by name.
sa4_nodes = df[['sa4_name_2021']].dropna().drop_duplicates().copy()
sa4_nodes.rename(columns={'sa4_name_2021': 'name:ID(SA4)'}, inplace=True)
sa4_nodes[':LABEL'] = 'SA4'
sa4s_path = os.path.join(OUTPUT_DIR, 'sa4s.csv')
sa4_nodes.to_csv(sa4s_path, index=False, encoding='utf-8')
print(f"Saved {len(sa4_nodes)} SA4 nodes to {sa4s_path}")
display(sa4_nodes.head())

# --- LGAs Nodes ---
# One node per distinct, non-missing LGA name.
# NOTE(review): the LGA name alone is the node ID, so two LGAs sharing a name
# would be merged into one node -- confirm names are unique in this dataset.
lga_nodes = df[['national_lga_name_2024']].drop_duplicates().dropna().copy()
lga_nodes.rename(columns={'national_lga_name_2024': 'name:ID(LGA)'}, inplace=True)
lga_nodes[':LABEL'] = 'LGA'
lgas_path = os.path.join(OUTPUT_DIR, 'lgas.csv')
lga_nodes.to_csv(lgas_path, index=False, encoding='utf-8')
print(f"Saved {len(lga_nodes)} LGA nodes to {lgas_path}")
display(lga_nodes.head())

print("--- Location Node Files Created & SA4-State Link Data Extracted ---")

--- Creating Location Node Files & Extracting Link Data---
Saved 8 State nodes to ../data/import_final/states.csv


Unnamed: 0,name:ID(State),:LABEL
0,NSW,State
2,TAS,State
4,QLD,State
5,SA,State
73,VIC,State


Saved 88 SA4 nodes to ../data/import_final/sa4s.csv


Unnamed: 0,name:ID(SA4),:LABEL
0,Riverina,SA4
1,Sydney - Baulkham Hills and Hawkesbury,SA4
2,Launceston and North East,SA4
3,New England and North West,SA4
4,Toowoomba,SA4


Saved 509 LGA nodes to ../data/import_final/lgas.csv


Unnamed: 0,name:ID(LGA),:LABEL
0,Wagga Wagga,LGA
1,Hawkesbury,LGA
2,Northern Midlands,LGA
3,Armidale,LGA
4,Lockyer Valley,LGA


--- Location Node Files Created & SA4-State Link Data Extracted ---


## Cell 4: Create Unique Crash Nodes

In [None]:
print("--- Creating Unique Crash Node File & Crash Data for Relationships ---")

# One row per crash: the first record for each crash_id is kept together with
# every cleaned column, including the LGA and SA4 names.
crash_data = (
    df.drop_duplicates(subset=['crash_id'], keep='first')
      .reset_index(drop=True)
      .copy()
)

# The fresh 0..n-1 row index is used as the Crash node ID.
crash_data['internalCrashID:ID(Crash)'] = crash_data.index

# Flag columns are reduced to lower-case 'yes' / 'no'; any other value
# (including missing ones) is written as 'unknown'.
yes_no_cols = ['bus_involvement', 'heavy_rigid_truck_involvement',
               'articulated_truck_involvement', 'christmas_period', 'easter_period']
for flag_col in yes_no_cols:
    if flag_col not in crash_data.columns:
        continue
    lowered = crash_data[flag_col].astype(str).str.lower()
    crash_data[flag_col] = lowered.where(lowered.isin(['yes', 'no']), 'unknown')

# Integer columns: unparseable values become 0.
for int_col in ('year', 'month', 'number_fatalities'):
    parsed = pd.to_numeric(crash_data[int_col], errors='coerce')
    crash_data[int_col] = parsed.fillna(0).astype(int)
# speed_limit keeps missing values by using the nullable integer dtype.
crash_data['speed_limit'] = pd.to_numeric(
    crash_data['speed_limit'], errors='coerce'
).astype('Int64')

# Source column -> header used in crashes.csv.
# LGA and SA4 names stay in crash_data but are deliberately left out here:
# they are attached to crashes through relationships instead of properties.
crash_node_properties_map = {
    'internalCrashID:ID(Crash)': 'internalCrashID:ID(Crash)',
    'crash_id': 'crashID_orig',  # original crash_id kept as a plain property
    'year': 'year',
    'month': 'month',
    'dayweek': 'dayweek',
    'time': 'time',
    'crash_type': 'crashType',
    'number_fatalities': 'numberFatalities',
    'bus_involvement': 'busInvolvement',
    'heavy_rigid_truck_involvement': 'heavyRigidTruckInvolvement',
    'articulated_truck_involvement': 'articulatedTruckInvolvement',
    'speed_limit': 'speedLimit',
    'national_road_type': 'nationalRoadType',
    'christmas_period': 'christmasPeriod',
    'easter_period': 'easterPeriod',
    'national_remoteness_areas': 'nationalRemotenessAreas',
    'day_of_week': 'dayOfWeekType',
    'time_of_day': 'timeOfDay',
}

# Node file: selected columns, renamed, plus the label column.
crash_nodes_for_csv = crash_data[list(crash_node_properties_map)].rename(
    columns=crash_node_properties_map
)
crash_nodes_for_csv[':LABEL'] = 'Crash'

crashes_path = os.path.join(OUTPUT_DIR, 'crashes.csv')
crash_nodes_for_csv.to_csv(crashes_path, index=False, encoding='utf-8')
print(f"Saved {len(crash_nodes_for_csv)} unique Crash nodes to {crashes_path}")

# Lookup from the original crash_id (as index) to the internal Crash node ID.
crash_id_map = crash_data.set_index('crash_id')[['internalCrashID:ID(Crash)']].copy()
print("Created crash_id to internalCrashID map for Person->Crash linking.")
display(crash_data[['crash_id', 'national_lga_name_2024', 'sa4_name_2021']].head())

--- Creating Unique Crash Node File & Crash Data for Relationships ---
Saved 9683 unique Crash nodes to ../data/import_final/crashes.csv
Created crash_id to internalCrashID map for Person->Crash linking.


Unnamed: 0,crash_id,national_lga_name_2024,sa4_name_2021
0,20241115,Wagga Wagga,Riverina
1,20241125,Hawkesbury,Sydney - Baulkham Hills and Hawkesbury
2,20246013,Northern Midlands,Launceston and North East
3,20241002,Armidale,New England and North West
4,20243185,Lockyer Valley,Toowoomba


## Cell 5: Create Person Nodes

In [17]:
print("--- Creating Person Node File ---")

# --- Handle Data Types ---
# Ages that cannot be parsed as numbers are stored as -1. This overwrites the
# 'age' column of df itself.
age_numeric = pd.to_numeric(df['age'], errors='coerce')
df['age'] = age_numeric.fillna(-1).astype(int)

# --- Select and Rename Columns for Neo4j ---
# Source column -> header used in persons.csv; 'id' becomes the Person node ID.
person_node_cols = {
    'id': 'personID:ID(Person)',
    'road_user': 'roadUser',
    'gender': 'gender',
    'age': 'age',
    'age_group': 'ageGroup',
}
person_nodes = df[list(person_node_cols)].rename(columns=person_node_cols)
person_nodes[':LABEL'] = 'Person'

# --- Save Person Nodes ---
persons_path = os.path.join(OUTPUT_DIR, 'persons.csv')
person_nodes.to_csv(persons_path, index=False, encoding='utf-8')

print(f"Saved {len(person_nodes)} Person nodes to {persons_path}")
display(person_nodes.head())

--- Creating Person Node File ---
Saved 10490 Person nodes to ../data/import_final/persons.csv


Unnamed: 0,personID:ID(Person),roadUser,gender,age,ageGroup,:LABEL
0,1,Driver,Male,74,65_to_74,Person
1,2,Driver,Female,19,17_to_25,Person
2,3,Driver,Female,33,26_to_39,Person
3,4,Driver,Female,32,26_to_39,Person
4,5,Passenger,Female,61,40_to_64,Person


## Cell 6: Create Relationship Files

In [None]:
print("--- Creating Relationship Files ---")

# Each relationship file has a start-ID column, an end-ID column and a :TYPE
# column. The four steps are best-effort: a failure in one is reported and the
# remaining steps still run. Failed relationship types are collected so the
# closing message does not claim success when a file was not written.
failed_relationships = []

# --- 1. SA4 -> State (:IN_STATE) ---
# Uses sa4_link_data (distinct SA4-State pairs with both values present).
try:
    sa4_state_rels = sa4_link_data.copy()
    sa4_state_rels.rename(columns={'sa4_name_2021': ':START_ID(SA4)', 'state': ':END_ID(State)'}, inplace=True)
    sa4_state_rels[':TYPE'] = 'IN_STATE'
    rels_sa4_state_path = os.path.join(OUTPUT_DIR, 'rels_sa4_state.csv')
    sa4_state_rels.to_csv(rels_sa4_state_path, index=False, encoding='utf-8')
    print(f"Saved {len(sa4_state_rels)} :IN_STATE (SA4->State) relationships to {rels_sa4_state_path}")
except Exception as e:
    print(f"Error in :IN_STATE (SA4->State) relationships: {e}")
    failed_relationships.append('IN_STATE')

# --- 2. Crash -> LGA (:OCCURRED_IN_LGA) ---
# Uses crash_data (one row per crash, with internalCrashID and national_lga_name_2024).
try:
    # Crashes without an LGA name get no relationship.
    crash_lga_rels = crash_data[['internalCrashID:ID(Crash)', 'national_lga_name_2024']].dropna().copy()
    crash_lga_rels.rename(columns={
        'internalCrashID:ID(Crash)': ':START_ID(Crash)',
        'national_lga_name_2024': ':END_ID(LGA)'
    }, inplace=True)
    crash_lga_rels[':TYPE'] = 'OCCURRED_IN_LGA'
    rels_crash_lga_path = os.path.join(OUTPUT_DIR, 'rels_crash_lga.csv')
    crash_lga_rels.to_csv(rels_crash_lga_path, index=False, encoding='utf-8')
    print(f"Saved {len(crash_lga_rels)} :OCCURRED_IN_LGA (Crash->LGA) relationships to {rels_crash_lga_path}")
except Exception as e:
    print(f"Error in :OCCURRED_IN_LGA (Crash->LGA) relationships: {e}")
    failed_relationships.append('OCCURRED_IN_LGA')

# --- 3. Crash -> SA4 (:OCCURRED_IN_SA4) ---
# Uses crash_data (one row per crash, with internalCrashID and sa4_name_2021).
try:
    # Crashes without an SA4 name get no relationship.
    crash_sa4_rels = crash_data[['internalCrashID:ID(Crash)', 'sa4_name_2021']].dropna().copy()
    crash_sa4_rels.rename(columns={
        'internalCrashID:ID(Crash)': ':START_ID(Crash)',
        'sa4_name_2021': ':END_ID(SA4)'
    }, inplace=True)
    crash_sa4_rels[':TYPE'] = 'OCCURRED_IN_SA4'
    rels_crash_sa4_path = os.path.join(OUTPUT_DIR, 'rels_crash_sa4.csv')
    crash_sa4_rels.to_csv(rels_crash_sa4_path, index=False, encoding='utf-8')
    print(f"Saved {len(crash_sa4_rels)} :OCCURRED_IN_SA4 (Crash->SA4) relationships to {rels_crash_sa4_path}")
except Exception as e:
    print(f"Error in :OCCURRED_IN_SA4 (Crash->SA4) relationships: {e}")
    failed_relationships.append('OCCURRED_IN_SA4')

# --- 4. Person -> Crash (:WAS_INVOLVED_IN) ---
# Joins every person row in df to the internal Crash ID via crash_id_map
# (indexed by the original crash_id).
try:
    df_merged_person = df.merge(crash_id_map, left_on='crash_id', right_index=True, how='inner')
    person_crash_rels = df_merged_person[['id', 'internalCrashID:ID(Crash)']].copy()
    person_crash_rels.rename(columns={'id': ':START_ID(Person)', 'internalCrashID:ID(Crash)': ':END_ID(Crash)'}, inplace=True)
    person_crash_rels[':TYPE'] = 'WAS_INVOLVED_IN'
    rels_person_crash_path = os.path.join(OUTPUT_DIR, 'rels_person_crash.csv')
    person_crash_rels.to_csv(rels_person_crash_path, index=False, encoding='utf-8')
    print(f"Saved {len(person_crash_rels)} :WAS_INVOLVED_IN (Person->Crash) relationships to {rels_person_crash_path}")
except Exception as e:
    print(f"Error in :WAS_INVOLVED_IN (Person->Crash) relationships: {e}")
    failed_relationships.append('WAS_INVOLVED_IN')

# Only announce completion when every file was actually written.
if failed_relationships:
    print(f"\n--- Relationship File Generation Finished With Errors in: {', '.join(failed_relationships)} ---")
else:
    print("\n--- All Relationship File Generation Complete ---")


--- Creating Relationship Files (Revised Model) ---
Saved 88 :IN_STATE (SA4->State) relationships to ../data/import_final/rels_sa4_state.csv
Saved 9683 :OCCURRED_IN_LGA (Crash->LGA) relationships to ../data/import_final/rels_crash_lga.csv
Saved 9683 :OCCURRED_IN_SA4 (Crash->SA4) relationships to ../data/import_final/rels_crash_sa4.csv
Saved 10490 :WAS_INVOLVED_IN (Person->Crash) relationships to ../data/import_final/rels_person_crash.csv

--- All Relationship File Generation Complete (Revised Model) ---
