# Setup and Configuration

In [3]:
import pandas as pd
import os
import logging
import re # For potential advanced cleaning

# --- Configuration ---
SOURCE_FILE = os.path.join('..', 'data', 'source', 'Project2_Dataset.csv') # Path relative to notebook location
OUTPUT_DIR_ORIGINAL = os.path.join('..', 'data', 'import_original')
OUTPUT_DIR_FILTERED = os.path.join('..', 'data', 'import_filtered') # Optional

# Filtering Configuration
DO_FILTERING = False # Set to True to generate filtered set
FILTER_YEAR_MIN = 2023

# Ensure output directories exist
os.makedirs(OUTPUT_DIR_ORIGINAL, exist_ok=True)
if DO_FILTERING:
    os.makedirs(OUTPUT_DIR_FILTERED, exist_ok=True)

# Logging (optional in notebook, print statements might be sufficient)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

print("Setup Complete. Configuration loaded.")

Setup Complete. Configuration loaded.


# Load and Initial Clean Data

In [4]:
# Function to clean column names (optional but helpful)
def clean_col_names(df):
    cols = df.columns
    new_cols = []
    for col in cols:
        # Remove special characters, replace spaces with underscores
        clean_col = re.sub(r'[^0-9a-zA-Z_]+', '', str(col).replace(' ', '_'))
        new_cols.append(clean_col)
    df.columns = new_cols
    return df

# Load the dataset
try:
    df_raw = pd.read_csv(SOURCE_FILE, low_memory=False)
    df = clean_col_names(df_raw.copy()) # Work on a copy with cleaned names
    print(f"Loaded {len(df)} rows from {SOURCE_FILE}")
    print("Initial DataFrame Info:")
    df.info()
    # Display first few rows to check
    display(df.head())
except FileNotFoundError:
    print(f"ERROR: Source file not found at {SOURCE_FILE}")
    # Stop execution or handle error appropriately

Loaded 10490 rows from ../data/source/Project2_Dataset.csv
Initial DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10490 entries, 0 to 10489
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   ID                             10490 non-null  int64 
 1   Crash_ID                       10490 non-null  int64 
 2   State                          10490 non-null  object
 3   Month                          10490 non-null  int64 
 4   Year                           10490 non-null  int64 
 5   Dayweek                        10490 non-null  object
 6   Time                           10490 non-null  object
 7   Crash_Type                     10490 non-null  object
 8   Number_Fatalities              10490 non-null  int64 
 9   Bus_Involvement                10490 non-null  object
 10  Heavy_Rigid_Truck_Involvement  10490 non-null  object
 11  Articulated_Truck_Involvement  10490

Unnamed: 0,ID,Crash_ID,State,Month,Year,Dayweek,Time,Crash_Type,Number_Fatalities,Bus_Involvement,...,Age,National_Remoteness_Areas,SA4_Name_2021,National_LGA_Name_2024,National_Road_Type,Christmas_Period,Easter_Period,Age_Group,Day_of_week,Time_of_day
0,1,20241115,NSW,12,2024,Friday,4:00,Single,1,No,...,74,Inner Regional Australia,Riverina,Wagga Wagga,Arterial Road,Yes,No,65_to_74,Weekday,Night
1,2,20241125,NSW,12,2024,Friday,6:15,Single,1,No,...,19,Inner Regional Australia,Sydney - Baulkham Hills and Hawkesbury,Hawkesbury,Local Road,No,No,17_to_25,Weekday,Day
2,3,20246013,TAS,12,2024,Friday,9:43,Single,1,No,...,33,Inner Regional Australia,Launceston and North East,Northern Midlands,Local Road,Yes,No,26_to_39,Weekday,Day
3,4,20241002,NSW,12,2024,Friday,10:35,Single,1,No,...,32,Outer Regional Australia,New England and North West,Armidale,National or State Highway,No,No,26_to_39,Weekday,Day
4,5,20243185,QLD,12,2024,Friday,13:00,Single,1,No,...,61,Inner Regional Australia,Toowoomba,Lockyer Valley,National or State Highway,No,No,40_to_64,Weekday,Day


# 1. Process Location Nodes (State, SA4, LGA)

In [6]:
print("\n--- Processing Location Nodes ---")

# States
states = df[['State']].drop_duplicates().reset_index(drop=True)
states.rename(columns={'State': 'name:ID(State)'}, inplace=True)
states[':LABEL'] = 'State'
states_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'states.csv')
states.to_csv(states_path, index=False, encoding='utf-8')
print(f"Generated {len(states)} State nodes -> {states_path}")
display(states.head())

# SA4s
sa4s_data = df[['SA4_Name_2021', 'State']].drop_duplicates().reset_index(drop=True)
sa4s_nodes = sa4s_data[['SA4_Name_2021']].copy()
sa4s_nodes.rename(columns={'SA4_Name_2021': 'name:ID(SA4)'}, inplace=True)
sa4s_nodes[':LABEL'] = 'SA4'
sa4s_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'sa4s.csv')
sa4s_nodes.to_csv(sa4s_path, index=False, encoding='utf-8')
print(f"Generated {len(sa4s_nodes)} SA4 nodes -> {sa4s_path}")
display(sa4s_nodes.head())


# LGAs
lgas_data = df[['National_LGA_Name_2024', 'SA4_Name_2021']].drop_duplicates().reset_index(drop=True)
lga_nodes = lgas_data[['National_LGA_Name_2024']].copy()
lga_nodes.rename(columns={'National_LGA_Name_2024': 'name:ID(LGA)'}, inplace=True)
lga_nodes[':LABEL'] = 'LGA'
lgas_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'lgas.csv')
lga_nodes.to_csv(lgas_path, index=False, encoding='utf-8')
print(f"Generated {len(lga_nodes)} LGA nodes -> {lgas_path}")
display(lga_nodes.head())


--- Processing Location Nodes ---
Generated 8 State nodes -> ../data/import_original/states.csv


Unnamed: 0,name:ID(State),:LABEL
0,NSW,State
1,TAS,State
2,QLD,State
3,SA,State
4,VIC,State


Generated 88 SA4 nodes -> ../data/import_original/sa4s.csv


Unnamed: 0,name:ID(SA4),:LABEL
0,Riverina,SA4
1,Sydney - Baulkham Hills and Hawkesbury,SA4
2,Launceston and North East,SA4
3,New England and North West,SA4
4,Toowoomba,SA4


Generated 582 LGA nodes -> ../data/import_original/lgas.csv


Unnamed: 0,name:ID(LGA),:LABEL
0,Wagga Wagga,LGA
1,Hawkesbury,LGA
2,Northern Midlands,LGA
3,Armidale,LGA
4,Lockyer Valley,LGA


# 2. Process Unique Crash Nodes

In [8]:
print("\n--- Processing Unique Crash Nodes ---")

# Deduplicate based on Crash ID
crash_data = df.drop_duplicates(subset=['Crash_ID'], keep='first').reset_index(drop=True).copy()

# Generate internal ID
crash_data['internalCrashID:ID(Crash)'] = crash_data.index

# Handle Speed Limit conversion (Example: fill missing with -1, adjust as needed)
crash_data['Speed_Limit'] = pd.to_numeric(crash_data['Speed_Limit'], errors='coerce')
crash_data['Speed_Limit'].fillna(-1, inplace=True) # Or choose another strategy
crash_data['Speed_Limit'] = crash_data['Speed_Limit'].astype(int) # Convert after filling NaN

# Define columns and rename
crash_columns_map = {
    'internalCrashID:ID(Crash)': 'internalCrashID:ID(Crash)',
    'Crash_ID': 'crashID', 'Year': 'year', 'Month': 'month', 'Dayweek': 'dayweek', 'Time': 'time',
    'Crash_Type': 'crashType', 'Number_Fatalities': 'numberFatalities', 'Bus_Involvement': 'busInvolvement',
    'Heavy_Rigid_Truck_Involvement': 'heavyRigidTruckInvolvement', 'Articulated_Truck_Involvement': 'articulatedTruckInvolvement',
    'Speed_Limit': 'speedLimit', 'National_Road_Type': 'nationalRoadType', 'Christmas_Period': 'christmasPeriod',
    'Easter_Period': 'easterPeriod', 'National_Remoteness_Areas': 'nationalRemotenessAreas',
    'Day_of_week': 'dayOfWeekType', 'Time_of_day': 'timeOfDay'
}
crash_nodes = crash_data[list(crash_columns_map.keys())].rename(columns=crash_columns_map)
crash_nodes[':LABEL'] = 'Crash'

# Save
crashes_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'crashes.csv')
crash_nodes.to_csv(crashes_path, index=False, encoding='utf-8')
print(f"Generated {len(crash_nodes)} unique Crash nodes -> {crashes_path}")
display(crash_nodes.head())

# Create Crash_ID to internalCrashID mapping for relationships
crash_id_map = crash_nodes[['crashID', 'internalCrashID:ID(Crash)']].set_index('crashID')
print("Created Crash ID map.")


--- Processing Unique Crash Nodes ---
Generated 9683 unique Crash nodes -> ../data/import_original/crashes.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  crash_data['Speed_Limit'].fillna(-1, inplace=True) # Or choose another strategy


Unnamed: 0,internalCrashID:ID(Crash),crashID,year,month,dayweek,time,crashType,numberFatalities,busInvolvement,heavyRigidTruckInvolvement,articulatedTruckInvolvement,speedLimit,nationalRoadType,christmasPeriod,easterPeriod,nationalRemotenessAreas,dayOfWeekType,timeOfDay,:LABEL
0,0,20241115,2024,12,Friday,4:00,Single,1,No,No,No,100,Arterial Road,Yes,No,Inner Regional Australia,Weekday,Night,Crash
1,1,20241125,2024,12,Friday,6:15,Single,1,No,No,No,80,Local Road,No,No,Inner Regional Australia,Weekday,Day,Crash
2,2,20246013,2024,12,Friday,9:43,Single,1,No,No,No,50,Local Road,Yes,No,Inner Regional Australia,Weekday,Day,Crash
3,3,20241002,2024,12,Friday,10:35,Single,1,No,No,No,100,National or State Highway,No,No,Outer Regional Australia,Weekday,Day,Crash
4,4,20243185,2024,12,Friday,13:00,Single,1,No,No,No,100,National or State Highway,No,No,Inner Regional Australia,Weekday,Day,Crash


Created Crash ID map.


# 3. Process Person Nodes

In [9]:
print("\n--- Processing Person Nodes ---")

# Use the original unique 'ID' column as the person identifier
person_columns_map = {
    'ID': 'personID:ID(Person)', 
    'Road_User': 'roadUser',
    'Gender': 'gender',
    'Age': 'age',
    'Age_Group': 'ageGroup'
}
# Select these columns directly from the original df (no deduplication needed here)
person_nodes = df[list(person_columns_map.keys())].rename(columns=person_columns_map)
person_nodes[':LABEL'] = 'Person'

persons_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'persons.csv')
person_nodes.to_csv(persons_path, index=False, encoding='utf-8')
print(f"Generated {len(person_nodes)} Person nodes -> {persons_path}")
display(person_nodes.head())



--- Processing Person Nodes ---
Generated 10490 Person nodes -> ../data/import_original/persons.csv


Unnamed: 0,personID:ID(Person),roadUser,gender,age,ageGroup,:LABEL
0,1,Driver,Male,74,65_to_74,Person
1,2,Driver,Female,19,17_to_25,Person
2,3,Driver,Female,33,26_to_39,Person
3,4,Driver,Female,32,26_to_39,Person
4,5,Passenger,Female,61,40_to_64,Person


# generate relationship files

## 1. SA4 -> State Relationships
Need 'Statistical_Areas_Level_4_SA4_Name_2021' and 'State' from sa4s_data

In [10]:
try:
    sa4_state_rels = sa4s_data.copy()
    sa4_state_rels.rename(columns={'Statistical_Areas_Level_4_SA4_Name_2021': ':START_ID(SA4)',
                                   'State': ':END_ID(State)'}, inplace=True)
    sa4_state_rels[':TYPE'] = 'IN_STATE'
    rels_sa4_state_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'rels_sa4_state.csv')
    sa4_state_rels.to_csv(rels_sa4_state_path, index=False, encoding='utf-8')
    print(f"Generated {len(sa4_state_rels)} SA4->State relationships -> {rels_sa4_state_path}")
    display(sa4_state_rels.head())
except NameError:
    print("ERROR: DataFrame 'sa4s_data' not found.")
except KeyError as e:
    print(f"ERROR: Missing expected columns in 'sa4s_data'. Error: {e}")

Generated 88 SA4->State relationships -> ../data/import_original/rels_sa4_state.csv


Unnamed: 0,SA4_Name_2021,:END_ID(State),:TYPE
0,Riverina,NSW,IN_STATE
1,Sydney - Baulkham Hills and Hawkesbury,NSW,IN_STATE
2,Launceston and North East,TAS,IN_STATE
3,New England and North West,NSW,IN_STATE
4,Toowoomba,QLD,IN_STATE


## 2. LGA -> SA4 Relationships
Need 'National_Local_Government_Areas_LGAs_Name_2024' and 'Statistical_Areas_Level_4_SA4_Name_2021' from lgas_data

In [14]:

try:
    lga_sa4_rels = lgas_data.copy()
    lga_sa4_rels.rename(columns={'National_LGA_Name_2024': ':START_ID(LGA)', # Renamed here
                                 'SA4_Name_2021': ':END_ID(SA4)'}, inplace=True) # Renamed here
    lga_sa4_rels[':TYPE'] = 'PART_OF'
    rels_lga_sa4_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'rels_lga_sa4.csv')

    # Ensure the SA4 name exists before saving (optional robustness check)
    # *** The error is likely happening HERE ***
    lga_sa4_rels.dropna(subset=[':START_ID(LGA)', ':END_ID(SA4)'], inplace=True)

    lga_sa4_rels.to_csv(rels_lga_sa4_path, index=False, encoding='utf-8')
    print(f"Generated {len(lga_sa4_rels)} LGA->SA4 relationships -> {rels_lga_sa4_path}")
    display(lga_sa4_rels.head())
except NameError:
    print("ERROR: DataFrame 'lgas_data' not found.")
except KeyError as e:
    # This is the error being caught, but it happened inside the 'try' block
    print(f"ERROR: Missing expected columns in 'lgas_data' or renamed DataFrame. Error: {e}")

Generated 582 LGA->SA4 relationships -> ../data/import_original/rels_lga_sa4.csv


Unnamed: 0,:START_ID(LGA),:END_ID(SA4),:TYPE
0,Wagga Wagga,Riverina,PART_OF
1,Hawkesbury,Sydney - Baulkham Hills and Hawkesbury,PART_OF
2,Northern Midlands,Launceston and North East,PART_OF
3,Armidale,New England and North West,PART_OF
4,Lockyer Valley,Toowoomba,PART_OF


# 3. Person -> Crash Relationship
Need original 'df' (with 'ID' and 'Crash_ID') and 'crash_id_map'

In [15]:
try:
    # Merge original df (all fatality rows) with the crash_id_map
    # Use 'Crash_ID' from the cleaned 'df' and the index (which is 'crashID') from 'crash_id_map'
    df_merged_person = df.merge(crash_id_map, left_on='Crash_ID', right_index=True, how='inner')

    # Select the unique Person ID ('ID') and the corresponding unique internalCrashID
    person_crash_rels = df_merged_person[['ID', 'internalCrashID:ID(Crash)']].copy()

    # Rename for Neo4j import: START is Person ID, END is internal Crash ID
    person_crash_rels.rename(columns={'ID': ':START_ID(Person)',
                                      'internalCrashID:ID(Crash)': ':END_ID(Crash)'}, inplace=True)
    person_crash_rels[':TYPE'] = 'WAS_INVOLVED_IN'
    rels_person_crash_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'rels_person_crash.csv')
    person_crash_rels.to_csv(rels_person_crash_path, index=False, encoding='utf-8')
    print(f"Generated {len(person_crash_rels)} Person->Crash relationships -> {rels_person_crash_path}")
    display(person_crash_rels.head())
except NameError:
    print("ERROR: DataFrame 'df' or 'crash_id_map' not found. Ensure Cells 2 and 4 executed correctly.")
except KeyError as e:
    print(f"ERROR: Missing expected columns ('ID', 'Crash_ID', 'internalCrashID:ID(Crash)'). Error: {e}")
except Exception as e:
    print(f"An unexpected error occurred during Person->Crash relationship generation: {e}")


Generated 10490 Person->Crash relationships -> ../data/import_original/rels_person_crash.csv


Unnamed: 0,:START_ID(Person),:END_ID(Crash),:TYPE
0,1,0,WAS_INVOLVED_IN
1,2,1,WAS_INVOLVED_IN
2,3,2,WAS_INVOLVED_IN
3,4,3,WAS_INVOLVED_IN
4,5,4,WAS_INVOLVED_IN


# 4. Crash -> LGA Relationship
Need 'crash_data' (from Cell 4) which has 'internalCrashID:ID(Crash)' and 'National_LGA_Name_2024'

In [16]:
try:
    crash_lga_rels = crash_data[['internalCrashID:ID(Crash)', 'National_LGA_Name_2024']].copy()
    crash_lga_rels.rename(columns={'internalCrashID:ID(Crash)': ':START_ID(Crash)',
                                 'National_LGA_Name_2024': ':END_ID(LGA)'}, inplace=True)
    crash_lga_rels[':TYPE'] = 'OCCURRED_IN'
    # Remove rows where LGA name might be missing to avoid errors during import
    crash_lga_rels.dropna(subset=[':START_ID(Crash)', ':END_ID(LGA)'], inplace=True)
    rels_crash_lga_path = os.path.join(OUTPUT_DIR_ORIGINAL, 'rels_crash_lga.csv')
    crash_lga_rels.to_csv(rels_crash_lga_path, index=False, encoding='utf-8')
    print(f"Generated {len(crash_lga_rels)} Crash->LGA relationships -> {rels_crash_lga_path}")
    display(crash_lga_rels.head())
except NameError:
    print("ERROR: DataFrame 'crash_data' not found. Ensure Cell 4 executed correctly.")
except KeyError as e:
    print(f"ERROR: Missing expected columns in 'crash_data'. Error: {e}")


print("\n--- All Relationship File Generation Complete ---")

Generated 9683 Crash->LGA relationships -> ../data/import_original/rels_crash_lga.csv


Unnamed: 0,:START_ID(Crash),:END_ID(LGA),:TYPE
0,0,Wagga Wagga,OCCURRED_IN
1,1,Hawkesbury,OCCURRED_IN
2,2,Northern Midlands,OCCURRED_IN
3,3,Armidale,OCCURRED_IN
4,4,Lockyer Valley,OCCURRED_IN



--- All Relationship File Generation Complete ---
