In [1]:
# Import Library
import pandas as pd

## Load data

In [31]:

# Load fatal crashes data

# Load data from a specific sheet named "BITRE_Fatal_Crash",skipping non-data rows
crash_df = pd.read_excel(
    'bitre_fatal_crashes_dec2024.xlsx', 
    sheet_name='BITRE_Fatal_Crash',
    skiprows=4
)

# Parse time column safely and extract hour
crash_df['Time'] = crash_df['Time'].astype(str).str.strip()
crash_df['Time'] = pd.to_datetime(crash_df['Time'], format='%H:%M:%S', errors='coerce').dt.hour
crash_df['Time'] = crash_df['Time'].fillna(0).astype(int)

print("Fatal Crashes Data (Sheet: BITRE_Fatal_Crash):")
crash_df.head()


Fatal Crashes Data (Sheet: BITRE_Fatal_Crash):


Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Number Fatalities,Bus \nInvolvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,Speed Limit,National Remoteness Areas,SA4 Name 2021,National LGA Name 2021,National Road Type,Christmas Period,Easter Period,Day of week,Time of Day
0,20241115,NSW,12,2024,Friday,4,Single,1,No,No,No,100,Inner Regional Australia,Riverina,Wagga Wagga,Arterial Road,Yes,No,Weekday,Night
1,20241125,NSW,12,2024,Friday,6,Single,1,No,No,No,80,Inner Regional Australia,Sydney - Baulkham Hills and Hawkesbury,Hawkesbury,Local Road,No,No,Weekday,Day
2,20246013,Tas,12,2024,Friday,9,Multiple,1,No,No,No,50,Inner Regional Australia,Launceston and North East,Northern Midlands,Local Road,Yes,No,Weekday,Day
3,20241002,NSW,12,2024,Friday,10,Multiple,1,No,No,No,100,Outer Regional Australia,New England and North West,Armidale Regional,National or State Highway,No,No,Weekday,Day
4,20242261,Vic,12,2024,Friday,11,Multiple,1,-9,-9,-9,-9,Unknown,,,Undetermined,No,No,Weekday,Day


In [33]:
# Load fatalities data

fatality_df = pd.read_excel('bitre_fatalities_dec2024.xlsx', sheet_name='BITRE_Fatality', skiprows=4)

fatality_df['Time'] = fatality_df['Time'].astype(str).str.strip()
fatality_df['Time'] = pd.to_datetime(fatality_df['Time'], format='%H:%M:%S', errors='coerce').dt.hour
fatality_df['Time'] = fatality_df['Time'].fillna(0).astype(int)

fatality_df.head()

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Bus Involvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,...,Age,National Remoteness Areas,SA4 Name 2021,National LGA Name 2021,National Road Type,Christmas Period,Easter Period,Age Group,Day of week,Time of day
0,20241115,NSW,12,2024,Friday,4,Single,No,No,No,...,74,Inner Regional Australia,Riverina,Wagga Wagga,Arterial Road,Yes,No,65_to_74,Weekday,Night
1,20241125,NSW,12,2024,Friday,6,Single,No,No,No,...,19,Inner Regional Australia,Sydney - Baulkham Hills and Hawkesbury,Hawkesbury,Local Road,No,No,17_to_25,Weekday,Day
2,20246013,Tas,12,2024,Friday,9,Multiple,No,No,No,...,33,Inner Regional Australia,Launceston and North East,Northern Midlands,Local Road,Yes,No,26_to_39,Weekday,Day
3,20241002,NSW,12,2024,Friday,10,Multiple,No,No,No,...,32,Outer Regional Australia,New England and North West,Armidale Regional,National or State Highway,No,No,26_to_39,Weekday,Day
4,20242261,Vic,12,2024,Friday,11,Multiple,-9,-9,-9,...,62,Unknown,,,Undetermined,No,No,40_to_64,Weekday,Day


## Dimension Tables

### 1.Time 

In [37]:

# Extract time-related columns from both datasets (excluding Crash ID)
crash_time_df = crash_df[['Year', 'Month', 'Dayweek', 'Time', 'Time of Day']].copy()


fatality_time_df = fatality_df[['Year', 'Month', 'Dayweek', 'Time', 'Time of day']].copy()



# Standardize column names
fatality_time_df.columns = ['Year', 'Month', 'Dayweek', 'Time', 'Time of Day']

# Combine both dataframes and remove duplicate time combinations
combined_time_df = pd.concat([crash_time_df, fatality_time_df], ignore_index=True)
combined_time_df.drop_duplicates(inplace=True)

# Generate a temporary unique key to identify each distinct time row
combined_time_df['Date_Key'] = (
    combined_time_df['Year'].astype(str) + '-' +
    combined_time_df['Month'].astype(str).str.zfill(2) + '-' +
    combined_time_df['Dayweek'].astype(str) + '-' +
    combined_time_df['Time'].astype(str)
)

# Drop duplicates based on the generated key and reset index
dim_time_df = combined_time_df.drop_duplicates(subset=['Date_Key']).reset_index(drop=True)

# Insert manual primary key (Date_ID)
dim_time_df.insert(0, 'Date_ID', range(1, len(dim_time_df) + 1))

# Drop the temporary key
dim_time_df.drop(columns=['Date_Key'], inplace=True)

# Export clean dim_time to CSV
dim_time_df.to_csv('dim_time.csv', index=False)

# Optional: preview
dim_time_df.head(50)



Unnamed: 0,Date_ID,Year,Month,Dayweek,Time,Time of Day
0,1,2024,12,Friday,4,Night
1,2,2024,12,Friday,6,Day
2,3,2024,12,Friday,9,Day
3,4,2024,12,Friday,10,Day
4,5,2024,12,Friday,11,Day
5,6,2024,12,Friday,13,Day
6,7,2024,12,Friday,17,Day
7,8,2024,12,Friday,19,Night
8,9,2024,12,Friday,22,Night
9,10,2024,12,Friday,23,Night


### 2.Location 

In [7]:
# Extract location-related fields from both datasets
crash_location_df = crash_df[['State', 'SA4 Name 2021', 'National LGA Name 2021', 'National Remoteness Areas']].copy()
fatality_location_df = fatality_df[['State', 'SA4 Name 2021', 'National LGA Name 2021', 'National Remoteness Areas']].copy()

# Rename columns for consistency
crash_location_df.columns = ['State', 'SA4_Name', 'LGA_Name', 'Remoteness_Category']
fatality_location_df.columns = ['State', 'SA4_Name', 'LGA_Name', 'Remoteness_Category']

# Combine both sources and remove duplicates
combined_location_df = pd.concat([crash_location_df, fatality_location_df], ignore_index=True)
dim_location_df = combined_location_df.drop_duplicates().reset_index(drop=True)

# Add a surrogate primary key: Location_ID
dim_location_df.insert(0, 'Location_ID', range(1, len(dim_location_df) + 1))

# Optional: save the dimension table to CSV
dim_location_df.to_csv('dim_location.csv', index=False)

# Display the first few rows
dim_location_df.head()


Unnamed: 0,Location_ID,State,SA4_Name,LGA_Name,Remoteness_Category
0,1,NSW,Riverina,Wagga Wagga,Inner Regional Australia
1,2,NSW,Sydney - Baulkham Hills and Hawkesbury,Hawkesbury,Inner Regional Australia
2,3,Tas,Launceston and North East,Northern Midlands,Inner Regional Australia
3,4,NSW,New England and North West,Armidale Regional,Outer Regional Australia
4,5,Vic,,,Unknown


### 3.Person 


In [8]:

# Extract relevant columns for the person dimension
person_df = fatality_df[['Age', 'Age Group', 'National Remoteness Areas']].copy()

# Rename columns to standardized names
person_df.columns = ['Age', 'Age_Group', 'Remoteness']

# Remove duplicate records to avoid redundancy in the dimension table
dim_person_df = person_df.drop_duplicates().reset_index(drop=True)

# Add a surrogate primary key as Person_ID
dim_person_df.insert(0, 'Person_ID', range(1, len(dim_person_df) + 1))

# Optional: save the dimension table to a CSV file
dim_person_df.to_csv('dim_person.csv', index=False)

# Display the resulting dimension table
dim_person_df.head()


Unnamed: 0,Person_ID,Age,Age_Group,Remoteness
0,1,74,65_to_74,Inner Regional Australia
1,2,19,17_to_25,Inner Regional Australia
2,3,33,26_to_39,Inner Regional Australia
3,4,32,26_to_39,Outer Regional Australia
4,5,62,40_to_64,Unknown


### 4.Vehicle 

In [9]:

# Extract relevant fields related to vehicle involvement
vehicle_df = crash_df[['Bus \nInvolvement', 'Heavy Rigid Truck Involvement', 'Articulated Truck Involvement']].copy()

# Rename columns to standardized names
vehicle_df.columns = ['Bus_Involved', 'Heavy_Truck', 'Articulated_Truck']

# Remove duplicate records to build a unique dimension table
dim_vehicle_df = vehicle_df.drop_duplicates().reset_index(drop=True)

# Add a surrogate primary key
dim_vehicle_df.insert(0, 'Vehicle_ID', range(1, len(dim_vehicle_df) + 1))

# Optional: save the dimension table to CSV
dim_vehicle_df.to_csv('dim_vehicle.csv', index=False)

# Display the resulting vehicle dimension table
dim_vehicle_df.head()


Unnamed: 0,Vehicle_ID,Bus_Involved,Heavy_Truck,Articulated_Truck
0,1,No,No,No
1,2,-9,-9,-9
2,3,No,No,Yes
3,4,Yes,No,Yes
4,5,No,Yes,No


### 5.Road Type

In [10]:

# Extract the relevant column for road type
road_df = crash_df[['National Road Type']].copy()

# Rename the column for consistency
road_df.columns = ['Road_Type']

# Remove rows with missing values and drop duplicates
road_df = road_df.dropna().drop_duplicates().reset_index(drop=True)

# Add a surrogate primary key
road_df.insert(0, 'Road_ID', range(1, len(road_df) + 1))

# Optional: save the dimension table to CSV
road_df.to_csv('dim_road_type.csv', index=False)

# Display the resulting road type dimension table
road_df.head()


Unnamed: 0,Road_ID,Road_Type
0,1,Arterial Road
1,2,Local Road
2,3,National or State Highway
3,4,Undetermined
4,5,Sub-arterial Road


### 6.Population

In [39]:

# Load the Excel sheet
pop_df = pd.read_excel(
    'Population estimates by LGA, Significant Urban Area, Remoteness Area, Commonwealth Electoral Division and State Electoral Division, 2001 to 2023.xlsx',
    sheet_name='Table 1',
    skiprows=5
)

# Extract 2021 data
pop_clean_df = pop_df[['Unnamed: 1', 2021]].copy()
pop_clean_df.columns = ['LGA_Name', 'Population']

# Drop rows with missing values and header-like rows
pop_clean_df = pop_clean_df.dropna()
pop_clean_df = pop_clean_df[pop_clean_df['LGA_Name'] != 'Local Government Area']

# Remove duplicates
pop_clean_df = pop_clean_df.drop_duplicates().reset_index(drop=True)

# Add surrogate key
pop_clean_df.insert(0, 'Population_ID', range(1, len(pop_clean_df) + 1))

#  REMOVE the Year column (not needed anymore)
# Do NOT add: pop_clean_df['Year'] = 2021

# Export
pop_clean_df.to_csv('dim_population.csv', index=False)

# Preview
pop_clean_df.head()



Unnamed: 0,Population_ID,LGA_Name,Population
0,1,Albury,56067
1,2,Armidale,29332
2,3,Ballina,46196
3,4,Balranald,2208
4,5,Bathurst,43674


### 7.Dwelling

In [12]:
# Load the dwelling count CSV file (skip initial 11 lines with headers/notes)
dwelling_df = pd.read_csv(
    'LGA (count of dwellings).csv',
    skiprows=11,
    usecols=[0, 1],  # Only read the LGA Name and Dwelling Count columns
    names=['LGA_Name', 'Dwelling_Count'],
    quotechar='"'
)

# Drop rows with missing values
dwelling_df = dwelling_df.dropna()

# Filter rows where Dwelling_Count is numeric (remove note rows or text lines)
dwelling_df = dwelling_df[
    dwelling_df['Dwelling_Count'].astype(str).str.replace(',', '').str.isnumeric()
]

# Remove commas and convert to integer
dwelling_df['Dwelling_Count'] = dwelling_df['Dwelling_Count'].str.replace(',', '').astype(int)

# Add a surrogate primary key
dwelling_df.insert(0, 'Dwelling_ID', range(1, len(dwelling_df) + 1))

# Optional: save to CSV file
dwelling_df.to_csv('dim_dwelling.csv', index=False)

# Display the result
dwelling_df.head()


Unnamed: 0,Dwelling_ID,LGA_Name,Dwelling_Count
0,1,Albury,25430
1,2,Armidale Regional,12955
2,3,Ballina,20889
3,4,Balranald,1091
4,5,Bathurst Regional,18458


### 8.Holiday

In [13]:

# Extract holiday-related columns
holiday_df = crash_df[['Christmas Period', 'Easter Period']].copy()

# Rename columns for clarity
holiday_df.columns = ['Christmas_Flag', 'Easter_Flag']

# Remove rows with missing values and drop duplicate combinations
holiday_df = holiday_df.dropna().drop_duplicates().reset_index(drop=True)

# Add a surrogate primary key
holiday_df.insert(0, 'Holiday_ID', range(1, len(holiday_df) + 1))

# Optional: save to CSV
holiday_df.to_csv('dim_holiday.csv', index=False)

# Display the result
holiday_df.head()


Unnamed: 0,Holiday_ID,Christmas_Flag,Easter_Flag
0,1,Yes,No
1,2,No,No
2,3,No,Yes


### 9.Speed Limit

In [17]:
# Extract the speed limit column
speed_df = crash_df[['Speed Limit']].copy()

# Rename column for consistency
speed_df.columns = ['Speed_Limit_Value']

# Remove missing values and invalid entries (e.g., -9 means unknown/missing)
speed_df = speed_df[speed_df['Speed_Limit_Value'].notna()]
speed_df = speed_df[speed_df['Speed_Limit_Value'] != -9]

# Drop duplicate speed values
speed_df = speed_df.drop_duplicates().reset_index(drop=True)

# Add a surrogate primary key
speed_df.insert(0, 'Speed_ID', range(1, len(speed_df) + 1))

# Clean the speed limit values
speed_df['Speed_Limit_Value'] = speed_df['Speed_Limit_Value'].replace('<40', '40')
speed_df['Speed_Limit_Value'] = speed_df['Speed_Limit_Value'].astype(int)


# Optional: save to CSV
speed_df.to_csv('dim_speed_limit.csv', index=False)

# Display the final speed limit dimension table
speed_df.head(50)


Unnamed: 0,Speed_ID,Speed_Limit_Value
0,1,100
1,2,80
2,3,50
3,4,90
4,5,60
5,6,70
6,7,110
7,8,40
8,9,20
9,10,10


### 10.Crash Type

In [18]:

# Extract the crash type column
crash_type_df = crash_df[['Crash Type']].copy()

# Rename column for consistency
crash_type_df.columns = ['Crash_Type']

# Drop rows with missing values and remove duplicates
crash_type_df = crash_type_df.dropna().drop_duplicates().reset_index(drop=True)

# Add a surrogate primary key
crash_type_df.insert(0, 'Crash_Type_ID', range(1, len(crash_type_df) + 1))

# Optional: save to CSV
crash_type_df.to_csv('dim_crash_type.csv', index=False)

# Display the result
crash_type_df.head()


Unnamed: 0,Crash_Type_ID,Crash_Type
0,1,Single
1,2,Multiple


## Fact Tables

### Fact Table 1：Fact_Crash

In [53]:
# Load dimension tables
dim_time = pd.read_csv('dim_time.csv')
dim_location = pd.read_csv('dim_location.csv')
dim_vehicle = pd.read_csv('dim_vehicle.csv')
dim_road_type = pd.read_csv('dim_road_type.csv')
dim_holiday = pd.read_csv('dim_holiday.csv')
dim_speed_limit = pd.read_csv('dim_speed_limit.csv')
dim_crash_type = pd.read_csv('dim_crash_type.csv')
dim_population = pd.read_csv('dim_population.csv')  # No 'Year' column now
dim_dwelling = pd.read_csv('dim_dwelling.csv')



# Rename dim_time columns to use underscores
dim_time.columns = [col.strip().replace(" ", "_") for col in dim_time.columns]

# Merge with dim_time
fact_crash = crash_df.merge(
    dim_time,
    left_on=['Year', 'Month', 'Dayweek', 'Time', 'Time of Day'],
    right_on=['Year', 'Month', 'Dayweek', 'Time', 'Time_of_Day'],
    how='left'
)

# Merge with location dimension
fact_crash = fact_crash.merge(
    dim_location,
    left_on=['State', 'SA4 Name 2021', 'National LGA Name 2021', 'National Remoteness Areas'],
    right_on=['State', 'SA4_Name', 'LGA_Name', 'Remoteness_Category'],
    how='left'
)

# Merge with vehicle dimension
fact_crash = fact_crash.merge(
    dim_vehicle,
    left_on=['Bus \nInvolvement', 'Heavy Rigid Truck Involvement', 'Articulated Truck Involvement'],
    right_on=['Bus_Involved', 'Heavy_Truck', 'Articulated_Truck'],
    how='left'
)

# Merge with road type
fact_crash = fact_crash.merge(
    dim_road_type,
    left_on='National Road Type',
    right_on='Road_Type',
    how='left'
)

# Merge with holiday flags
fact_crash = fact_crash.merge(
    dim_holiday,
    left_on=['Christmas Period', 'Easter Period'],
    right_on=['Christmas_Flag', 'Easter_Flag'],
    how='left'
)

# Merge with speed limit
fact_crash = fact_crash.merge(
    dim_speed_limit,
    left_on='Speed Limit',
    right_on='Speed_Limit_Value',
    how='left'
)

# Merge with crash type
fact_crash = fact_crash.merge(
    dim_crash_type,
    left_on='Crash Type',
    right_on='Crash_Type',
    how='left'
)

# Merge with population by LGA only (no year)
fact_crash = fact_crash.merge(
    dim_population[['Population_ID', 'LGA_Name']],
    left_on='National LGA Name 2021',
    right_on='LGA_Name',
    how='left'
)

# Merge with dwelling
fact_crash = fact_crash.merge(
    dim_dwelling[['Dwelling_ID', 'LGA_Name']],
    left_on='National LGA Name 2021',
    right_on='LGA_Name',
    how='left'
)

# Add fatality count
fact_crash['Number_Fatalities'] = crash_df['Number Fatalities'].fillna(0).astype(int)

# Final fact_crash table
fact_crash_final = fact_crash[[
    'Crash ID',
    'Date_ID',
    'Location_ID',
    'Vehicle_ID',
    'Road_ID',
    'Holiday_ID',
    'Speed_ID',
    'Crash_Type_ID',
    'Population_ID',
    'Dwelling_ID',
    'Number_Fatalities'
]].drop_duplicates()




# Drop duplicate Crash_IDs
fact_crash_final = fact_crash_final.drop_duplicates(subset=['Crash ID'])

# Ensure all ID fields are integers
id_fields = ['Date_ID', 'Location_ID', 'Vehicle_ID', 'Road_ID', 'Holiday_ID',
             'Speed_ID', 'Crash_Type_ID', 'Population_ID', 'Dwelling_ID','Number_Fatalities']
for col in id_fields:
    fact_crash_final[col] = fact_crash_final[col].fillna(0).astype(int)

# Export cleaned CSV
fact_crash_final.to_csv('fact_crash.csv', index=False)


fact_crash_final.head()

Unnamed: 0,Crash ID,Date_ID,Location_ID,Vehicle_ID,Road_ID,Holiday_ID,Speed_ID,Crash_Type_ID,Population_ID,Dwelling_ID,Number_Fatalities
0,20241115,1,1,1,1,1,1,1,115,115,1
1,20241125,2,2,1,2,2,2,1,52,52,1
2,20246013,3,3,1,2,1,3,2,520,525,1
3,20241002,4,4,1,3,2,1,2,0,2,1
4,20242261,5,5,0,4,2,0,2,0,0,1


### Fact Table 2：Fact_Fatality


In [27]:


# Extract relevant fields from the dataset
fact_fatality_df = fatality_df[['Crash ID', 'Age', 'Age Group', 'Year', 'Month', 'Dayweek', 'Time', 'Time of day',
                                'State', 'SA4 Name 2021', 'National LGA Name 2021', 'National Remoteness Areas',
                                'National Road Type']].copy()

# Rename columns for clarity
fact_fatality_df.columns = ['Crash_ID', 'Age', 'Age_Group', 'Year', 'Month', 'Dayweek', 'Time', 'Time_of_Day',
                            'State', 'SA4_Name', 'LGA_Name', 'Remoteness', 'Road_Type']

# Add surrogate primary key for each fatality record
fact_fatality_df.insert(0, 'Fatality_ID', range(1, len(fact_fatality_df) + 1))

# Optional: save to CSV
# fact_fatality_df.to_csv('fact_fatality_raw.csv', index=False)

# Display the result
fact_fatality_df.head()


Unnamed: 0,Fatality_ID,Crash_ID,Age,Age_Group,Year,Month,Dayweek,Time,Time_of_Day,State,SA4_Name,LGA_Name,Remoteness,Road_Type
0,1,20241115,74,65_to_74,2024,12,Friday,04:00:00,Night,NSW,Riverina,Wagga Wagga,Inner Regional Australia,Arterial Road
1,2,20241125,19,17_to_25,2024,12,Friday,06:15:00,Day,NSW,Sydney - Baulkham Hills and Hawkesbury,Hawkesbury,Inner Regional Australia,Local Road
2,3,20246013,33,26_to_39,2024,12,Friday,09:43:00,Day,Tas,Launceston and North East,Northern Midlands,Inner Regional Australia,Local Road
3,4,20241002,32,26_to_39,2024,12,Friday,10:35:00,Day,NSW,New England and North West,Armidale Regional,Outer Regional Australia,National or State Highway
4,5,20242261,62,40_to_64,2024,12,Friday,11:30:00,Day,Vic,,,Unknown,Undetermined
