In [2]:
cd /content/sample_data

/content/sample_data


In [53]:
import pandas as pd

# Load the combined source file. Every table produced below is a column
# projection of this single frame.
data = pd.read_csv('Hwy26.csv')

# Column groups: one list per output table.
crash_columns = [
    'Crash ID',
    'Record Type',
    'Crash Month',
    'Crash Day',
    'Crash Year',
    'Week Day Code',
    'Crash Hour',
    'Crash Severity',
    'Weather Condition',
    'Road Surface Condition',
    'Light Condition',
    'Traffic Control Device (TCD)',
    'Investigating Agency',
]

vehicle_columns = [
    'Vehicle ID',
    'Crash ID',
    'Vehicle Type Code',
    'Vehicle Movement Code',
    'Vehicle Action Code',
    'Vehicle Cause 1 Code',
    'Vehicle Cause 2 Code',
    'Vehicle Cause 3 Code',
    'Vehicle Event 1 Code',
    'Vehicle Event 2 Code',
    'Vehicle Event 3 Code',
    'Vehicle Exceeded Posted Speed Flag',
    'Vehicle Hit & Run Flag',
    'Safety Equipment Used Quantity',
    'Safety Equipment Un-used Quantity',
    'Safety Equipment Use Unknown Quantity',
]

participant_columns = [
    'Participant ID',
    'Vehicle ID',
    'Crash ID',
    'Age',
    'Sex',
    'Driver License Status',
    'Injury Severity',
    'Participant Safety Equipment Use Code',
    'Airbag Deployment',
    'Non-Motorist Movement Code',
    'Non-Motorist Travel Direction From',
    'Non-Motorist Travel Direction To',
    'Non-Motorist Location',
    'Participant Action',
    'Participant Error 1 Code',
    'Participant Error 2 Code',
    'Participant Error 3 Code',
    'Participant Cause 1 Code',
    'Participant Cause 2 Code',
    'Participant Cause 3 Code',
    'Participant Event 1 Code',
    'Participant Event 2 Code',
    'Participant Event 3 Code',
    'BAC Test Results Code',
    'Alcohol Use Reported',
    'Drug Use Reported',
    'Participant Marijuana Use Reported',
    'Participant Striker Flag',
]

# Project the source frame onto each column group. Only the vehicle table is
# row-filtered: rows without a Vehicle ID are dropped from it.
# NOTE(review): crashes_data and participants_data keep every row of `data`;
# confirm whether they should be filtered analogously (e.g. on a key column).
crashes_data = data.loc[:, crash_columns]
vehicles_data = data.loc[:, vehicle_columns].dropna(subset=['Vehicle ID'])
participants_data = data.loc[:, participant_columns]

# Write each table to its own CSV in the working directory, without the index.
for _table, _csv_name in (
    (crashes_data, 'crashes_data.csv'),
    (vehicles_data, 'vehicles_data.csv'),
    (participants_data, 'participants_data.csv'),
):
    _table.to_csv(_csv_name, index=False)

print("Data clustering and output completed successfully!")


Data clustering and output completed successfully!


In [54]:
# Existence check: no row of crashes_data may have a null 'Crash ID'.
assert not crashes_data['Crash ID'].isna().any(), "Assertion Error: Missing Crash ID in some crash records"
print("All Existence assertions passed successfully!")

All Existence assertions passed successfully!


In [58]:
# Existence check: the number of non-null 'Vehicle ID' values must equal the
# number of rows in vehicles_data, i.e. no vehicle record lacks its ID.
assert vehicles_data['Vehicle ID'].count() == len(vehicles_data), "Assertion Error: Missing Vehicle ID in some vehicle records"
print("All Existence assertions passed successfully!")


All Existence assertions passed successfully!


In [19]:
# Inter-record check: every 'Vehicle ID' in vehicles_data must also occur in
# the 'Vehicle ID' column of participants_data (no vehicle ID is unmatched).
# NOTE(review): both frames are column projections of the same `data` frame
# and participants_data is not row-filtered, so as written this cannot fail.
# Confirm whether participants_data should first be restricted to real
# participant rows.
assert not (~vehicles_data['Vehicle ID'].isin(participants_data['Vehicle ID'])).any(), "Assertion Error: Orphaned vehicles found with no corresponding participants"

print("All Inter-record assertions passed successfully!")

All Inter-record assertions passed successfully!


In [59]:
# Intra-record check (first attempt): every participant 'Age' must fall in
# the inclusive range [0, 9].
# Series.between() evaluates to False for missing values, so rows with a null
# Age count as violations in this version of the check.
# NOTE(review): an upper bound of 9 looks low for an age — presumably 'Age'
# is a coded value; confirm the intended range.
assert participants_data['Age'].between(0, 9).all(), "Assertion Error: Invalid participant age detected"

# Success message names the assertion category this cell actually checks
# (it previously said "Inter-record").
print("All Intra-record assertions passed successfully!")

AssertionError: Assertion Error: Invalid participant age detected

In [51]:
# Intra-record check, resolved: apply the inclusive [0, 9] range test only to
# rows that have an Age, so missing values are no longer treated as
# violations.
# NOTE(review): an upper bound of 9 looks low for an age — presumably 'Age'
# is a coded value; confirm the intended range.
assert participants_data['Age'].dropna().between(0, 9).all(), "Assertion Error: Invalid participant age detected"

# Success message names the assertion category this cell actually checks
# (it previously said "Inter-record").
print("All Intra-record assertions passed successfully!")


All Inter-record assertions passed successfully!


In [10]:
# Inter-record check: each 'Crash ID' found in participants_data must also be
# present in crashes_data.
# NOTE(review): both frames are unfiltered column projections of the same
# `data` frame, so as written this cannot fail — confirm whether the frames
# should be row-filtered before comparing.
known_crash_ids = crashes_data['Crash ID']
participant_crash_ids = participants_data['Crash ID']

assert participant_crash_ids.isin(known_crash_ids).all(), "Assertion Error: Participant ID not associated with a valid Crash ID"
print("All Inter-record assertions passed successfully!")

All Inter-record assertions passed successfully!


In [12]:
# Summary check: the vehicle table must not contain more rows than the crash
# table.
# NOTE(review): crashes_data keeps every row of `data` while vehicles_data is
# a row-filtered subset of it, so as written this cannot fail — confirm the
# intended comparison.

assert crashes_data.shape[0] >= vehicles_data.shape[0], "Assertion Error: Total count of vehicles exceeds total count of crashes"
print("All Summary assertions passed successfully!")

All Summary assertions passed successfully!


In [14]:
# Distribution check: count the rows per 'Week Day Code' value and require
# the standard deviation of those counts to stay below 50.
# NOTE(review): the threshold 50 is not explained in this notebook — confirm
# it is appropriate for the size of the dataset.

weekday_counts = crashes_data['Week Day Code'].value_counts()
assert weekday_counts.std() < 50, "Assertion Error: Weekday distribution is not consistent"
print("All standard deviation assertions passed successfully!")

All standard deviation assertions passed successfully!
