Main Focus:

This notebook focuses on how external factors influence accident rates and identifies key high-risk locations.

- **Impact of External Factors:**
    - Analyze how weather conditions, holidays, and weekends affect accident frequency and severity.
    
- **Key Locations:**
    - Identify the most dangerous intersections or streets.
    - Map accident severity geographically to highlight high-risk areas.

In [None]:
# Data processing and manipulation
import dask.dataframe as dd                         
import pandas as pd                                 
import numpy as np                                  
import re                                           

# Visualization libraries
import matplotlib.pyplot as plt                     
from matplotlib.ticker import StrMethodFormatter     
import seaborn as sns                              

# Machine learning and preprocessing
from sklearn.preprocessing import LabelEncoder      

# Memory management
import gc                                         


•	How does weather affect accident rates?



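FIRST IDEA: for each severity level, show the top 5 weather conditions (bar chart + heatmap), then correlate severity with the label-encoded weather condition.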

In [None]:


# Load the dataset as a Dask DataFrame (materialised below with .compute()).
ACCIDENTS_PARQUET = '/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet'
df = dd.read_parquet(ACCIDENTS_PARQUET)

# Keep only the two columns this analysis needs, drop rows missing either
# one, and pull the result into an in-memory pandas DataFrame.
df_filtered = df[['Severity', 'Weather_Condition']].dropna().compute()

# Encode 'Weather_Condition' as integer codes (used only by the correlation
# step at the end of this cell).
label_encoder = LabelEncoder()
df_filtered['Weather_Condition_Encoded'] = label_encoder.fit_transform(df_filtered['Weather_Condition'])

# Loop through each severity level from 1 to 4
for severity in range(1, 5):
    # Rows belonging to the current severity level
    df_severity = df_filtered[df_filtered['Severity'] == severity]

    # Nothing to count or plot for a level that never occurs; plotting an
    # empty selection would raise, so skip it explicitly.
    if df_severity.empty:
        print(f"No accidents recorded for Severity Level {severity}; skipping plots.")
        continue

    # Count occurrences of each weather condition and keep the 5 most common.
    # (NaNs were already removed by the dropna() above, so no NaN count is needed.)
    severity_counts = df_severity['Weather_Condition'].value_counts()
    top_5_severity = severity_counts.nlargest(5)
    print(f"Top 5 Weather Conditions for Severity Level {severity}:\n{top_5_severity}")

    # Bar chart of the top 5 weather conditions for this severity level
    plt.figure(figsize=(12, 6))
    top_5_severity.plot(kind='bar', color='skyblue')
    plt.title(f'Top 5 Weather Conditions in Severity Level {severity} Accidents')
    plt.xlabel('Weather Condition')
    plt.ylabel('Counts')
    plt.xticks(rotation=45, ha='right')
    plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))  # Thousands separators, no decimals
    plt.tight_layout()  # Adjust layout to prevent clipping
    plt.show()

    # Heatmap data: rows from ALL severity levels, restricted to the weather
    # conditions that are the top 5 for the current severity level. This shows
    # how those conditions are distributed across every severity level.
    df_top5 = df_filtered[df_filtered['Weather_Condition'].isin(top_5_severity.index)]

    # Contingency table: weather condition (rows) x severity (columns)
    pivot_table = pd.crosstab(df_top5['Weather_Condition'], df_top5['Severity'])

    # Heatmap of the contingency table
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_table, annot=True, fmt='d', cmap='YlGnBu', cbar_kws={'label': 'Count'})
    plt.title(f'Heatmap of Top 5 Weather Conditions by Accident Severity Level {severity}')
    plt.xlabel('Severity Level')
    plt.ylabel('Weather Condition')
    plt.xticks(rotation=45)
    plt.tight_layout()  # Adjust layout to prevent clipping
    plt.show()

# Correlation between severity and the encoded weather condition.
# NOTE(review): the encoded values are label codes for a nominal category, so
# their numeric order carries no meaning; treat this coefficient with caution
# and consider a categorical association measure instead.
correlation_matrix = df_filtered[['Severity', 'Weather_Condition_Encoded']].corr()

# Display the correlation matrix
print("Correlation Matrix:\n", correlation_matrix)

# Plot the heatmap for the correlation matrix
plt.figure(figsize=(8, 4))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar_kws={'label': 'Correlation Coefficient'})
plt.title('Correlation between Severity and Weather Conditions')
plt.xticks(rotation=45)
plt.tight_layout()  # Adjust layout to prevent clipping
plt.show()

SECOND IDEA: a single heatmap of the overall top 5 weather conditions against accident severity.

In [None]:
import dask.dataframe as dd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Open the accidents dataset as a Dask DataFrame
ACCIDENTS_PARQUET = '/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet'
df = dd.read_parquet(ACCIDENTS_PARQUET)

# Severity + weather only; discard rows missing either value, then
# materialise the result in pandas
df_filtered = df[['Severity', 'Weather_Condition']].dropna().compute()

# Frequency of each weather condition, most common first
weather_counts = df_filtered['Weather_Condition'].value_counts()

# Labels of the five most frequent weather conditions
top_5_weather = weather_counts.index[:5]

# Rows whose weather condition is one of those five
is_top5 = df_filtered['Weather_Condition'].isin(top_5_weather)
df_top5 = df_filtered.loc[is_top5]

# Contingency table: weather condition (rows) x severity (columns)
pivot_table = pd.crosstab(index=df_top5['Weather_Condition'], columns=df_top5['Severity'])

# Draw the annotated heatmap
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot_table,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_title('Heatmap of Top 5 Weather Conditions by Accident Severity')
ax.set_xlabel('Severity Level')
ax.set_ylabel('Weather Condition')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()  # Keep labels from being clipped
plt.show()

•	Do accidents increase during specific holidays or weekends?



In [None]:
# TODO: not implemented yet - compare accident counts on holidays and weekends against ordinary weekdays.

•	Can you visualize the top 10 most dangerous intersections or streets in the U.S.?



In [None]:


# Open the accidents dataset as a Dask DataFrame
ACCIDENTS_PARQUET = '/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet'
df = dd.read_parquet(ACCIDENTS_PARQUET)

# Street-type vocabulary: label reported by classify_street_type -> the
# spellings (full word and abbreviations) recognised for it.
_STREET_TYPE_SPELLINGS = {
    'Street': ('St', 'Street'),
    'Avenue': ('Ave', 'Avenue', 'Av'),
    'Drive': ('Dr', 'Drive'),
    'Pike': ('Pike', 'Pk'),
    'Highway': ('Highway', 'Hwy'),
    'Boulevard': ('Blvd', 'Boulevard'),
    'Lane': ('Ln', 'Lane'),
    'Court': ('Ct', 'Court'),
    'Place': ('Pl', 'Place'),
    'Terrace': ('Ter', 'Terrace'),
    'Circle': ('Cir', 'Circle'),
}

# One case-insensitive pattern with a named group per label. The \b anchors
# make every spelling match only as a whole word, so a trailing period
# ("St.") is tolerated while unrelated words ("Stone", "Park") are not hit.
_STREET_TYPE_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        '(?P<' + label + '>' + '|'.join(spellings) + ')'
        for label, spellings in _STREET_TYPE_SPELLINGS.items()
    )
    + r')\b',
    re.IGNORECASE,
)


# Define a function to classify street types
def classify_street_type(street):
    """Classify a street name by the street-type word it contains.

    The name is scanned for whole-word street-type tokens (e.g. "St",
    "Avenue", "Blvd"). When several are present, the rightmost one decides,
    so "St Charles Ave" is an Avenue and "Dr Martin Luther King Jr Blvd" is
    a Boulevard. Single letters such as "S" or "N" are never treated as a
    street type, which keeps direction prefixes from being misread.

    Parameters
    ----------
    street : str or missing value
        Street name to classify.

    Returns
    -------
    str
        'Unknown' if ``street`` is missing (``pd.isna``), one of the labels
        in ``_STREET_TYPE_SPELLINGS`` if a street-type token is found, and
        'Other' otherwise.
    """
    if pd.isna(street):  # Missing street name
        return 'Unknown'

    # Walk every street-type token in the name; the last one seen wins.
    label = None
    for match in _STREET_TYPE_RE.finditer(street):
        label = match.lastgroup

    return label if label is not None else 'Other'

# Derive a 'Street_Type' column by classifying every street name
street_types = df['Street'].map(classify_street_type, meta=('x', 'object'))
df['Street_Type'] = street_types

# How many rows have no street name at all?
na_count = df['Street'].isnull().sum().compute()
print(f"Number of NA values in 'Street' column: {na_count}")

# Number of accidents (non-null 'Severity' rows) per street type
severity_by_type = df.groupby('Street_Type')['Severity']
accidents_by_street_type = severity_by_type.count().compute()

# Report the per-type counts in the order the aggregation returned them
print("\nCounts of accidents by street type:")
print(accidents_by_street_type)

# Largest category first, so the chart reads left to right
accidents_by_street_type = accidents_by_street_type.sort_values(ascending=False)

# Grand total across all street types, printed as a sanity figure
total_accidents = accidents_by_street_type.sum()
print(f"\nTotal accidents counted: {total_accidents}")

# Bar chart of accidents per street type
fig, ax = plt.subplots(figsize=(12, 6))
accidents_by_street_type.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Accidents by Street Type')
ax.set_xlabel('Street Type')
ax.set_ylabel('Number of Accidents')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()  # Keep labels from being clipped
plt.show()

•	How does the severity of accidents differ when mapped geographically?

TABLEAU