In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# Define base paths
BASE_PATH = r"C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data"
ACCIDENTS_PATH = os.path.join(BASE_PATH, "Accidents")
CASUALTIES_PATH = os.path.join(BASE_PATH, "Casualties")
VEHICLES_PATH = os.path.join(BASE_PATH, "Vehicles")

# Set plotting style
plt.style.use('seaborn')
sns.set_palette("husl")

# Create directory for saving visualizations
def create_output_directory():
    if not os.path.exists('visualizations'):
        os.makedirs('visualizations')
    print("Created 'visualizations' directory for outputs")

# Data loading functions
def load_accident_data(year):
    try:
        file_path = os.path.join(ACCIDENTS_PATH, f'Accidents_{year}.csv')
        print(f"Attempting to load: {file_path}")
        return pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Could not load accident data for {year}: {e}")
        return None

def load_casualty_data(year):
    try:
        file_path = os.path.join(CASUALTIES_PATH, f'Casualties_{year}.csv')
        print(f"Attempting to load: {file_path}")
        return pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Could not load casualty data for {year}: {e}")
        return None

def load_vehicle_data(year):
    try:
        file_path = os.path.join(VEHICLES_PATH, f'Vehicles_{year}.csv')
        print(f"Attempting to load: {file_path}")
        return pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Could not load vehicle data for {year}: {e}")
        return None

def load_all_data(years):
    accidents_data = []
    casualties_data = []
    vehicles_data = []
    
    for year in years:
        print(f"\nLoading {year} data...")
        
        # Load each dataset
        acc_df = load_accident_data(year)
        cas_df = load_casualty_data(year)
        veh_df = load_vehicle_data(year)
        
        # Add year column and append if loading successful
        if acc_df is not None and cas_df is not None and veh_df is not None:
            acc_df['Year'] = year
            cas_df['Year'] = year
            veh_df['Year'] = year
            
            accidents_data.append(acc_df)
            casualties_data.append(cas_df)
            vehicles_data.append(veh_df)
            print(f"Successfully loaded {year} data")
    
    # Combine all years if data was loaded
    if accidents_data:
        return (pd.concat(accidents_data, ignore_index=True),
                pd.concat(casualties_data, ignore_index=True),
                pd.concat(vehicles_data, ignore_index=True))
    else:
        return None, None, None

def preprocess_data(accidents_df):
    """Preprocess the accidents dataframe"""
    if accidents_df is not None:
        accidents_df['Date'] = pd.to_datetime(accidents_df['Date'])
        accidents_df['Month'] = accidents_df['Date'].dt.month
        accidents_df['Hour'] = pd.to_datetime(accidents_df['Time']).dt.hour
        accidents_df['Year'] = accidents_df['Date'].dt.year
    return accidents_df

def create_visualizations(accidents_df, casualties_df, vehicles_df):
    """Create and save all visualizations"""
    
    def save_plot(plt, name):
        plt.savefig(f'visualizations/{name}.png', bbox_inches='tight', dpi=300)
        plt.close()

    # 1. Yearly Trends
    plt.figure(figsize=(12, 6))
    yearly_accidents = accidents_df.groupby('Year').size()
    sns.lineplot(x=yearly_accidents.index, y=yearly_accidents.values, marker='o')
    plt.title('Total Accidents by Year (2015-2018)')
    plt.xlabel('Year')
    plt.ylabel('Number of Accidents')
    save_plot(plt, 'yearly_trends')

    # 2. Monthly Patterns
    plt.figure(figsize=(12, 8))
    monthly_accidents = pd.crosstab(accidents_df['Year'], accidents_df['Month'])
    sns.heatmap(monthly_accidents, annot=True, fmt='d', cmap='YlOrRd')
    plt.title('Accidents by Month and Year')
    save_plot(plt, 'monthly_patterns')

    # 3. Severity Analysis
    plt.figure(figsize=(12, 6))
    severity_by_year = pd.crosstab(accidents_df['Year'], accidents_df['Accident_Severity'])
    severity_by_year.plot(kind='bar', stacked=True)
    plt.title('Accident Severity Distribution by Year')
    plt.tight_layout()
    save_plot(plt, 'severity_analysis')

    # 4. Time of Day Analysis
    plt.figure(figsize=(14, 8))
    time_day = pd.crosstab(accidents_df['Hour'], accidents_df['Day_of_Week'])
    sns.heatmap(time_day, cmap='YlOrRd', annot=True, fmt='d')
    plt.title('Accidents by Hour and Day of Week')
    save_plot(plt, 'time_of_day')

    # 5. Weather Conditions
    plt.figure(figsize=(15, 6))
    weather = pd.crosstab(accidents_df['Weather_Conditions'], 
                         accidents_df['Accident_Severity'])
    weather.plot(kind='bar', stacked=True)
    plt.title('Accident Severity by Weather Conditions')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    save_plot(plt, 'weather_conditions')

    # 6. Road Type
    plt.figure(figsize=(12, 6))
    sns.countplot(data=accidents_df, x='Road_Type', hue='Accident_Severity')
    plt.title('Accidents by Road Type and Severity')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    save_plot(plt, 'road_types')

    # 7. Casualty Analysis
    plt.figure(figsize=(12, 6))
    sns.histplot(data=casualties_df, x='Age_of_Casualty', hue='Casualty_Severity',
                multiple="stack", bins=30)
    plt.title('Age Distribution of Casualties by Severity')
    save_plot(plt, 'casualty_analysis')

    # 8. Vehicle Types
    plt.figure(figsize=(15, 6))
    top_vehicles = vehicles_df['Vehicle_Type'].value_counts().head(10)
    sns.barplot(x=top_vehicles.values, y=top_vehicles.index)
    plt.title('Top 10 Vehicle Types in Accidents')
    plt.tight_layout()
    save_plot(plt, 'vehicle_types')

    # 9. Speed Limit Analysis
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=accidents_df, x='Speed_limit', y='Number_of_Casualties')
    plt.title('Casualties by Speed Limit')
    save_plot(plt, 'speed_limit')

def generate_statistics(accidents_df, casualties_df, vehicles_df):
    """Generate and save summary statistics"""
    stats = {
        'Total Accidents': len(accidents_df),
        'Fatal Accidents': len(accidents_df[accidents_df['Accident_Severity'] == 'Fatal']),
        'Serious Accidents': len(accidents_df[accidents_df['Accident_Severity'] == 'Serious']),
        'Slight Accidents': len(accidents_df[accidents_df['Accident_Severity'] == 'Slight']),
        'Total Casualties': len(casualties_df),
        'Total Vehicles': len(vehicles_df),
        'Average Casualties per Accident': len(casualties_df) / len(accidents_df)
    }
    
    # Save statistics to file
    with open('visualizations/statistics.txt', 'w') as f:
        f.write("Traffic Accident Analysis Statistics\n")
        f.write("==================================\n\n")
        for stat, value in stats.items():
            if isinstance(value, int):
                f.write(f"{stat}: {value:,}\n")
            else:
                f.write(f"{stat}: {value:.2f}\n")
    
    return stats

def main():
    # Create output directory
    create_output_directory()
    
    # Load data
    print("Loading data...")
    years = [2015, 2016, 2017, 2018]
    accidents_df, casualties_df, vehicles_df = load_all_data(years)
    
    if accidents_df is not None:
        # Preprocess data
        print("Preprocessing data...")
        accidents_df = preprocess_data(accidents_df)
        
        # Create visualizations
        print("Creating visualizations...")
        create_visualizations(accidents_df, casualties_df, vehicles_df)
        
        # Generate statistics
        print("Generating statistics...")
        stats = generate_statistics(accidents_df, casualties_df, vehicles_df)
        
        print("\nAnalysis complete! Results saved in 'visualizations' directory")
        print("\nKey Statistics:")
        for stat, value in stats.items():
            if isinstance(value, int):
                print(f"{stat}: {value:,}")
            else:
                print(f"{stat}: {value:.2f}")
    else:
        print("Analysis could not be completed due to data loading errors")

if __name__ == "__main__":
    main()

  plt.style.use('seaborn')
  return pd.read_csv(file_path)


Created 'visualizations' directory for outputs
Loading data...

Loading 2015 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2015.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2015.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2015.csv
Successfully loaded 2015 data

Loading 2016 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2016.csv


  return pd.read_csv(file_path)


Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2016.csv
Could not load casualty data for 2016: [Errno 2] No such file or directory: 'C:\\Users\\Wolfrank\\Desktop\\GiGabyte\\CodeWolf\\UKDataProject\\Data\\Casualties\\Casualties_2016.csv'
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2016.csv

Loading 2017 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2017.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2017.csv


  return pd.read_csv(file_path)
  return pd.read_csv(file_path)


Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2017.csv


  return pd.read_csv(file_path)


Successfully loaded 2017 data

Loading 2018 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2018.csv


  return pd.read_csv(file_path)


Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2018.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2018.csv


  return pd.read_csv(file_path)


Successfully loaded 2018 data
Preprocessing data...


ValueError: time data "13/01/2015" doesn't match format "%m/%d/%Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# Define base paths
BASE_PATH = r"C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data"
ACCIDENTS_PATH = os.path.join(BASE_PATH, "Accidents")
CASUALTIES_PATH = os.path.join(BASE_PATH, "Casualties")
VEHICLES_PATH = os.path.join(BASE_PATH, "Vehicles")

# Set plotting style
plt.style.use('seaborn')
sns.set_palette("husl")

# Create directory for saving visualizations
def create_output_directory():
    if not os.path.exists('visualizations'):
        os.makedirs('visualizations')
    print("Created 'visualizations' directory for outputs")

# Data loading functions
def load_accident_data(year):
    try:
        file_path = os.path.join(ACCIDENTS_PATH, f'Accidents_{year}.csv')
        print(f"Attempting to load: {file_path}")
        return pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Could not load accident data for {year}: {e}")
        return None

def load_casualty_data(year):
    try:
        file_path = os.path.join(CASUALTIES_PATH, f'Casualties_{year}.csv')
        print(f"Attempting to load: {file_path}")
        return pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Could not load casualty data for {year}: {e}")
        return None

def load_vehicle_data(year):
    try:
        file_path = os.path.join(VEHICLES_PATH, f'Vehicles_{year}.csv')
        print(f"Attempting to load: {file_path}")
        return pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Could not load vehicle data for {year}: {e}")
        return None

def load_all_data(years):
    accidents_data = []
    casualties_data = []
    vehicles_data = []
    
    for year in years:
        print(f"\nLoading {year} data...")
        
        # Load each dataset
        acc_df = load_accident_data(year)
        cas_df = load_casualty_data(year)
        veh_df = load_vehicle_data(year)
        
        # Add year column and append if loading successful
        if acc_df is not None and cas_df is not None and veh_df is not None:
            acc_df['Year'] = year
            cas_df['Year'] = year
            veh_df['Year'] = year
            
            accidents_data.append(acc_df)
            casualties_data.append(cas_df)
            vehicles_data.append(veh_df)
            print(f"Successfully loaded {year} data")
    
    # Combine all years if data was loaded
    if accidents_data:
        return (pd.concat(accidents_data, ignore_index=True),
                pd.concat(casualties_data, ignore_index=True),
                pd.concat(vehicles_data, ignore_index=True))
    else:
        return None, None, None

def preprocess_data(accidents_df):
    """Preprocess the accidents dataframe with UK date format"""
    if accidents_df is not None:
        try:
            # Convert dates using UK format (day first)
            accidents_df['Date'] = pd.to_datetime(accidents_df['Date'], format='%d/%m/%Y', dayfirst=True)
            accidents_df['Month'] = accidents_df['Date'].dt.month
            
            # Convert time - check if Time column exists and handle accordingly
            if 'Time' in accidents_df.columns:
                try:
                    accidents_df['Hour'] = pd.to_datetime(accidents_df['Time'], format='%H:%M').dt.hour
                except ValueError:
                    print("Warning: Time format different than expected. Using basic hour extraction.")
                    accidents_df['Hour'] = accidents_df['Time'].str.split(':').str[0].astype(int)
            
            accidents_df['Year'] = accidents_df['Date'].dt.year
            print("Data preprocessing completed successfully")
        except Exception as e:
            print(f"Error during preprocessing: {e}")
            return None
    return accidents_df

def create_visualizations(accidents_df, casualties_df, vehicles_df):
    """Create and save all visualizations"""
    
    def save_plot(plt, name):
        plt.savefig(f'visualizations/{name}.png', bbox_inches='tight', dpi=300)
        plt.close()

    try:
        # 1. Yearly Trends
        plt.figure(figsize=(12, 6))
        yearly_accidents = accidents_df.groupby('Year').size()
        sns.lineplot(x=yearly_accidents.index, y=yearly_accidents.values, marker='o')
        plt.title('Total Accidents by Year (2015-2018)')
        plt.xlabel('Year')
        plt.ylabel('Number of Accidents')
        save_plot(plt, 'yearly_trends')

        # 2. Monthly Patterns
        plt.figure(figsize=(12, 8))
        monthly_accidents = pd.crosstab(accidents_df['Year'], accidents_df['Month'])
        sns.heatmap(monthly_accidents, annot=True, fmt='d', cmap='YlOrRd')
        plt.title('Accidents by Month and Year')
        save_plot(plt, 'monthly_patterns')

        # 3. Severity Analysis
        plt.figure(figsize=(12, 6))
        severity_by_year = pd.crosstab(accidents_df['Year'], accidents_df['Accident_Severity'])
        severity_by_year.plot(kind='bar', stacked=True)
        plt.title('Accident Severity Distribution by Year')
        plt.tight_layout()
        save_plot(plt, 'severity_analysis')

        # 4. Time of Day Analysis
        plt.figure(figsize=(14, 8))
        time_day = pd.crosstab(accidents_df['Hour'], accidents_df['Day_of_Week'])
        sns.heatmap(time_day, cmap='YlOrRd', annot=True, fmt='d')
        plt.title('Accidents by Hour and Day of Week')
        save_plot(plt, 'time_of_day')

        # 5. Weather Conditions
        plt.figure(figsize=(15, 6))
        weather = pd.crosstab(accidents_df['Weather_Conditions'], 
                            accidents_df['Accident_Severity'])
        weather.plot(kind='bar', stacked=True)
        plt.title('Accident Severity by Weather Conditions')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        save_plot(plt, 'weather_conditions')

        # 6. Road Type
        plt.figure(figsize=(12, 6))
        sns.countplot(data=accidents_df, x='Road_Type', hue='Accident_Severity')
        plt.title('Accidents by Road Type and Severity')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        save_plot(plt, 'road_types')

        # 7. Casualty Analysis
        plt.figure(figsize=(12, 6))
        sns.histplot(data=casualties_df, x='Age_of_Casualty', hue='Casualty_Severity',
                    multiple="stack", bins=30)
        plt.title('Age Distribution of Casualties by Severity')
        save_plot(plt, 'casualty_analysis')

        # 8. Vehicle Types
        plt.figure(figsize=(15, 6))
        top_vehicles = vehicles_df['Vehicle_Type'].value_counts().head(10)
        sns.barplot(x=top_vehicles.values, y=top_vehicles.index)
        plt.title('Top 10 Vehicle Types in Accidents')
        plt.tight_layout()
        save_plot(plt, 'vehicle_types')

        # 9. Speed Limit Analysis
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=accidents_df, x='Speed_limit', y='Number_of_Casualties')
        plt.title('Casualties by Speed Limit')
        save_plot(plt, 'speed_limit')

        print("All visualizations created successfully")
    except Exception as e:
        print(f"Error creating visualizations: {e}")

def generate_statistics(accidents_df, casualties_df, vehicles_df):
    """Generate and save summary statistics"""
    try:
        stats = {
            'Total Accidents': len(accidents_df),
            'Fatal Accidents': len(accidents_df[accidents_df['Accident_Severity'] == 'Fatal']),
            'Serious Accidents': len(accidents_df[accidents_df['Accident_Severity'] == 'Serious']),
            'Slight Accidents': len(accidents_df[accidents_df['Accident_Severity'] == 'Slight']),
            'Total Casualties': len(casualties_df),
            'Total Vehicles': len(vehicles_df),
            'Average Casualties per Accident': len(casualties_df) / len(accidents_df)
        }
        
        # Save statistics to file
        with open('visualizations/statistics.txt', 'w') as f:
            f.write("Traffic Accident Analysis Statistics\n")
            f.write("==================================\n\n")
            for stat, value in stats.items():
                if isinstance(value, int):
                    f.write(f"{stat}: {value:,}\n")
                else:
                    f.write(f"{stat}: {value:.2f}\n")
        
        return stats
    except Exception as e:
        print(f"Error generating statistics: {e}")
        return None

def main():
    try:
        # Create output directory
        create_output_directory()
        
        # Load data
        print("Loading data...")
        years = [2015, 2016, 2017, 2018]
        accidents_df, casualties_df, vehicles_df = load_all_data(years)
        
        if accidents_df is not None:
            # Preprocess data
            print("Preprocessing data...")
            accidents_df = preprocess_data(accidents_df)
            
            if accidents_df is not None:
                # Create visualizations
                print("Creating visualizations...")
                create_visualizations(accidents_df, casualties_df, vehicles_df)
                
                # Generate statistics
                print("Generating statistics...")
                stats = generate_statistics(accidents_df, casualties_df, vehicles_df)
                
                if stats:
                    print("\nAnalysis complete! Results saved in 'visualizations' directory")
                    print("\nKey Statistics:")
                    for stat, value in stats.items():
                        if isinstance(value, int):
                            print(f"{stat}: {value:,}")
                        else:
                            print(f"{stat}: {value:.2f}")
            else:
                print("Error during data preprocessing")
        else:
            print("Analysis could not be completed due to data loading errors")
    except Exception as e:
        print(f"Error in main execution: {e}")

if __name__ == "__main__":
    main()

  plt.style.use('seaborn')
  return pd.read_csv(file_path)


Created 'visualizations' directory for outputs
Loading data...

Loading 2015 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2015.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2015.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2015.csv
Successfully loaded 2015 data

Loading 2016 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2016.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2016.csv
Could not load casualty data for 2016: [Errno 2] No such file or directory: 'C:\\Users\\Wolfrank\\Desktop\\GiGabyte\\CodeWolf\\UKDataProject\\Data\\Casualties\\Casualties_2016.csv'
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2016.csv


  return pd.read_csv(file_path)



Loading 2017 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2017.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2017.csv


  return pd.read_csv(file_path)
  return pd.read_csv(file_path)


Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2017.csv


  return pd.read_csv(file_path)
  return pd.read_csv(file_path)


Successfully loaded 2017 data

Loading 2018 data...
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Accidents\Accidents_2018.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Casualties\Casualties_2018.csv
Attempting to load: C:\Users\Wolfrank\Desktop\GiGabyte\CodeWolf\UKDataProject\Data\Vehicles\Vehicles_2018.csv
Successfully loaded 2018 data


  return pd.read_csv(file_path)
  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):


Preprocessing data...
Data preprocessing completed successfully
Creating visualizations...


  with pd.option_context('mode.use_inf_as_na', True):
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)
  data_subset = grouped_data.get_group(pd_key)


All visualizations created successfully
Generating statistics...

Analysis complete! Results saved in 'visualizations' directory

Key Statistics:
Total Accidents: 392,673
Fatal Accidents: 0
Serious Accidents: 0
Slight Accidents: 0
Total Casualties: 517,779
Total Vehicles: 723,180
Average Casualties per Accident: 1.32


<Figure size 1200x600 with 0 Axes>

<Figure size 1500x600 with 0 Axes>