In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os
import glob
import json
import gzip
import pandas as pd

data_dir = '/home/reaf/MOT-analysis/notebooks/mot_data/'
json_files = glob.glob(os.path.join(data_dir, '*.json.gz'))
all_data = []

for i, file_path in enumerate(json_files):

    if i % 100 == 0 and i > 0:
        break
    try:
        file_records = []
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                try:
                    if line.strip():  
                        record = json.loads(line)
                        file_records.append(record)
                except json.JSONDecodeError as e:
                    print(f"Error in {os.path.basename(file_path)}, line {i+1}: {str(e)[:100]}")
                    continue
        
        all_data.extend(file_records)
        print(f"Loaded: {os.path.basename(file_path)} - {len(file_records)} records")
    except Exception as e:
        print(f"Failed to process {os.path.basename(file_path)}: {str(e)[:100]}")

# Create DataFrame
if all_data:
    df = pd.DataFrame(all_data)
    
else:
    print("No data was successfully loaded.")

Loaded: delta-light-vehicle_24-03-2025_171.json.gz - 15 records
Loaded: delta-light-vehicle_25-03-2025_265.json.gz - 506 records
Loaded: delta-light-vehicle_25-03-2025_142.json.gz - 499 records
Loaded: delta-light-vehicle_25-03-2025_363.json.gz - 363 records
Loaded: delta-light-vehicle_24-03-2025_24.json.gz - 9 records
Loaded: delta-light-vehicle_24-03-2025_70.json.gz - 6 records
Loaded: delta-light-vehicle_25-03-2025_285.json.gz - 491 records
Loaded: delta-light-vehicle_25-03-2025_196.json.gz - 456 records
Loaded: delta-light-vehicle_24-03-2025_150.json.gz - 6 records
Loaded: delta-light-vehicle_24-03-2025_104.json.gz - 10 records
Loaded: delta-light-vehicle_25-03-2025_165.json.gz - 497 records
Loaded: delta-light-vehicle_24-03-2025_345.json.gz - 12 records
Loaded: delta-light-vehicle_25-03-2025_369.json.gz - 450 records
Loaded: delta-light-vehicle_24-03-2025_128.json.gz - 4 records
Loaded: delta-light-vehicle_24-03-2025_9.json.gz - 5 records
Loaded: delta-light-vehicle_24-03-2025_203

Cars I am looking at (small hatchback):

- ford fiesta mk6: 2008-2017
- toyota yaris mk3: 2011-2020
- mazda2 mk2: 2007-2014
- panda 312: 2011-
- vw polo mk5: 2009-2017



In [3]:
df.head()

Unnamed: 0,registration,firstUsedDate,registrationDate,manufactureDate,primaryColour,secondaryColour,engineSize,model,make,fuelType,lastMotTestDate,motTests,lastUpdateTimestamp,dataSource,lastUpdateDate,lastRunDate,lastRunTimestamp,modification
0,L13UMG,2008-08-08,2008-08-08,2008-08-08,White,Not Stated,1596.0,FIESTA,FORD,Petrol,2025-03-23T15:24:56.000Z,"[{'completedDate': '2011-08-30T14:52:54.000Z',...",2025-03-23 15:24:56.000000,dvsa,2025-03-23,2025-03-24,2025-03-24 05:29:23.392803,UPDATED
1,RA07HSZ,2007-08-08,2007-08-08,2007-08-08,Silver,Not Stated,2460.0,TRANSPORTER,VOLKSWAGEN,Diesel,2025-03-23T10:16:40.000Z,"[{'completedDate': '2010-09-08T17:31:31.000Z',...",2025-03-23 10:16:40.000000,dvsa,2025-03-23,2025-03-24,2025-03-24 05:29:23.392803,UPDATED
2,YA17USZ,2017-06-29,2017-06-29,2017-06-29,Blue,Not Stated,1995.0,320,BMW,Diesel,2025-03-23T11:44:47.000Z,"[{'completedDate': '2020-06-21T06:58:49.000Z',...",2025-03-23 11:44:47.000000,dvsa,2025-03-23,2025-03-24,2025-03-24 05:29:23.392803,UPDATED
3,NU10MPO,2010-03-19,2010-03-19,2010-03-19,Blue,Not Stated,1598.0,MINI,MINI,Petrol,2025-03-23T16:56:58.000Z,"[{'completedDate': '2013-03-22T09:56:43.000Z',...",2025-03-23 16:56:58.000000,dvsa,2025-03-23,2025-03-24,2025-03-24 05:29:23.392803,UPDATED
4,WO18VNJ,2018-06-22,2018-06-22,2018-06-22,White,Not Stated,1242.0,500,FIAT,Petrol,2025-03-23T13:03:18.000Z,"[{'completedDate': '2021-06-03T08:37:46.000Z',...",2025-03-23 13:03:18.000000,dvsa,2025-03-23,2025-03-24,2025-03-24 05:29:23.392803,UPDATED


In [4]:
df_copy = df.copy()

In [5]:
df['registrationDate'] = pd.to_datetime(df['registrationDate'])
df['manufactureDate'] = pd.to_datetime(df['manufactureDate'])
df['firstUsedDate'] = pd.to_datetime(df['firstUsedDate'])


In [6]:
ford_fiesta = df[(df["model"] == "FIESTA") &  (df["make"] == "FORD")].copy()
# Filter for Ford Fiesta mk6 (2008-2017)
ford_fiesta_mk6 = ford_fiesta[(ford_fiesta["registrationDate"] >= pd.to_datetime('2008-01-01')) & 
                             (ford_fiesta["registrationDate"] <= pd.to_datetime('2017-12-31'))]

In [7]:
mazda_2 = df[(df["model"] == "2") & (df["make"] == "MAZDA")].copy()
mazda_2_mk2 = mazda_2[(mazda_2["registrationDate"] >= pd.to_datetime('2007-01-01')) & 
                             (mazda_2["registrationDate"] <= pd.to_datetime('2014-12-31'))]

In [8]:
toyota_yaris = df[(df["model"] == "YARIS") & (df["make"] == "TOYOTA")].copy()
toyota_yaris_mk3 = toyota_yaris[(toyota_yaris["registrationDate"] >= pd.to_datetime('2011-01-01')) & 
                             (toyota_yaris["registrationDate"] <= pd.to_datetime('2020-12-31'))]

In [9]:
fiat_panda = df[(df["model"] == "PANDA") & (df["make"] == "FIAT")].copy()
fiat_panda_mk4 = fiat_panda[(fiat_panda["registrationDate"] >= pd.to_datetime('2011-01-01')) & 
                             (fiat_panda["registrationDate"] <= pd.to_datetime('2020-12-31'))]

In [10]:
vw_polo = df[(df["model"] == "POLO") & (df["make"] == "VOLKSWAGEN")].copy()
vw_polo_mk5 = vw_polo[(vw_polo["registrationDate"] >= pd.to_datetime('2009-01-01')) & 
                             (vw_polo["registrationDate"] <= pd.to_datetime('2017-12-31'))]

In [11]:
ford_fiesta_mk6.head()

Unnamed: 0,registration,firstUsedDate,registrationDate,manufactureDate,primaryColour,secondaryColour,engineSize,model,make,fuelType,lastMotTestDate,motTests,lastUpdateTimestamp,dataSource,lastUpdateDate,lastRunDate,lastRunTimestamp,modification
0,L13UMG,2008-08-08,2008-08-08,2008-08-08,White,Not Stated,1596.0,FIESTA,FORD,Petrol,2025-03-23T15:24:56.000Z,"[{'completedDate': '2011-08-30T14:52:54.000Z',...",2025-03-23 15:24:56.000000,dvsa,2025-03-23,2025-03-24,2025-03-24 05:29:23.392803,UPDATED
19,WV13UXH,2013-04-09,2013-04-09,2013-04-09,Black,Not Stated,1499.0,FIESTA,FORD,Diesel,2025-03-24T12:14:40.000Z,"[{'completedDate': '2016-03-15T16:46:07.000Z',...",2025-03-24 12:14:40.000000,dvsa,2025-03-24,2025-03-25,2025-03-25 05:27:04.334721,UPDATED
34,SJ15LTA,2015-03-31,2015-03-31,2015-03-31,Blue,Not Stated,1242.0,FIESTA,FORD,Petrol,2025-03-24T10:53:19.000Z,"[{'completedDate': '2018-09-17T15:03:52.000Z',...",2025-03-24 10:53:19.000000,dvsa,2025-03-24,2025-03-25,2025-03-25 05:27:04.334721,UPDATED
159,SV59NYP,2009-09-04,2009-09-04,2009-09-04,Red,Not Stated,1242.0,FIESTA,FORD,Petrol,2025-03-24T15:53:32.000Z,"[{'completedDate': '2012-08-13T09:06:10.000Z',...",2025-03-24 15:53:32.000000,dvsa,2025-03-24,2025-03-25,2025-03-25 05:27:04.334721,UPDATED
229,MH08RAN,2011-03-29,2011-03-29,2011-03-29,White,Not Stated,1242.0,FIESTA,FORD,Petrol,2025-03-24T18:44:53.000Z,"[{'completedDate': '2014-03-25T14:30:07.000Z',...",2025-03-24 18:44:53.000000,dvsa,2025-03-24,2025-03-25,2025-03-25 05:27:04.334721,UPDATED


In [14]:
# Let's parse the motTests column from ford_fiesta_mk6 dataframe
# First, let's check a single sample to understand the structure
sample_mot_tests = ford_fiesta_mk6['motTests'].iloc[0]

# Creating a function to parse the motTests column
def parse_mot_tests(df):
    # Create a new dataframe to store flattened MOT test data
    mot_data = []
    
    for idx, row in df.iterrows():
        registration = row['registration']
        registration_date = row['registrationDate']
        mot_tests = eval(row['motTests']) if isinstance(row['motTests'], str) else row['motTests']
        
        for test in mot_tests:
            completed_date = pd.to_datetime(test.get('completedDate', None))
            
            # Calculate age of car at test time (in years)
            if pd.notnull(completed_date) and pd.notnull(registration_date):
                car_age_years = (completed_date.replace(tzinfo=None) - registration_date.replace(tzinfo=None)).days / 365.25
            else:
                car_age_years = None
                
            # Extract defects
            defects = test.get('defects', [])
            defect_list = []
            for defect in defects:
                defect_info = {
                    'type': defect.get('type', None),
                    'text': defect.get('text', None),
                    'dangerous': defect.get('dangerous', False),
                    'defectCategory': defect.get('defectCategory', None)
                }
                defect_list.append(defect_info)
            
            test_data = {
                'registration': registration,
                'completedDate': completed_date,
                'testResult': test.get('testResult', None),
                'odometerValue': test.get('odometerValue', None),
                'odometerUnit': test.get('odometerUnit', None),
                'motTestNumber': test.get('motTestNumber', None),
                'carAgeYears': car_age_years,
                'defects': defect_list,
                'defectCount': len(defect_list)
            }
            mot_data.append(test_data)
    
    # Create DataFrame from the list of dictionaries
    mot_df = pd.DataFrame(mot_data)
    return mot_df

# Apply the function to parse MOT test data
ford_fiesta_mot_tests = parse_mot_tests(ford_fiesta_mk6)

# Display the first few rows of the parsed data
print(f"Total MOT test records: {len(ford_fiesta_mot_tests)}")
ford_fiesta_mot_tests.head()

Total MOT test records: 8211


Unnamed: 0,registration,completedDate,testResult,odometerValue,odometerUnit,motTestNumber,carAgeYears,defects,defectCount
0,L13UMG,2011-08-30 14:52:54+00:00,PASSED,18135.0,MI,,3.058179,"[{'type': 'ADVISORY', 'text': 'Offside Rear Ty...",1
1,L13UMG,2012-12-17 14:27:32+00:00,PASSED,26910.0,MI,,4.358658,[],0
2,L13UMG,2013-11-23 09:39:17+00:00,FAILED,41314.0,MI,,5.292266,"[{'type': 'FAIL', 'text': 'Nearside Front Fron...",6
3,L13UMG,2013-11-23 12:02:07+00:00,PASSED,41314.0,MI,,5.292266,[],0
4,L13UMG,2015-01-05 10:48:06+00:00,FAILED,63293.0,MI,,6.409309,"[{'type': 'FAIL', 'text': 'Anti-lock braking s...",3


In [17]:
ford_fiesta_mot_tests


Unnamed: 0,registration,completedDate,testResult,odometerValue,odometerUnit,motTestNumber,carAgeYears,defects,defectCount
0,L13UMG,2011-08-30 14:52:54+00:00,PASSED,18135.0,MI,,3.058179,"[{'type': 'ADVISORY', 'text': 'Offside Rear Ty...",1
1,L13UMG,2012-12-17 14:27:32+00:00,PASSED,26910.0,MI,,4.358658,[],0
2,L13UMG,2013-11-23 09:39:17+00:00,FAILED,41314.0,MI,,5.292266,"[{'type': 'FAIL', 'text': 'Nearside Front Fron...",6
3,L13UMG,2013-11-23 12:02:07+00:00,PASSED,41314.0,MI,,5.292266,[],0
4,L13UMG,2015-01-05 10:48:06+00:00,FAILED,63293.0,MI,,6.409309,"[{'type': 'FAIL', 'text': 'Anti-lock braking s...",3
...,...,...,...,...,...,...,...,...,...
8206,VK09NFG,2023-03-21 07:34:36+00:00,PASSED,76087.0,MI,,13.875428,"[{'type': 'ADVISORY', 'text': 'Exhaust has a m...",2
8207,VK09NFG,2024-03-26 07:38:59+00:00,FAILED,79610.0,MI,,14.891170,"[{'type': 'PRS', 'text': 'Nearside Position la...",2
8208,VK09NFG,2024-03-26 07:39:00+00:00,PASSED,79610.0,MI,,14.891170,[],0
8209,VK09NFG,2025-03-24 06:35:55+00:00,FAILED,88780.0,MI,,15.885010,"[{'type': 'FAIL', 'text': 'Engine MIL inoperat...",11


In [20]:
def extract_defect_details(mot_df):
    """
    Extract detailed information about defects from MOT test data
    including car age, mileage, and defect types.
    """
    defect_details = []
    
    for _, row in mot_df.iterrows():
        # Skip if there are no defects
        if row['defectCount'] == 0:
            continue
        
        # Extract basic test information
        car_age = row['carAgeYears']
        mileage = row['odometerValue']
        test_date = row['completedDate']
        test_result = row['testResult']
        registration = row['registration']
        
        # Process each defect in the test
        defects = row['defects']
        for defect in defects:
            defect_type = defect.get('type', 'Unknown')
            defect_text = defect.get('text', 'No description')
            defect_category = defect.get('defectCategory', 'Not categorized')
            dangerous = defect.get('dangerous', False)
            
            # Determine component category from text
            component = 'Other'
            text_lower = defect_text.lower()
            if any(word in text_lower for word in ['tyre', 'wheel']):
                component = 'Tyres/Wheels'
            elif any(word in text_lower for word in ['brake', 'abs']):
                component = 'Brakes'
            elif any(word in text_lower for word in ['suspension', 'shock', 'spring']):
                component = 'Suspension'
            elif any(word in text_lower for word in ['steering']):
                component = 'Steering'
            elif any(word in text_lower for word in ['light', 'lamp', 'beam']):
                component = 'Lights'
            elif any(word in text_lower for word in ['exhaust', 'emission']):
                component = 'Exhaust'
            elif any(word in text_lower for word in ['rust', 'corrosion']):
                component = 'Corrosion'
            elif any(word in text_lower for word in ['wiper', 'windscreen', 'window']):
                component = 'Visibility'
            
            defect_details.append({
                'registration': registration,
                'test_date': test_date,
                'test_result': test_result,
                'car_age_years': car_age,
                'mileage': mileage,
                'defect_type': defect_type,
                'defect_text': defect_text,
                'component': component,
                'dangerous': dangerous
            })
    
    # Convert to DataFrame
    defects_df = pd.DataFrame(defect_details)
    return defects_df

# Extract detailed defect information from Ford Fiesta MOT tests
ford_fiesta_defects = extract_defect_details(ford_fiesta_mot_tests)

# Display basic statistics
print(f"Total defects found: {len(ford_fiesta_defects)}")
print(f"Defect types distribution:\n{ford_fiesta_defects['defect_type'].value_counts()}")
print(f"Component categories distribution:\n{ford_fiesta_defects['component'].value_counts()}")

# Show the first few rows
ford_fiesta_defects.head()

Total defects found: 15540
Defect types distribution:
defect_type
ADVISORY        10680
FAIL             3309
PRS               704
MINOR             512
USER ENTERED      335
Name: count, dtype: int64
Component categories distribution:
component
Tyres/Wheels    5001
Lights          2499
Brakes          2495
Other           2243
Suspension      1820
Exhaust          740
Visibility       502
Steering         216
Corrosion         24
Name: count, dtype: int64


Unnamed: 0,registration,test_date,test_result,car_age_years,mileage,defect_type,defect_text,component,dangerous
0,L13UMG,2011-08-30 14:52:54+00:00,PASSED,3.058179,18135.0,ADVISORY,Offside Rear Tyre worn close to the legal limi...,Tyres/Wheels,False
1,L13UMG,2013-11-23 09:39:17+00:00,FAILED,5.292266,41314.0,FAIL,Nearside Front Front position lamp(s) not work...,Lights,False
2,L13UMG,2013-11-23 09:39:17+00:00,FAILED,5.292266,41314.0,FAIL,Nearside Front Tyre tread depth below requirem...,Tyres/Wheels,False
3,L13UMG,2013-11-23 09:39:17+00:00,FAILED,5.292266,41314.0,FAIL,Nearside Front brake disc in such a condition ...,Brakes,False
4,L13UMG,2013-11-23 09:39:17+00:00,FAILED,5.292266,41314.0,FAIL,Nearside Rear Tyre tread depth below requireme...,Tyres/Wheels,False


{'vehicle': 'Ford Fiesta MK6',
 'total_tests': 8211,
 'failed_tests': 1916,
 'failure_rate': 0.23334551211789065,
 'avg_defects': 1.8925831202046035,
 'avg_mileage': 55759.17735434225,
 'defect_types': {None: 15540}}

In [None]:
# Function to create co-occurrence network for a specific age group
def create_component_cooccurrence_network(defects_df):
    # Group defects by registration and test_date
    grouped = defects_df.groupby(['registration', 'test_date'])
    
    # Count co-occurrences
    cooccurrence_counts = Counter()
    
    for name, group in grouped:
        # If more than one component has issues in the same test, they co-occur
        components = list(group['defect_text'].unique())
        if len(components) > 1:
            # Count all pairwise combinations
            for i in range(len(components)):
                for j in range(i+1, len(components)):
                    # Sort component names to ensure consistent pairs
                    pair = tuple(sorted([components[i], components[j]]))
                    cooccurrence_counts[pair] += 1
    
    # Create network graph
    G = nx.Graph()
    
    # Add nodes (components) - ONLY if they appear in this age group's defects
    components_in_age_group = defects_df['defect_text'].unique()
    for component in components_in_age_group:
        # Count how many failures of this component type in this age group
        count = len(defects_df[defects_df['defect_text'] == component])
        if count > 0:  # Only add components with actual failures
            G.add_node(component, count=count)
    
    # Add edges (co-occurrences) - only between nodes that exist in the graph
    for (comp1, comp2), count in cooccurrence_counts.items():
        if comp1 in G.nodes() and comp2 in G.nodes() and count >= 1:
            G.add_edge(comp1, comp2, weight=count)
    
    return G, cooccurrence_counts

# Group Ford Fiesta defects by car age (rounded to nearest year)
ford_fiesta_defects['car_age_rounded'] = ford_fiesta_defects['car_age_years'].round().astype('Int64')

# Get range of ages
min_age = ford_fiesta_defects['car_age_rounded'].min()
max_age = ford_fiesta_defects['car_age_rounded'].max()

# Filter out potential NaN values
age_range = [age for age in range(int(min_age), int(max_age) + 1) 
            if age in ford_fiesta_defects['car_age_rounded'].dropna().unique()]

# Create a list of all components with failures across all age groups
# This ensures consistent colors across different age graphs
components_with_failures = list(ford_fiesta_defects['defect_text'].unique())

# Generate a consistent color map only for components with failures
color_map = {comp: plt.cm.tab10(i % 10) for i, comp in enumerate(components_with_failures)}

# Store centrality data for later comparison
age_centrality = {}

# Create a separate network graph for each year of car age
for age in age_range:
    # Filter data for this age
    age_defects = ford_fiesta_defects[ford_fiesta_defects['car_age_rounded'] == age]
    
    # Skip if too few data points
    if len(age_defects) < 10:
        print(f"Skipping age {age} - insufficient data points")
        continue
    
    # Create a new figure for this age
    plt.figure(figsize=(12, 9))
    
    # Create network for this age
    G, cooccurrence_counts = create_component_cooccurrence_network(age_defects)
    
    # Skip if no edges
    if G.number_of_edges() == 0:
        plt.text(0.5, 0.5, f"No co-occurrences for age {age}", 
                ha='center', va='center', fontsize=12)
        plt.axis('off')
        plt.title(f"Age {age} years: No failure co-occurrences", fontsize=14)
        plt.tight_layout()
        plt.show()
        continue
    
    # Position nodes - use same seed for more consistent layouts
    pos = nx.spring_layout(G, seed=42, k=0.3)
    
    # Node size based on frequency of component failures
    # Scale relative to the number of defects for this age
    max_count = max([G.nodes[node]['count'] for node in G.nodes()]) if G.nodes else 1
    node_sizes = [200 + (G.nodes[node]['count'] * 600 / max_count) for node in G.nodes()]
    
    # Edge width based on co-occurrence count
    max_weight = max([G[u][v]['weight'] for u, v in G.edges()]) if G.edges else 1
    edge_widths = [1.5 + (G[u][v]['weight'] * 5 / max_weight) for u, v in G.edges()]
    
    # Node colors based on component type - using only components that appear in the graph
    node_colors = [color_map[node] for node in G.nodes()]
    
    # Draw the network
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, alpha=0.8,
                          node_color=node_colors, linewidths=1, edgecolors='black')
    nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.7, edge_color='gray')
    
    # Add node labels with component name and count
    node_labels = {node: f"{node}\n({G.nodes[node]['count']})" for node in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=10, font_weight='bold')
    
    # Add edge labels (only if not too crowded)
    if G.number_of_edges() <= 15:
        edge_labels = {(u, v): G[u][v]['weight'] for u, v in G.edges()}
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=9)
    
    # Calculate centrality for this age
    centrality = nx.degree_centrality(G)
    age_centrality[age] = centrality
    
    # Add a text box with component occurrence counts
    component_counts = sorted([(comp, G.nodes[comp]['count']) for comp in G.nodes()], 
                             key=lambda x: x[1], reverse=True)
    count_text = "Component Failures:\n" + "\n".join([f"{comp}: {count}" for comp, count in component_counts])
    plt.figtext(0.01, 0.05, count_text, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
    
    # Add a text box with top co-occurrence counts
    if cooccurrence_counts:
        top_cooccur = sorted(cooccurrence_counts.items(), key=lambda x: x[1], reverse=True)[:7]
        cooccur_text = "Top Co-occurrences:\n" + "\n".join([f"{p[0]} & {p[1]}: {count}" for (p, count) in top_cooccur])
        plt.figtext(0.01, 0.50, cooccur_text, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))
    
    # Add summary information
    total_defects = len(age_defects)
    unique_vehicles = len(age_defects['registration'].unique())
    summary_text = (f"Total Failures: {total_defects}\n"
                   f"Unique Vehicles: {unique_vehicles}\n"
                   f"Components: {len(G.nodes())}\n"
                   f"Co-occurrences: {len(cooccurrence_counts)}")
    plt.figtext(0.99, 0.99, summary_text, fontsize=11, ha="right", va="top", 
               bbox=dict(facecolor='white', edgecolor='black', alpha=0.8))
    
    plt.title(f"Ford Fiesta Component Failure Network at Age {age} Years", fontsize=16)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

# Create a legend figure to show the color mapping only for components that appear in at least one graph
# First collect all components that actually appear in any graphs
active_components = set()
for age in age_range:
    age_defects = ford_fiesta_defects[ford_fiesta_defects['car_age_rounded'] == age]
    if len(age_defects) >= 10:  # Same threshold as in visualization loop
        active_components.update(age_defects['defect_text'].unique())

# Create legend only for components that appeared in at least one graph
plt.figure(figsize=(12, 4))
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color_map[comp], 
                      markersize=14, label=comp) for comp in sorted(active_components)]
plt.legend(handles=handles, loc='center', ncol=3, fontsize=12)
plt.axis('off')
plt.title("Component Color Legend (Components with Failures Only)", fontsize=14)
plt.tight_layout()
plt.show()

# Rest of your summary code with similar filtering