In [1]:
# Code to find unique values in the Make column
import pandas as pd

# Load your CSV file
df = pd.read_csv('data/cleaned_aviation_data_1.csv')  # Replace with your actual file path if different

# Get unique values in the Make column
unique_makes = df['Make'].unique()

# Print the total number of unique makes
print(f"Total number of unique aircraft makes: {len(unique_makes)}")

# Print the unique values (sorted alphabetically for easier review)
print("\nUnique values in the Make column:")
for make in sorted(unique_makes):
    print(make)

# Alternative: Get value counts to see frequency of each make
print("\nTop 20 makes by frequency:")
make_counts = df['Make'].value_counts().head(20)
print(make_counts)

# Find potential inconsistencies (similar makes with different capitalization or spelling)
lowercase_makes = {}
for make in unique_makes:
    if not isinstance(make, str):
        continue
    
    lowercase = make.lower()
    if lowercase not in lowercase_makes:
        lowercase_makes[lowercase] = []
    lowercase_makes[lowercase].append(make)

# Print makes that appear with different capitalizations or spellings
print("\nPotential inconsistencies (different versions of the same make):")
for lowercase, variations in lowercase_makes.items():
    if len(variations) > 1:
        print(f"  {lowercase}: {variations}")

Total number of unique aircraft makes: 3395

Unique values in the Make column:
177Mf Llc
2007 Savage Air Llc
2021Fx3 Llc
5 Rivers Llc
781569 Inc
A. Schleicher Gmbh & Co.
Aardema Robert John
Ab Sportine Aviacija
Abbett Gerry
Abbey Victor
Ackland Jeffrey Dean
Acro
Acro Sport
Adams
Adams Dennis Allen
Adams Donald L
Adams John R Jr
Adkins
Adolphson
Adrian George W
Advertising Mgmt & Consulting
Aermacchi
Aero Adventure
Aero At Sp Zoo
Aero Bristell
Aero Commander
Aero Smart Solutions Inc
Aero Sp Z O O
Aero Sp Z O O (Gobosh)
Aero Tek Inc.
Aero Vodochody
Aero Vodochody Aero. Works
Aero Works
Aero-Ace
Aerofab
Aerofab Inc
Aerofab Inc.
Aerolite
Aeromot
Aeronca
Aeronca Aircraft Corporation
Aeroprakt
Aeropro Cz
Aeropro Cz S R O
Aeros
Aeros Ltd
Aeros Ltd/Skyranger Aircraft
Aerospatiale
Aerospatiale Alenia
Aerosport Ltd
Aerostar
Aerostar Acft Corp Of Texas
Aerostar Aircraft Corporation
Aerostar International
Aerostar International Inc
Aerostar S A
Aerotechnik
Aerotek
Aerotek Inc
Aerotrek
Aerotrike
Ae

  df = pd.read_csv('data/cleaned_aviation_data_1.csv')  # Replace with your actual file path if different


In [2]:
# Create a standardization mapping for the most common manufacturers
make_mapping = {
    # Cessna variations
    'Cessna Aircraft': 'Cessna',
    'Cessna Aircraft Co': 'Cessna',
    'Cessna Aircraft Co.': 'Cessna',
    
    # Piper variations
    'Piper Aircraft': 'Piper',
    'Piper Aircraft Corporation': 'Piper',
    'Piper Aircraft Inc': 'Piper',
    'Piper Aircraft, Inc.': 'Piper',
    
    # Beech variations
    'Beech Aircraft Corp': 'Beech',
    'Beech Aircraft Corporation': 'Beech',
    'Beechcraft': 'Beech',
    'Beechcraft Corporation': 'Beech',
    
    # Bell variations
    'Bell Helicopter': 'Bell',
    'Bell Helicopter Textron': 'Bell',
    
    # Robinson variations
    'Robinson Helicopter': 'Robinson',
    'Robinson Helicopter Co': 'Robinson',
    'Robinson Helicopter Company': 'Robinson',
    
    # Air Tractor variations
    'Air Tractor': 'Air Tractor',
    'Air Tractor Inc': 'Air Tractor',
    'Air Tractor Inc.': 'Air Tractor',
    'Air Tractor, Inc.': 'Air Tractor',
    
    # Cirrus variations
    'Cirrus': 'Cirrus',
    'Cirrus Design': 'Cirrus',
    'Cirrus Design Corp': 'Cirrus',
    'Cirrus Design Corp.': 'Cirrus',
    'Cirrus Design Corporation': 'Cirrus',
    
    # Boeing variations
    'Boeing Company': 'Boeing',
    'Boeing-Stearman': 'Boeing',
    
    # Other common manufacturers
    'Aeronca Aircraft Corporation': 'Aeronca',
    'Bellanca': 'Bellanca',
    'Maule': 'Maule',
    'Schweizer': 'Schweizer',
    'Schweizer Aircraft Corp': 'Schweizer',
    'Hughes Helicopters Inc': 'Hughes',
    'Champion': 'Champion',
    'Luscombe': 'Luscombe',
    'Eurocopter': 'Eurocopter'
}

# Apply the standardization mapping
df['Make_Standardized'] = df['Make'].replace(make_mapping)

# Check results for the top makes to see if standardization worked
print("Top makes after standardization:")
print(df['Make_Standardized'].value_counts().head(20))

# Compare original vs standardized counts
print("\nComparison of top manufacturers before and after standardization:")
original_top = df['Make'].value_counts().head(10)
standardized_top = df['Make_Standardized'].value_counts().head(10)

comparison = pd.DataFrame({
    'Original Count': original_top,
    'Standardized Count': standardized_top.reindex(original_top.index).fillna(0).astype(int)
})
print(comparison)

Top makes after standardization:
Make_Standardized
Cessna         5206
Piper          3098
Beech          1113
Robinson        614
Bell            513
Air Tractor     345
Cirrus          308
Mooney          274
Boeing          192
Schweizer       179
Bellanca        173
Aeronca         164
Maule           161
Hughes          137
Champion        126
Luscombe        113
Eurocopter      111
Stinson         108
Vans            107
Grumman          97
Name: count, dtype: int64

Comparison of top manufacturers before and after standardization:
                     Original Count  Standardized Count
Make                                                   
Cessna                         5177                5206
Piper                          3066                3098
Beech                          1098                1113
Bell                            490                 513
Mooney                          274                 274
Robinson Helicopter             219                   0
Air Trac

In [3]:
# Check Model column values and their frequencies
model_counts = df['Model'].value_counts()

print(f"Number of unique values in Model column: {len(df['Model'].unique())}")
print("\nTop 30 most common Model values:")
print(model_counts.head(30))

# Look for potential model standardization issues
# First, convert to lowercase for comparison
df['Model_lower'] = df['Model'].str.lower() if df['Model'].dtype == 'object' else df['Model']

# Find models that might be the same but with different formatting
model_lower_counts = df['Model_lower'].value_counts()
potential_duplicates = []

# Check common models for variations
common_models = ['172', 'pa28', '182', 'sr22', 'r44']
for model in common_models:
    similar_models = [m for m in model_lower_counts.index if model in str(m).lower()]
    if len(similar_models) > 1:
        print(f"\nPotential variations of model {model}:")
        for similar in similar_models:
            count = model_lower_counts[similar]
            print(f"  {similar}: {count}")

# Check for models with numbers and letters that might need standardization
# For example, "172" vs "C-172" vs "Cessna 172"
print("\nSample of models that might need standardization:")
for make in ["Cessna", "Piper", "Beech"]:
    # Get standardized make
    std_make = make
    if make in make_mapping:
        std_make = make_mapping[make]
    
    # Filter dataset for this make
    make_data = df[df['Make_Standardized'] == std_make]
    
    print(f"\n{make} models:")
    print(make_data['Model'].value_counts().head(10))

Number of unique values in Model column: 4782

Top 30 most common Model values:
Model
172          614
PA28         229
182          225
152          211
SR22         204
172S         183
R44          180
180          177
172N         165
A36          163
R44 II       148
PA-18-150    143
172M         141
150          127
PA-28-140    117
R22 BETA     114
206          106
140           96
172P          96
R22           93
206B          91
PA18          84
170B          81
7AC           79
M20J          77
7GCBC         76
A185F         76
PA-28-180     74
PA-28-161     73
269C          72
Name: count, dtype: int64

Potential variations of model 172:
  172: 614
  172s: 183
  172n: 165
  172m: 141
  172p: 96
  172r: 71
  172rg: 44
  172k: 38
  172h: 28
  172f: 23
  r172k: 22
  172l: 19
  172d: 14
  172g: 14
  172e: 13
  172b: 12
  172i: 11
  172a: 11
  r172: 8
  r172e: 6
  172sp: 6
  172c: 6
  172 - r: 3
  172 - n: 2
  172 - s: 2
  p172d: 2
  172 - h: 1
  172q: 1
  cessna fr172: 1
  ce-1

In [4]:
# Create model family standardization
def standardize_model_family(model):
    model_str = str(model).upper().strip()
    
    # Cessna families
    if '172' in model_str:
        return 'Cessna 172 Family'
    elif '182' in model_str:
        return 'Cessna 182 Family'
    elif '152' in model_str:
        return 'Cessna 152 Family'
    elif '150' in model_str:
        return 'Cessna 150 Family'
    elif '180' in model_str and not 'PA' in model_str:  # Avoid matching PA-28-180
        return 'Cessna 180 Family'
    
    # Piper families
    elif any(x in model_str for x in ['PA28', 'PA-28']):
        return 'Piper PA-28 Family'
    elif any(x in model_str for x in ['PA18', 'PA-18']):
        return 'Piper PA-18 Family'
    elif any(x in model_str for x in ['PA32', 'PA-32']):
        return 'Piper PA-32 Family'
    
    # Robinson helicopters
    elif model_str.startswith('R44'):
        return 'Robinson R44 Family'
    elif model_str.startswith('R22'):
        return 'Robinson R22 Family'
    
    # Beechcraft families
    elif 'A36' in model_str:
        return 'Beech A36 Family'
    elif model_str == '58' or model_str.startswith('58 '):
        return 'Beech 58 Family'
    
    # If no match, return the original
    return model

# Create a more specific model standardization for variants
def standardize_model_variant(model, make):
    if not isinstance(model, str) or not isinstance(make, str):
        return model
    
    model_str = model.upper().strip()
    make_std = make.upper().strip()
    
    # Standardize Cessna 172 variants
    if '172' in model_str and 'CESS' in make_std:
        # Remove any "CESSNA" or "C-" prefix
        clean_model = model_str.replace('CESSNA ', '').replace('CE-', '').replace('C-', '')
        
        # Extract the base model and variant
        if 'RG' in clean_model:
            return '172RG'  # Retractable Gear version
        elif clean_model.startswith('172') and len(clean_model) > 3:
            base = '172'
            variant = clean_model.replace('172', '').strip('-. ')
            if variant:
                return f'172{variant[0]}'  # Take just the first character of the variant
            else:
                return '172'
        else:
            return '172'
    
    # Standardize Piper PA-28 variants
    elif any(x in model_str for x in ['PA28', 'PA-28']) and 'PIPER' in make_std:
        if 'ARROW' in model_str or 'R' in model_str:
            return 'PA-28R'  # Arrow models
        elif '161' in model_str:
            return 'PA-28-161'  # Warrior II
        elif '181' in model_str:
            return 'PA-28-181'  # Archer II/III
        elif '140' in model_str:
            return 'PA-28-140'  # Cruiser/Cherokee 140
        elif '180' in model_str:
            return 'PA-28-180'  # Cherokee 180
        else:
            return 'PA-28'
    
    # If no specific standardization, return the original
    return model

# Apply the standardization
df['Model_Family'] = df['Model'].apply(standardize_model_family)
df['Model_Standardized'] = df.apply(lambda x: standardize_model_variant(x['Model'], x['Make_Standardized']), axis=1)

# Check the results
print("Top Model Families:")
print(df['Model_Family'].value_counts().head(10))

print("\nTop Standardized Model Variants:")
print(df['Model_Standardized'].value_counts().head(10))

Top Model Families:
Model_Family
Cessna 172 Family      1566
Piper PA-28 Family      818
Cessna 150 Family       701
Cessna 182 Family       690
Cessna 180 Family       340
Robinson R44 Family     334
Piper PA-32 Family      284
Robinson R22 Family     240
Cessna 152 Family       218
SR22                    204
Name: count, dtype: int64

Top Standardized Model Variants:
Model_Standardized
172      652
PA-28    335
182      225
152      211
SR22     204
172S     192
R44      180
180      177
172N     168
A36      163
Name: count, dtype: int64


In [5]:
# Check Aircraft.Category values and frequencies
category_counts = df['Aircraft.Category'].value_counts()

print(f"Number of unique values in Aircraft.Category column: {len(df['Aircraft.Category'].unique())}")
print("\nAll Aircraft.Category values with counts:")
print(category_counts)

# Check for potential standardization issues by converting to lowercase
df['Category_lower'] = df['Aircraft.Category'].str.lower() if df['Aircraft.Category'].dtype == 'object' else df['Aircraft.Category']
category_lower_counts = df['Category_lower'].value_counts()

# Check for similar categories that might need standardization
print("\nPotential category standardization issues:")
similar_categories = {}
for category in category_lower_counts.index:
    if not isinstance(category, str):
        continue
    # Look for similar categories
    for other_category in category_lower_counts.index:
        if not isinstance(other_category, str):
            continue
        if category != other_category and (category in other_category or other_category in category):
            if category not in similar_categories:
                similar_categories[category] = []
            if other_category not in similar_categories[category]:
                similar_categories[category].append(other_category)

# Print potential similar categories
for category, similar in similar_categories.items():
    if similar:
        print(f"Category '{category}' might be similar to: {', '.join(similar)}")
        print(f"  '{category}' count: {category_lower_counts[category]}")
        for s in similar:
            print(f"  '{s}' count: {category_lower_counts[s]}")
        print()

# Create a standardization mapping for Aircraft.Category
category_mapping = {
    # Add your mappings here based on the analysis
}

# Apply the standardization
if category_mapping:
    df['Aircraft.Category_Standardized'] = df['Aircraft.Category'].replace(category_mapping)
    
    # Check the results
    print("Standardized Aircraft.Category values with counts:")
    print(df['Aircraft.Category_Standardized'].value_counts())

Number of unique values in Aircraft.Category column: 13

All Aircraft.Category values with counts:
Aircraft.Category
Airplane             17420
Helicopter            2070
Glider                 349
Weight-Shift           158
Gyrocraft              136
Balloon                 87
Powered Parachute       87
Ultralight              18
WSFT                     7
Unknown                  4
Rocket                   1
Blimp                    1
ULTR                     1
Name: count, dtype: int64

Potential category standardization issues:
Category 'ultralight' might be similar to: ultr
  'ultralight' count: 18
  'ultr' count: 1

Category 'ultr' might be similar to: ultralight
  'ultr' count: 1
  'ultralight' count: 18



In [6]:
# Create standardization mapping for Aircraft.Category
category_mapping = {
    # Handle abbreviations
    'ULTR': 'Ultralight',
    'WSFT': 'Weight-Shift',
    
    # Ensure consistent capitalization if needed
    'Unknown': 'Unknown'  # This is a placeholder example
}

# Apply the standardization
df['Aircraft.Category_Standardized'] = df['Aircraft.Category'].replace(category_mapping)

# Check the results
print("Standardized Aircraft.Category values with counts:")
print(df['Aircraft.Category_Standardized'].value_counts())

# Check how the standardization affected counts
print("\nComparison of Aircraft Categories before and after standardization:")
original_category_counts = df['Aircraft.Category'].value_counts()
standard_category_counts = df['Aircraft.Category_Standardized'].value_counts()

# Create a comparison dataframe for categories that changed
category_comparison = pd.DataFrame({
    'Original Count': original_category_counts,
    'Standardized Count': standard_category_counts.reindex(original_category_counts.index).fillna(0).astype(int)
})
print(category_comparison)

Standardized Aircraft.Category values with counts:
Aircraft.Category_Standardized
Airplane             17420
Helicopter            2070
Glider                 349
Weight-Shift           165
Gyrocraft              136
Balloon                 87
Powered Parachute       87
Ultralight              19
Unknown                  4
Rocket                   1
Blimp                    1
Name: count, dtype: int64

Comparison of Aircraft Categories before and after standardization:
                   Original Count  Standardized Count
Aircraft.Category                                    
Airplane                    17420               17420
Helicopter                   2070                2070
Glider                        349                 349
Weight-Shift                  158                 165
Gyrocraft                     136                 136
Balloon                        87                  87
Powered Parachute              87                  87
Ultralight                     18       

In [7]:
# Check Engine.Type values and frequencies
engine_counts = df['Engine.Type'].value_counts()

print(f"Number of unique values in Engine.Type column: {len(df['Engine.Type'].unique())}")
print("\nAll Engine.Type values with counts:")
print(engine_counts)

# Check for potential standardization issues by converting to lowercase
df['Engine_lower'] = df['Engine.Type'].str.lower() if df['Engine.Type'].dtype == 'object' else df['Engine.Type']
engine_lower_counts = df['Engine_lower'].value_counts()

# Check for similar engine types that might need standardization
print("\nPotential engine type standardization issues:")
similar_engines = {}
for engine in engine_lower_counts.index:
    if not isinstance(engine, str):
        continue
    # Look for similar engine types
    for other_engine in engine_lower_counts.index:
        if not isinstance(other_engine, str):
            continue
        if engine != other_engine and (engine in other_engine or other_engine in engine):
            if engine not in similar_engines:
                similar_engines[engine] = []
            if other_engine not in similar_engines[engine]:
                similar_engines[engine].append(other_engine)

# Print potential similar engine types
for engine, similar in similar_engines.items():
    if similar:
        print(f"Engine type '{engine}' might be similar to: {', '.join(similar)}")
        print(f"  '{engine}' count: {engine_lower_counts[engine]}")
        for s in similar:
            print(f"  '{s}' count: {engine_lower_counts[s]}")
        print()

# Create a standardization mapping for Engine.Type
engine_mapping = {
    # You'll define mappings here based on the analysis results
}

# Apply the standardization if you've defined mappings
if engine_mapping:
    df['Engine.Type_Standardized'] = df['Engine.Type'].replace(engine_mapping)
    
    # Check the results
    print("Standardized Engine.Type values with counts:")
    print(df['Engine.Type_Standardized'].value_counts())

Number of unique values in Engine.Type column: 13

All Engine.Type values with counts:
Engine.Type
Reciprocating      15735
Turbo Prop           990
Turbo Shaft          915
Turbo Fan            365
Turbo Jet             90
Unknown               16
Electric               8
NONE                   2
Hybrid Rocket          1
Geared Turbofan        1
LR                     1
UNK                    1
Name: count, dtype: int64

Potential engine type standardization issues:
Engine type 'unknown' might be similar to: unk
  'unknown' count: 16
  'unk' count: 1

Engine type 'unk' might be similar to: unknown
  'unk' count: 1
  'unknown' count: 16



In [8]:
# Create standardization mapping for Engine.Type
engine_mapping = {
    # Handle unknown values
    'UNK': 'Unknown',
    
    # Standardize turbine engine terminology
    'Turbo Prop': 'Turboprop',
    'Turbo Shaft': 'Turboshaft',
    'Turbo Fan': 'Turbofan',
    'Turbo Jet': 'Turbojet',
    'Geared Turbofan': 'Turbofan',
    
    # Standardize other uncommon types
    'NONE': 'None',
    'LR': 'Unknown'  # Unless you know what LR specifically refers to
}

# Apply the standardization
df['Engine.Type_Standardized'] = df['Engine.Type'].replace(engine_mapping)

# Check the results
print("Standardized Engine.Type values with counts:")
print(df['Engine.Type_Standardized'].value_counts())

# Check how the standardization affected counts
print("\nComparison of Engine Types before and after standardization:")
original_counts = df['Engine.Type'].value_counts()
standard_counts = df['Engine.Type_Standardized'].value_counts()

# Create a comparison dataframe
engine_comparison = pd.DataFrame({
    'Original Count': original_counts,
    'Standardized Count': standard_counts.reindex(original_counts.index).fillna(0).astype(int)
})
print(engine_comparison)

Standardized Engine.Type values with counts:
Engine.Type_Standardized
Reciprocating    15735
Turboprop          990
Turboshaft         915
Turbofan           366
Turbojet            90
Unknown             18
Electric             8
None                 2
Hybrid Rocket        1
Name: count, dtype: int64

Comparison of Engine Types before and after standardization:
                 Original Count  Standardized Count
Engine.Type                                        
Reciprocating             15735               15735
Turbo Prop                  990                   0
Turbo Shaft                 915                   0
Turbo Fan                   365                   0
Turbo Jet                    90                   0
Unknown                      16                  18
Electric                      8                   8
NONE                          2                   0
Hybrid Rocket                 1                   1
Geared Turbofan               1                   0
LR         

In [9]:
# Check Aircraft.damage values and frequencies
damage_counts = df['Aircraft.damage'].value_counts()

print(f"Number of unique values in Aircraft.damage column: {len(df['Aircraft.damage'].unique())}")
print("\nAll Aircraft.damage values with counts:")
print(damage_counts)

# Check for potential standardization issues by converting to lowercase
df['Damage_lower'] = df['Aircraft.damage'].str.lower() if df['Aircraft.damage'].dtype == 'object' else df['Aircraft.damage']
damage_lower_counts = df['Damage_lower'].value_counts()

# Check for similar damage categories that might need standardization
print("\nPotential damage category standardization issues:")
similar_damages = {}
for damage in damage_lower_counts.index:
    if not isinstance(damage, str):
        continue
    # Look for similar damage categories
    for other_damage in damage_lower_counts.index:
        if not isinstance(other_damage, str):
            continue
        if damage != other_damage and (damage in other_damage or other_damage in damage):
            if damage not in similar_damages:
                similar_damages[damage] = []
            if other_damage not in similar_damages[damage]:
                similar_damages[damage].append(other_damage)

# Print potential similar damage categories
for damage, similar in similar_damages.items():
    if similar:
        print(f"Damage category '{damage}' might be similar to: {', '.join(similar)}")
        print(f"  '{damage}' count: {damage_lower_counts[damage]}")
        for s in similar:
            print(f"  '{s}' count: {damage_lower_counts[s]}")
        print()

Number of unique values in Aircraft.damage column: 4

All Aircraft.damage values with counts:
Aircraft.damage
Substantial    17965
Destroyed       1917
Minor            417
Unknown           40
Name: count, dtype: int64

Potential damage category standardization issues:


In [10]:
# Check Weather.Condition values and frequencies
weather_counts = df['Weather.Condition'].value_counts()

print(f"Number of unique values in Weather.Condition column: {len(df['Weather.Condition'].unique())}")
print("\nAll Weather.Condition values with counts:")
print(weather_counts)

# Check for potential standardization issues by converting to lowercase
df['Weather_lower'] = df['Weather.Condition'].str.lower() if df['Weather.Condition'].dtype == 'object' else df['Weather.Condition']
weather_lower_counts = df['Weather_lower'].value_counts()

# Check for similar weather conditions that might need standardization
print("\nPotential weather condition standardization issues:")
similar_conditions = {}
for condition in weather_lower_counts.index:
    if not isinstance(condition, str):
        continue
    # Look for similar weather conditions
    for other_condition in weather_lower_counts.index:
        if not isinstance(other_condition, str):
            continue
        if condition != other_condition and (condition in other_condition or other_condition in condition):
            if condition not in similar_conditions:
                similar_conditions[condition] = []
            if other_condition not in similar_conditions[condition]:
                similar_conditions[condition].append(other_condition)

# Print potential similar weather conditions
for condition, similar in similar_conditions.items():
    if similar:
        print(f"Weather condition '{condition}' might be similar to: {', '.join(similar)}")
        print(f"  '{condition}' count: {weather_lower_counts[condition]}")
        for s in similar:
            print(f"  '{s}' count: {weather_lower_counts[s]}")
        print()

Number of unique values in Weather.Condition column: 3

All Weather.Condition values with counts:
Weather.Condition
VMC    19232
IMC      944
Unk      163
Name: count, dtype: int64

Potential weather condition standardization issues:


In [11]:
# Check Broad.phase.of.flight values and frequencies
flight_phase_counts = df['Broad.phase.of.flight'].value_counts()

print(f"Number of unique values in Broad.phase.of.flight column: {len(df['Broad.phase.of.flight'].unique())}")
print("\nAll Broad.phase.of.flight values with counts:")
print(flight_phase_counts)

# Check for potential standardization issues by converting to lowercase
df['Flight_phase_lower'] = df['Broad.phase.of.flight'].str.lower() if df['Broad.phase.of.flight'].dtype == 'object' else df['Broad.phase.of.flight']
flight_phase_lower_counts = df['Flight_phase_lower'].value_counts()

# Check for similar flight phases that might need standardization
print("\nPotential flight phase standardization issues:")
similar_phases = {}
for phase in flight_phase_lower_counts.index:
    if not isinstance(phase, str):
        continue
    # Look for similar flight phases
    for other_phase in flight_phase_lower_counts.index:
        if not isinstance(other_phase, str):
            continue
        if phase != other_phase and (phase in other_phase or other_phase in phase):
            if phase not in similar_phases:
                similar_phases[phase] = []
            if other_phase not in similar_phases[phase]:
                similar_phases[phase].append(other_phase)

# Print potential similar flight phases
for phase, similar in similar_phases.items():
    if similar:
        print(f"Flight phase '{phase}' might be similar to: {', '.join(similar)}")
        print(f"  '{phase}' count: {flight_phase_lower_counts[phase]}")
        for s in similar:
            print(f"  '{s}' count: {flight_phase_lower_counts[s]}")
        print()

Number of unique values in Broad.phase.of.flight column: 6

All Broad.phase.of.flight values with counts:
Broad.phase.of.flight
Landing    30
Takeoff    25
Cruise     23
Other      13
Unknown     4
Name: count, dtype: int64

Potential flight phase standardization issues:


In [18]:
# Save to CSV
yearly_safety.to_csv('yearly_safety_summary.csv', index=False)
