# Imports & Setup

In [1]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akshatsoni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akshatsoni/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/akshatsoni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#laod file
df = pd.read_csv("AFE.csv")

In [4]:
df[['Narrative', 'Root_Cause_Keywords']].head()

Unnamed: 0,Narrative,Root_Cause_Keywords
0,"Pump failed to respond, irregular cycling obse...","pump, hydraulic"
1,Compressor cycling irregularly; passenger disc...,"compressor, a/c"
2,Bus Tie fault reported; unexpected shutdown le...,"bus_tie, electrical"
3,Reports of checklist execution due to Ventilat...,"ventilation_fan, a/c"
4,Flight Computer produced erroneous reading...,"flight_computer, avionics"


## Basic Text Cleaning

In [115]:
# initial tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [116]:
# cleaning function
def clean_text(text):
    if pd.isna(text):
        return ""
    
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) 
              for word in tokens 
              if word not in stop_words and len(word) > 2]
    
    return " ".join(tokens)

In [118]:
#testing the function
clean_text(df['Narrative'].iloc[1])

'compressor cycling irregularly passenger discomfort passenger discomfort climb'

In [119]:
#create anothe col named clean_narr..
df['Clean_Narrative'] = df['Narrative'].apply(clean_text)

In [120]:
df[['Narrative', 'Clean_Narrative']].sample(10)

Unnamed: 0,Narrative,Clean_Narrative
118954,Electrical smell and short circuit near Batter...,electrical smell short circuit near battery ma...
89269,Bus Tie fault reported; irregular cycling led ...,bus tie fault reported irregular cycling led c...
42294,AC/heating issue: Ventilation Fan pressure los...,heating issue ventilation fan pressure loss cr...
86366,Power Bus fault reported; voltage drop led to ...,power bus fault reported voltage drop led avio...
150187,Reports of avionics warning due to Ventilation...,report avionics warning due ventilation fan pr...
93833,Crew noted reduced thrust after Wiring Harness...,crew noted reduced thrust wiring harness showe...
7254,Brake Unit actuator showed voltage drop during...,brake unit actuator showed voltage drop descen...
104421,"During Descent, Wiring Harness experienced flu...",descent wiring harness experienced fluctuation...
145192,Crew noted manual reset after Bus Tie showed f...,crew noted manual reset bus tie showed fluctua...
68166,Battery fault reported; unexpected shutdown le...,battery fault reported unexpected shutdown led...


## Domain Keyword Normalization

In [121]:
FAULT_VOCAB = {
    'electrical': {
        'wire', 'wiring', 'cable', 'connector', 'short', 'shorting',
        'electrical', 'power', 'bus', 'circuit', 'breaker', 'voltage'
    },
    'air_conditioning': {
        'ac', 'air', 'conditioning', 'cooling', 'ventilation',
        'pressurization', 'temperature', 'bleed'
    },
    'hydraulic': {
        'hydraulic', 'fluid', 'leak', 'pressure', 'actuator', 'pump', 'valve'
    },
    'engine': {
        'engine', 'turbine', 'compressor', 'fan', 'rpm', 'vibration'
    },
    'sensor': {
        'sensor', 'probe', 'indicator', 'warning', 'fault', 'signal'
    }
}
#keywords map

In [122]:
# tokenizer helper
def tokenize(text):
    if not text:
        return set()
    return set(text.split())

In [123]:
def score_fault_type(clean_text):
    tokens = tokenize(clean_text)
    
    scores = {}
    for fault_type, vocab in FAULT_VOCAB.items():
        scores[fault_type] = len(tokens.intersection(vocab))
    
    # pick highest scoring fault type
    best_fault = max(scores, key=scores.get)
    
    # confidence threshold (VERY IMPORTANT)
    if scores[best_fault] >= 2:
        return best_fault
    else:
        return 'other'

In [124]:
df['Primary_Fault_Type'] = df['Clean_Narrative'].apply(score_fault_type)

In [125]:
df['Primary_Fault_Type'].value_counts()

Primary_Fault_Type
other         100625
electrical     35639
hydraulic       9281
sensor          4755
Name: count, dtype: int64

In [126]:
df['Clean_Root_Cause'] = df['Root_Cause_Keywords'].apply(clean_text)

In [127]:
def consistency_check(row):
    if row['Primary_Fault_Type'] == 'other':
        return 0
    
    root_tokens = tokenize(row['Clean_Root_Cause'])
    fault_vocab = FAULT_VOCAB.get(row['Primary_Fault_Type'], set())
    
    return int(len(root_tokens.intersection(fault_vocab)) >= 1)

In [128]:
df['Consistency_Flag'] = df.apply(consistency_check, axis=1)

In [53]:
consistency_score = df['Consistency_Flag'].mean()*100
consistency_score

56.5395874916833

In [130]:
df.groupby('Primary_Fault_Type')['Consistency_Flag'].mean().sort_values(ascending=False)

Primary_Fault_Type
hydraulic     0.939662
electrical    0.829232
sensor        0.613249
other         0.000000
Name: Consistency_Flag, dtype: float64

In [131]:
df

Unnamed: 0,Report_ID,Date,Aircraft_Make,Aircraft_Model,Flight_Phase,System,Component,Severity,Location,Narrative,Root_Cause_Keywords,Model_Age,Clean_Narrative,Primary_Fault_Type,Clean_Root_Cause,Consistency_Flag
0,4059770,2021-01-18,Airbus,A350,Cruise,Hydraulic,Pump,Low,NJ,"Pump failed to respond, irregular cycling obse...","pump, hydraulic",14,pump failed respond irregular cycling observed...,other,pump hydraulic,0
1,4021362,2016-07-03,Embraer,E190,Climb,A/C (Environmental),Compressor,Moderate,GA,Compressor cycling irregularly; passenger disc...,"compressor, a/c",21,compressor cycling irregularly passenger disco...,other,compressor,0
2,4127324,2010-09-01,Airbus,A320,Climb,Electrical,Bus Tie,Low,AZ,Bus Tie fault reported; unexpected shutdown le...,"bus_tie, electrical",25,bus tie fault reported unexpected shutdown led...,other,bus tie electrical,0
3,4140509,2024-10-25,Boeing,777-300,Taxi,A/C (Environmental),Ventilation Fan,Moderate,MD,Reports of checklist execution due to Ventilat...,"ventilation_fan, a/c",30,report checklist execution due ventilation fan...,other,ventilation fan,0
4,4144297,2024-04-19,ATR,ATR72,Landing,Avionics,Flight Computer,Low,CO,Flight Computer produced erroneous reading...,"flight_computer, avionics",29,flight computer produced erroneous reading ove...,other,flight computer avionics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150295,4040287,2017-03-21,Airbus,A330,Cruise,Electrical,Circuit Breaker,Low,NJ,Circuit Breaker fault reported; irregular cycl...,"circuit_breaker, electrical",21,circuit breaker fault reported irregular cycli...,electrical,circuit breaker electrical,1
150296,4022943,2011-11-16,Embraer,E175,Cruise,Electrical,Bus Tie,Moderate,IL,"During Cruise, Bus Tie experienced leak, crew ...","bus_tie, electrical",7,cruise bus tie experienced leak crew observed ...,other,bus tie electrical,0
150297,4012699,2023-07-20,ATR,ATR72,Descent,A/C (Environmental),Temperature Sensor,Low,MD,Temperature Sensor cycling irregularly; avioni...,"temperature_sensor, a/c",29,temperature sensor cycling irregularly avionic...,sensor,temperature sensor,1
150298,4086513,2010-02-06,Bombardier,Q400,Descent,Electrical,Circuit Breaker,Low,TX,"During Descent, Circuit Breaker experienced sh...","circuit_breaker, electrical",13,descent circuit breaker experienced short circ...,electrical,circuit breaker electrical,1


In [132]:
nlp_df = df[
    [
        'Report_ID',
        'System',
        'Component',
        'Flight_Phase',
        'Model_Age',
        'Severity',
        'Clean_Narrative',
        'Primary_Fault_Type'
    ]
]

nlp_df.to_csv("AFE_nlp.csv", index=False)

# NLP part is DoNe!