# Transform the data for use in an RNN

**Note:** All company-specific file names, column names, and identifiers have been obfuscated or generalized to protect proprietary information while maintaining the analytical structure of the code.

In [None]:
import os
import pandas as pd
import re
import numpy as np
from scipy.stats import zscore


# Load every raw CSV export into a DataFrame, coerce text columns to either
# datetimes or pandas "string" dtype, and print a short dtype summary per file.
raw_data_dir = os.path.join('private', 'data', 'raw')

# Raw CSV exports to load (file name without extension + '_df' becomes the key)
file_names = [
    'partslist.csv',
    'merged_rmaorders.csv',
    'hist_repair_rma.csv',
    'flights.csv',
    'flightresets.csv',
    'passenger_count.csv',
    'mtbf.csv',
    'productinfo.csv',
]

# Loaded frames, keyed by '<file stem>_df'
dataframes = {}
pd.reset_option('display.float_format')

for file_name in file_names:
    df_name = file_name.split('.')[0] + '_df'
    # `infer_datetime_format` was dropped: it is deprecated since pandas 2.0
    # (strict format inference is now the default) and only emits a warning.
    # NOTE(review): per the pandas docs `parse_dates=True` only tries the index;
    # the per-column conversion is done by the loop below.
    dataframes[df_name] = pd.read_csv(
        os.path.join(raw_data_dir, file_name),
        parse_dates=True,
        low_memory=False,  # Avoid mixed type inference warnings
    )

    # Work on the stored frame directly so the conversions are kept
    df = dataframes[df_name]

    # Object columns become datetimes when every value parses, strings otherwise
    object_columns = df.select_dtypes(include=['object']).columns

    if len(object_columns) > 0:
        print(f"\nConverting object columns to datetimes or strings for {df_name}:")
        for col in object_columns:
            try:
                df[col] = pd.to_datetime(df[col], errors='raise')
            except (ValueError, TypeError, OverflowError):
                # Not a date column: only parse failures are caught here, so
                # Ctrl-C and genuine programming errors still surface.
                df[col] = df[col].astype("string")

    print(f"\nDataframe: {df_name}")
    print("\nColumn Types:")
    type_counts = df.dtypes.value_counts()
    for dtype, count in type_counts.items():
        print(f"  {dtype}: {count} columns")

    # Include tz-aware columns too, otherwise they are silently left out of the report
    datetime_columns = df.select_dtypes(include=['datetime64', 'datetimetz']).columns.tolist()
    if datetime_columns:
        print("\nDatetime Columns:")
        for col in datetime_columns:
            print(f"  - {col}")

# Bind each loaded frame to a descriptive module-level name
(
    inflight_parts_df,
    rma_df,
    hist_rma_df,
    flight_data_df,
    flightresets_df,
    flightpassengers_df,
    mtbf_df,
    productinfo_df,
) = (
    dataframes[frame_key]
    for frame_key in (
        'partslist_df',
        'merged_rmaorders_df',
        'hist_repair_rma_df',
        'flights_df',
        'flightresets_df',
        'passenger_count_df',
        'mtbf_df',
        'productinfo_df',
    )
)

# Replace manager names with integer category codes to preserve privacy
manager_as_category = productinfo_df['product_manager'].astype('category')
productinfo_df['product_manager'] = manager_as_category.cat.codes

## Fix Issues as they are discovered
# A missing tail only counts when the part is not flagged 'Not in Service Yet'
tail_is_null = inflight_parts_df['Tail'].isnull()
status_is_other = inflight_parts_df['status'] != 'Not in Service Yet'
actual_missing_tails = inflight_parts_df[tail_is_null & status_is_other]
print(f"inflight_parts_df: Actual missing tails: {len(actual_missing_tails)}")

# A historical RMA row counts as missing only when both received dates are null
no_received_date = hist_rma_df['ReceivedDate'].isnull()
no_partner_date = hist_rma_df['ReceivedAtPartner'].isnull()
actual_missing_hist_ship_dates = hist_rma_df[no_received_date & no_partner_date]
missing_hist_count = len(actual_missing_hist_ship_dates)
print(f"\nhist_rma_df: Actual missing hist_ship_dates: {missing_hist_count}")
missing_hist_pct = missing_hist_count / len(hist_rma_df) * 100
print(f" actual hist_ship_dates: {missing_hist_count} missing ({missing_hist_pct:.1f}%)")

# Drop 'unused' fields (in place, so the entries in `dataframes` stay in sync)
rma_df.drop(
    columns=[
        'Message',
        'Subject',
        'Progress',
        'Solution',
        'AircraftType',
        'FinalDocRevision',
        'AircraftTailSerialNumber',
        'AircraftTailNumber',
    ],
    inplace=True,
)
# Drop sensitive customer/contact information columns, skipping any that are absent
sensitive_columns = ['ContactPersonPhone', 'ContactPersonEmail', 'CustAccount', 'Personnel_Number', 'SvcCallInitiator', 'CustAccountUser', 'CustAccountOwner']
rma_df.drop(
    columns=[name for name in sensitive_columns if name in rma_df.columns],
    inplace=True,
)
rma_missing = rma_df.isnull().sum()
if rma_missing.any():
    print("\nNew RMA Columns with missing values:")
    rma_row_total = len(rma_df)
    for col, count in rma_missing[rma_missing > 0].items():
        print(f"  {col}: {count} missing ({count/rma_row_total*100:.1f}%)")

hist_rma_df.drop(
    columns=[
        'ServiceBulletinInfo',
        'ServiceBulletinNumber',
        'ServiceBulletin',
        'AlertCategoryCode',
    ],
    inplace=True,
)
hist_rma_missing = hist_rma_df.isnull().sum()
if hist_rma_missing.any():
    print("\nNew Hist-RMA Columns with missing values:")
    hist_row_total = len(hist_rma_df)
    for col, count in hist_rma_missing[hist_rma_missing > 0].items():
        print(f"  {col}: {count} missing ({count/hist_row_total*100:.1f}%)")

productinfo_df.drop(
    columns=[
        'productgroup',
        'actualdate',
        'notes',
        'milestonedate',
        'milestonestatus',
        'milestone',
        'functionalspec',  # Generic spec field
        'conformitydescription',
    ],
    inplace=True,
)

# Convert datetime fields the loader left as text; unparseable values become NaT
print("Converting Missed Datetime fields")
for flights_frame in (flight_data_df, flightresets_df):
    for time_col in ('FlightStartTime', 'FlightEndTime'):
        flights_frame[time_col] = pd.to_datetime(
            flights_frame[time_col], format='ISO8601', errors='coerce'
        )
for date_col in ('ReceivedDate', 'ShipDate'):
    hist_rma_df[date_col] = pd.to_datetime(hist_rma_df[date_col], errors='coerce')

# Get rid of insert/update bookkeeping dates
hist_rma_df.drop(columns=['InsertDate'], inplace=True)
flight_data_df.drop(columns=['FileCreatedTime', 'InsertDate'], inplace=True)
flightpassengers_df.drop(columns=['InsertDate'], inplace=True)
mtbf_df.drop(columns=['InsertDate', 'UpdateDate'], inplace=True)

# Report the earliest and latest value of every datetime column in every frame,
# strip timezone information, and replace placeholder dates (anything earlier
# than `min_valid_date`) with NaT.
print("Earliest and Latest Dates")
min_valid_date = pd.Timestamp('2012-01-01')  # loop-invariant cutoff
for df_name, df in dataframes.items():
    print(f"\n{df_name}")

    # `is_datetime64_any_dtype` matches both naive and tz-aware columns; the
    # previous `is_datetime64_dtype` check skipped tz-aware ones, so the
    # timezone branch below could never run.
    datetime_cols = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col])]

    for col in datetime_cols:
        print(f"  {col} (before): {df[col].min()} - {df[col].max()}")
        # Drop the timezone (wall-clock time is kept) so the column can be
        # compared with the naive cutoff timestamp.
        if getattr(df[col].dtype, 'tz', None) is not None:
            df[col] = df[col].dt.tz_localize(None)
            print(f"Removed timezone from {col}")
        # Replace dates before min_valid_date with NaT
        too_early = df[col] < min_valid_date
        invalid_count = too_early.sum()
        df.loc[too_early, col] = pd.NaT
        print(f"  {col} (after): {df[col].min()} - {df[col].max()} ({invalid_count} values replaced with NaT)")

# Installed parts are left-joined to product info on the part number
merged_inflight_parts_df = inflight_parts_df.merge(
    productinfo_df,
    how='left',
    left_on='PartNumber',
    right_on='part_number',
)

# Unified column name -> source column in the current RMA export
rma_field_sources = {
    'rma_number': 'ServiceRequestId',
    'part_number': 'ItemId',
    'serial_number': 'SerialId',
    'status': 'RepairStatus',
    'received_date': 'UnitReceivedDate',
    'ship_date': 'ActualShipDate',
    'warranty_end_date': 'WarrantyEndDate',
    'part_description': 'Description',
    'fault_code': 'ComplaintId',
    'workshop_location': 'RepairLocation',
    'flight_hours': 'FlightHours',
    'return_reason': 'ReturnReason',
}

# Unified column name -> source column in the historical RMA export
hist_field_sources = {
    'rma_number': 'RMA',
    'part_number': 'PN',
    'serial_number': 'SN',
    'status': 'StatusDescription',
    'received_date': 'ReceivedDate',
    'ship_date': 'ShipDate',
    'part_description': 'PartDescription',
    'fault_code': 'FaultCode',
    'lru_name': 'LRUName',
    'return_reason': 'FaultCode',
}

rma_standardized = rma_df.copy()
rma_standardized['source'] = 'current_rma'
rma_standardized['customer'] = 'ANONYMIZED_CUSTOMER'  # Customer info anonymized
rma_standardized['lru_name'] = None  # No direct match in current RMA
for unified_name, raw_name in rma_field_sources.items():
    rma_standardized[unified_name] = rma_standardized[raw_name]

hist_standardized = hist_rma_df.copy()
hist_standardized['source'] = 'historical_rma'
hist_standardized['customer'] = 'ANONYMIZED_CUSTOMER'  # Customer info anonymized
for unified_name, raw_name in hist_field_sources.items():
    hist_standardized[unified_name] = hist_standardized[raw_name]
# Fields with no equivalent in the historical data
for absent_name in ('workshop_location', 'flight_hours', 'warranty_end_date'):
    hist_standardized[absent_name] = None

# Shared schema, in output column order
final_columns = [
    'source', 'rma_number', 'part_number', 'serial_number', 'customer',
    'status', 'received_date', 'ship_date', 'part_description',
    'fault_code', 'lru_name', 'workshop_location', 'flight_hours',
    'return_reason', 'warranty_end_date'
]

rma_for_concat = rma_standardized[final_columns]
hist_for_concat = hist_standardized[final_columns]

# Stack current rows above historical rows under a fresh 0..n-1 index
combined_rma_df = pd.concat([rma_for_concat, hist_for_concat], ignore_index=True)

# Build merged_flight_data_df: flights left-joined with reset counts and with
# passenger counts (both on FlightID), de-duplicated columns, coalesced
# start/end times and a derived FlightDuration.
print("flight_data_df columns:", flight_data_df.columns.tolist())
print("flightresets_df columns:", flightresets_df.columns.tolist())
print('flightresets_df', flightresets_df[flightresets_df['FlightStartTime'] > '2023-01-01'].head())

# Columns present in both frames (other than the key) get _x (flights) and
# _y (resets) suffixes from this merge.
merged_flight_data_df = flight_data_df.merge(flightresets_df, how='left', on='FlightID')

# Fix float fields that should be integers
# (nullable Int64 keeps missing values; presumably RawResets is float because
# unmatched flights have no reset row — TODO confirm)
merged_flight_data_df['RawResets'] = pd.array(
    merged_flight_data_df['RawResets'].to_numpy(), 
    dtype=pd.Int64Dtype()
)

print("merged_flight_data_df columns:", merged_flight_data_df.columns.tolist())

print("Unique RawResets values in flightresets_df:")
print(flightresets_df['RawResets'].unique())
# Print all unique values of RawResets in merged_flight_data_df
print("Unique RawResets values in merged_flight_data_df:")
print(merged_flight_data_df['RawResets'].unique())

# It appears that this merge is not working, 
# check for common flight ids between the two dataframes
common_flight_ids = set(flight_data_df['FlightID']).intersection(set(flightresets_df['FlightID']))
print(f"Common Flight IDs: {len(common_flight_ids)} of {len(flight_data_df)} and {len(flightresets_df)}")

# Second left join: attach the passenger-count columns for each flight
merged_flight_data_df = merged_flight_data_df.merge(flightpassengers_df, how='left', on='FlightID'  )

# Drop all duplicate columns from the merged_flight_data_df
# (drop() raises KeyError if any listed column is absent)
columns_to_drop = [
    # Duplicates from flightresets_df merge
    'Airline_y', 
    'DepartureCode_y', 
    'ArrivalCode_y', 
    'FlightNumber_y', 
    'TailNumber_y', 
    'AircraftType_y',
    
    # Duplicates from flightpassengers_df merge (keeping the passenger data)
    'asset_id',  # Keep the original 'TailNumber_x'
    'FlightNumber',  # Keep the original 'FlightNumber_x'
    'DepartureCode',  # Keep the original 'DepartureCode_x'
    'ArrivalCode',  # Keep the original 'ArrivalCode_x'
    'FlightStartTime',  # Keep the original 'FlightStartTime_x'
    'FlightEndTime'  # Keep the original 'FlightEndTime_x'
]

# Drop these columns
merged_flight_data_df = merged_flight_data_df.drop(columns_to_drop, axis=1)

# Rename the remaining columns to remove the _x suffix
# NOTE(review): the 'FlightID_x ' key has a trailing space, and both merges join
# on 'FlightID', so no 'FlightID_x' column is created; rename() ignores keys
# that match no column, so this entry currently has no effect.
columns_to_rename = {
    'FlightID_x ': 'FlightID',
    'Airline_x': 'Airline',
    'DepartureCode_x': 'DepartureCode',
    'ArrivalCode_x': 'ArrivalCode',
    'FlightStartTime_x': 'FlightStartTime',
    'FlightEndTime_x': 'FlightEndTime',
    'TailNumber_x': 'asset_id',
    'FlightNumber_x': 'FlightNumber',
    'AircraftType_x': 'AircraftType'
}

# After the rename, the start/end times are coalesced further down:
# between FlightStartTime_y and FlightStartTime keep the one that isn't null,
# between FlightEndTime_y and FlightEndTime keep the one that isn't null
merged_flight_data_df = merged_flight_data_df.rename(columns=columns_to_rename)
print("Merged_flight_data_df columns:", merged_flight_data_df.columns.tolist())

# combine_first prefers the _y (flightresets_df) value and falls back to the
# flight_data_df value when _y is null.
# NOTE(review): the *_y helper columns are not dropped afterwards and remain in the frame.
merged_flight_data_df['FlightStartTime'] = merged_flight_data_df['FlightStartTime_y'].combine_first(merged_flight_data_df['FlightStartTime'])
merged_flight_data_df['FlightEndTime'] = merged_flight_data_df['FlightEndTime_y'].combine_first(merged_flight_data_df['FlightEndTime'])
# Now I have 4 dataframes, merged_inflight_parts_df, combined_rma_df, merged_flight_data_df, mtbf_df

# Print date range
print("\nDate Range for Dataframes:")
print(merged_flight_data_df['FlightStartTime'].min(), "to", merged_flight_data_df['FlightEndTime'].max())

# Calculate flight duration; negative durations (end before start) are blanked to NaT
merged_flight_data_df['FlightDuration'] = merged_flight_data_df['FlightEndTime'] - merged_flight_data_df['FlightStartTime']
negative_duration = (merged_flight_data_df['FlightDuration'] < pd.Timedelta(0))
merged_flight_data_df.loc[negative_duration, 'FlightDuration'] = pd.NaT

print("Merged Flight Data Columns:", merged_flight_data_df.columns.tolist())
print(merged_flight_data_df.head())
# Print the first rows whose flight started after Jan 1 2023
print(merged_flight_data_df[merged_flight_data_df['FlightStartTime'] > '2023-01-01'].head())


# Analyze RMA return reasons: distinct values first, then their frequencies
return_reason_values = combined_rma_df['return_reason']
unique_return_reasons = return_reason_values.unique()
print(f"\nUnique Return Reasons ({len(unique_return_reasons)}):")
return_reason_counts = return_reason_values.value_counts()
print(return_reason_counts)

def categorize_aerospace_return_reason(reason):
    """Map a free-text or coded return reason to a coarse category label.

    The value is stringified, upper-cased and stripped, then tested against an
    ordered list of rules; the first rule that matches decides the label.

    Args:
        reason: Raw return-reason value; anything `pd.isna` treats as missing
            yields 'Unknown'.

    Returns:
        One category label such as 'V_Code', 'Display_Issue' or 'Miscellaneous'.
    """
    if pd.isna(reason):
        return 'Unknown'

    text = str(reason).upper().strip()

    # Coded reasons: V-codes take precedence over E-codes
    looks_like_v_code = (
        re.match(r'^V\d+$', text) is not None
        or text.startswith('|V')
        or re.search(r'V\d+,\s*V\d+', text) is not None
    )
    if looks_like_v_code:
        return 'V_Code'

    looks_like_e_code = (
        re.match(r'^E\d+$', text) is not None
        or text.startswith('|E')
        or 'E17-MB' in text
    )
    if looks_like_e_code:
        return 'E_Code'

    # No Fault Found / Could Not Confirm: exact code or spelled-out phrase
    if text in ('NFF', '|NFF|') or 'NO FAULT FOUND' in text:
        return 'No_Fault_Found'
    if text in ('CNC', '|CNC|') or 'COULD NOT CONFIRM' in text or 'CANNOT CONFIRM' in text:
        return 'Could_Not_Confirm'

    # Ordered substring rules: the first group with any keyword present wins.
    # NOTE(review): matching is by plain substring, so 'SW' also hits words such
    # as "SWOLLEN" and 'FAN' hits any word containing those letters.
    keyword_rules = (
        # Hardware issues
        (('PHYSICAL DAMAGE', 'BROKEN', 'CRACK', 'SCRATCH'), 'Physical_Damage'),
        (('DISPLAY', 'SCREEN', 'BLANK', 'BLACK SCREEN'), 'Display_Issue'),
        (('TOUCH', 'PHANTOM TOUCH'), 'Touch_Screen_Issue'),
        (('POWER', 'NO POWER', 'NOT POWER', 'WILL NOT TURN ON'), 'Power_Issue'),
        (('BOOT', 'NO START', 'WILL NOT BOOT'), 'Boot_Issue'),
        (('AUDIO', 'JACK'), 'Audio_Issue'),
        (('USB', 'GENERIC USB'), 'USB_Issue'),
        (('ETHERNET', 'NETWORK', 'CONNECT'), 'Network_Issue'),
        # Software issues
        (('SOFTWARE', 'CORRUPT', 'SW', 'FIRMWARE'), 'Software_Issue'),
        # Common specific failures
        (('MEZZ', 'BOARD', 'PCB', 'MAIN BOARD'), 'Board_Failure'),
        (('BATTERY',), 'Battery_Issue'),
        (('FAN',), 'Fan_Issue'),
        (('CRYPTO', 'SECURITY'), 'Security_Component_Issue'),
        (('DEMO', 'CERTIFICATION', 'RECERTIF'), 'Certification_Test'),
        # Catch-all for generic failure wording
        (('INOP', 'FAIL', 'FAULT'), 'General_Inoperative'),
    )
    for keywords, label in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return label

    # NOTE(review): 'V10' can never reach this point; the V-code rule above
    # already returns 'V_Code' for it.
    if text in ('OTHER', 'V10'):
        return 'Other'

    # Numeric codes and special codes
    if re.match(r'^\d+-\d+-\d+-\d+$', text) or re.match(r'^SPEC-', text):
        return 'Special_Code'

    # Anything else
    return 'Miscellaneous'

# Derive the coarse return-reason category for every RMA row
raw_return_reasons = combined_rma_df['return_reason']
combined_rma_df['return_reason_category'] = raw_return_reasons.apply(categorize_aerospace_return_reason)

# Standardize part numbers to upper case in every frame that carries them
for parts_frame, part_col in (
    (merged_inflight_parts_df, 'PartNumber'),
    (combined_rma_df, 'part_number'),
    (mtbf_df, 'PartNumber'),
):
    parts_frame[part_col] = parts_frame[part_col].str.upper()

# Repair duration = ship date minus received date; negative spans become NaT
turnaround = combined_rma_df['ship_date'] - combined_rma_df['received_date']
negative_duration = turnaround < pd.Timedelta(0)
combined_rma_df['repair_duration'] = turnaround.mask(negative_duration)

# Outlier handling shared by the dataframes: z-score filter for one column
def remove_outliers_zscore(df, column, threshold=3):
    """Return the rows of *df* whose *column* value is not a z-score outlier.

    Args:
        df: DataFrame to filter (not modified).
        column: Name of a numeric or timedelta column; timedeltas of any
            resolution are converted to seconds first.
        threshold: Rows are kept when ``abs(z) < threshold``.

    Returns:
        The filtered DataFrame. Rows whose value is missing are dropped, as
        before. A column whose non-missing values are all identical has no
        spread and therefore no outliers, so those rows are all kept (the
        z-score is undefined there and previously emptied the frame).
    """
    values = df[column]
    if pd.api.types.is_timedelta64_dtype(values):
        # Convert timedelta to seconds for z-score calculation
        values = values.dt.total_seconds()

    numeric = values.to_numpy(dtype='float64', na_value=np.nan)
    present = ~np.isnan(numeric)

    # Nothing to score, or zero spread: keep every row that has a value
    if not present.any() or numeric[present].max() == numeric[present].min():
        return df[present]

    abs_z_scores = np.abs(zscore(numeric, nan_policy='omit'))
    # NaN z-scores (missing values) compare False and are filtered out
    filtered_entries = abs_z_scores < threshold
    return df[filtered_entries]

# Make sure repair_duration is a timedelta column, then cap z-score outliers
# (|z| > 3) at mean +/- 3 standard deviations and report before/after statistics.
print(f"repair_duration dtype: {combined_rma_df['repair_duration'].dtype}")

# Convert repair_duration to timedelta when it is not one already
if not pd.api.types.is_timedelta64_dtype(combined_rma_df['repair_duration']):
    try:
        if combined_rma_df['repair_duration'].dtype == 'object' or pd.api.types.is_string_dtype(combined_rma_df['repair_duration']):
            combined_rma_df['repair_duration'] = pd.to_timedelta(combined_rma_df['repair_duration'])
        else:
            # Numeric input: a median below 1000 is read as days, otherwise as seconds
            if combined_rma_df['repair_duration'].median() < 1000:
                combined_rma_df['repair_duration'] = pd.to_timedelta(combined_rma_df['repair_duration'], unit='D')
                print("Converted numeric repair_duration to timedelta (assuming days)")
            else:
                combined_rma_df['repair_duration'] = pd.to_timedelta(combined_rma_df['repair_duration'], unit='s')
                print("Converted numeric repair_duration to timedelta (assuming seconds)")
    except Exception as e:
        # Best-effort fallback: keep going with a float "days" column instead
        print(f"Error converting repair_duration: {e}")
        combined_rma_df['repair_duration_days'] = np.nan

        # Try to calculate repair duration in days from received_date and ship_date
        mask = combined_rma_df['received_date'].notna() & combined_rma_df['ship_date'].notna()
        if mask.any():
            duration = (combined_rma_df.loc[mask, 'ship_date'] - combined_rma_df.loc[mask, 'received_date']).dt.total_seconds() / (24*60*60)
            combined_rma_df.loc[mask, 'repair_duration_days'] = duration

## Handle Outliers in repair duration
print("\nHandling outliers in repair duration...")
# Determine which repair duration column to use
if pd.api.types.is_timedelta64_dtype(combined_rma_df['repair_duration']):
    repair_col = 'repair_duration'
    repair_duration_df = combined_rma_df.dropna(subset=[repair_col])
    repair_duration_days = repair_duration_df[repair_col].dt.total_seconds() / (24 * 60 * 60)
    print("Using repair_duration column (timedelta)")
elif 'repair_duration_days' in combined_rma_df.columns:
    repair_col = 'repair_duration_days'
    repair_duration_df = combined_rma_df.dropna(subset=[repair_col])
    repair_duration_days = repair_duration_df[repair_col]
    print("Using repair_duration_days column (float)")
else:
    print("No valid repair duration column available")
    repair_duration_days = None

if repair_duration_days is not None:
    # Calculate statistics before handling outliers
    repair_mean_before = repair_duration_days.mean()
    repair_median_before = repair_duration_days.median()
    repair_max_before = repair_duration_days.max()
    repair_min_before = repair_duration_days.min()

    print(f"Repair duration before (in days): mean={repair_mean_before:.2f}, median={repair_median_before:.2f}, min={repair_min_before:.2f}, max={repair_max_before:.2f}")

    # Apply outlier handling using the z-score method on days
    z_scores = zscore(repair_duration_days, nan_policy='omit')
    abs_z_scores = np.abs(z_scores)
    threshold = 3

    # Identify outliers
    outliers = abs_z_scores > threshold
    outlier_indices = repair_duration_df.index[outliers]

    if len(outlier_indices) > 0:
        # Calculate the cap limits in days. scipy's zscore uses the population
        # standard deviation (ddof=0); pandas' std() defaults to ddof=1, which
        # gave slightly wider limits, so some flagged rows were counted as
        # capped but left unchanged. Use ddof=0 so both agree.
        mean_val = repair_duration_days.mean()
        std_val = repair_duration_days.std(ddof=0)
        upper_limit_days = mean_val + threshold * std_val
        lower_limit_days = max(0, mean_val - threshold * std_val)  # Ensure non-negative

        print(f"Identified {len(outlier_indices)} outliers")
        print(f"Upper limit: {upper_limit_days:.2f} days, Lower limit: {lower_limit_days:.2f} days")

        # Express the caps in the unit of the column being capped
        if repair_col == 'repair_duration':
            upper_cap = pd.Timedelta(days=upper_limit_days)
            lower_cap = pd.Timedelta(days=lower_limit_days)
        else:
            upper_cap = upper_limit_days
            lower_cap = lower_limit_days

        # Apply caps to the original dataframe, touching only flagged rows
        is_outlier_row = combined_rma_df.index.isin(outlier_indices)
        combined_rma_df.loc[is_outlier_row & (combined_rma_df[repair_col] > upper_cap), repair_col] = upper_cap
        combined_rma_df.loc[is_outlier_row & (combined_rma_df[repair_col] < lower_cap), repair_col] = lower_cap

        # Calculate statistics after handling outliers
        repair_duration_df = combined_rma_df.dropna(subset=[repair_col])
        if repair_col == 'repair_duration':
            repair_duration_days_after = repair_duration_df[repair_col].dt.total_seconds() / (24 * 60 * 60)
        else:
            repair_duration_days_after = repair_duration_df[repair_col]

        repair_mean_after = repair_duration_days_after.mean()
        repair_median_after = repair_duration_days_after.median()
        repair_max_after = repair_duration_days_after.max()
        repair_min_after = repair_duration_days_after.min()

        print(f"Repair duration after (in days): mean={repair_mean_after:.2f}, median={repair_median_after:.2f}, min={repair_min_after:.2f}, max={repair_max_after:.2f}")
        print(f"Number of outliers capped: {len(outlier_indices)}")
    else:
        print("No outliers found in repair duration data")

# Cap z-score outliers (|z| > 3) in FlightDuration at mean +/- 3 standard
# deviations and report statistics before and after.
print("\nHandling outliers in flight duration...")
flight_duration_df = merged_flight_data_df.dropna(subset=['FlightDuration'])

# Convert to hours for easier interpretation
flight_duration_hours = flight_duration_df['FlightDuration'].dt.total_seconds() / 3600

flight_mean_before = flight_duration_hours.mean()
flight_median_before = flight_duration_hours.median()
flight_max_before = flight_duration_hours.max()
flight_min_before = flight_duration_hours.min()

# These were computed but never reported; print them like the repair section does
print(f"Flight duration before (in hours): mean={flight_mean_before:.2f}, median={flight_median_before:.2f}, min={flight_min_before:.2f}, max={flight_max_before:.2f}")

# Apply outlier handling
z_scores = zscore(flight_duration_hours, nan_policy='omit')
abs_z_scores = np.abs(z_scores)
threshold = 3

# Identify which values to cap
outliers = abs_z_scores > threshold
outlier_indices = flight_duration_df.index[outliers]

if len(outlier_indices) > 0:
    # Calculate the cap limits in hours. scipy's zscore uses the population
    # standard deviation (ddof=0); pandas' std() defaults to ddof=1, which gave
    # slightly wider limits, so some flagged rows were counted as capped but
    # left unchanged. Use ddof=0 so both agree.
    mean_val = flight_duration_hours.mean()
    std_val = flight_duration_hours.std(ddof=0)
    upper_limit_hours = mean_val + threshold * std_val
    lower_limit_hours = max(0, mean_val - threshold * std_val)  # Ensure non-negative

    # Convert back to timedelta for capping
    upper_limit_td = pd.Timedelta(hours=upper_limit_hours)
    lower_limit_td = pd.Timedelta(hours=lower_limit_hours)

    # Apply the caps to the original dataframe, touching only flagged rows
    is_outlier_row = merged_flight_data_df.index.isin(outlier_indices)
    merged_flight_data_df.loc[is_outlier_row & (merged_flight_data_df['FlightDuration'] > upper_limit_td), 'FlightDuration'] = upper_limit_td
    merged_flight_data_df.loc[is_outlier_row & (merged_flight_data_df['FlightDuration'] < lower_limit_td), 'FlightDuration'] = lower_limit_td

# Recompute the statistics on the (possibly capped) durations
flight_duration_df = merged_flight_data_df.dropna(subset=['FlightDuration'])
flight_duration_hours_after = flight_duration_df['FlightDuration'].dt.total_seconds() / 3600
flight_mean_after = flight_duration_hours_after.mean()
flight_median_after = flight_duration_hours_after.median()
flight_max_after = flight_duration_hours_after.max()
flight_min_after = flight_duration_hours_after.min()

print(f"Flight duration after (in hours): mean={flight_mean_after:.2f}, median={flight_median_after:.2f}, min={flight_min_after:.2f}, max={flight_max_after:.2f}")
print(f"Number of outliers capped: {len(outlier_indices)}")

# Destination folder for the transformed parquet files (created if missing)
output_dir = os.path.join('private', 'data', 'transformed')
os.makedirs(output_dir, exist_ok=True)

# Data Fixes: store the customer column as plain Python strings
customer_as_text = combined_rma_df['customer'].astype(str)
combined_rma_df['customer'] = customer_as_text

# Check the variance of the int64/float64 columns in merged_flight_data_df;
# no column should have 0 variance...
print("\nChecking variance of all columns in merged_flight_data_df:")
numeric_columns = list(merged_flight_data_df.select_dtypes(include=['int64', 'float64']).columns)
print("merged_flight_data_df Numeric columns:", numeric_columns)
print("Variance of merged_flight_data_df numeric columns:")
flight_numeric_variance = merged_flight_data_df[numeric_columns].var()
print(flight_numeric_variance)

# Save the dataframes, one parquet file per frame
parquet_targets = {
    'merged_inflight_parts.parquet': merged_inflight_parts_df,
    'combined_rma.parquet': combined_rma_df,
    'merged_flight_data.parquet': merged_flight_data_df,
    'mtbf.parquet': mtbf_df,
}
for parquet_name, frame_to_save in parquet_targets.items():
    frame_to_save.to_parquet(os.path.join(output_dir, parquet_name))

print("\nDataframes saved as parquet files")