In [None]:
# Aerospace Component Failure Prediction Model - Data Extraction
#
# NOTE: This notebook contains anonymized/obfuscated data for public demonstration.
# Sensitive company information, customer details, and personal identifiers 
# have been removed or replaced with generic placeholders.
#
# Environment variables are used for database connections and API endpoints
# to protect sensitive infrastructure details.

import pandas as pd
import numpy as np
from dotenv import load_dotenv
import psycopg2
import os

# Load environment variables
load_dotenv()
DATABASE_HOST = os.getenv('DATABASE_HOST')
DATABASE_NAME = os.getenv('DATABASE_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT', '5432')

# Installed-equipment query: one row per equipment installation joined to its
# delivery, aircraft group and install-location records. Rows that are
# cancelled, have no status, or reference inactivated detail records are
# filtered out by the WHERE clause.
query = """
    SELECT 
        aircraft_groups.program_id,
        aircraft_groups.location_type,
        service_start_date,
        equipment_details.equipment_entry_id,
        entry_type,
        part_number,
        description,
        delivery_equipment_entry_id,
        delivery_quantity,
        group_type,
        group_number,
        group_name,
        asset_id,
        status,
        customer_id,
        install_location
    FROM equipment_installations
    LEFT JOIN equipment_details ON equipment_installations.id = equipment_details.equipment_entry_id
    LEFT JOIN delivery_records ON delivery_records.equipment_entry_id = equipment_installations.id
    LEFT JOIN delivery_allocation_detail ON delivery_allocation_detail.delivery_equipment_entry_id = delivery_records.id
    LEFT JOIN delivery ON delivery_records.delivery_id = delivery.id
    LEFT JOIN aircraft_groups ON delivery.shipset_id = aircraft_groups.id
    LEFT JOIN shipset_detail ON aircraft_groups.id = shipset_detail.shipset_id
    LEFT JOIN aircraft_registry ON aircraft_groups.id = aircraft_registry.id
    LEFT JOIN install_location_detail ON equipment_installations.install_location_id = install_location_detail.install_location_id
    WHERE entry_type = 'Equipment'
        AND group_type = 'aircraft_groups'
        AND status IS NOT NULL
        AND status <> 'Cancelled'
        AND equipment_details.inactive_date IS NULL
        AND delivery_allocation_detail.inactive_date IS NULL
        AND shipset_detail.inactive_date IS NULL
        AND install_location_detail.inactive_date IS NULL
    ORDER BY status
        """

# Database connection
connection = psycopg2.connect(
    host=DATABASE_HOST,
    database=DATABASE_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    port=DB_PORT
)

# Execute query and load directly to pandas. The finally clause guarantees the
# connection is released even when the query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Rename columns to match desired schema. Identity entries are kept so the
# mapping doubles as the list of expected output columns.
column_mapping = {
    'program_id': 'program_id',
    'location_type': 'location_type',
    'service_start_date': 'EntryIntoServiceDate',
    'equipment_entry_id': 'equipment_entry_id',
    'entry_type': 'EntryType',
    'part_number': 'PartNumber',
    'description': 'Description',
    'delivery_equipment_entry_id': 'delivery_equipment_entry_id',
    'delivery_quantity': 'DeliveryQuantity',
    'group_type': 'group_type',
    'group_number': 'group_number',
    'group_name': 'group_name',
    'asset_id': 'Tail',
    'status': 'status',
    'customer_id': 'customer_id',
    'install_location': 'InstallLocation'
}
df = df.rename(columns=column_mapping)

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'partslist.csv')
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather RMA Records

In [None]:
import logging
import msal
import requests
import os
from dotenv import load_dotenv
from datetime import datetime
import pandas as pd

# Configure logging
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# (connect, read) timeouts in seconds, so a stalled endpoint cannot hang the
# notebook indefinitely.
REQUEST_TIMEOUT = (10, 300)


def _fetch_all_pages(start_url, headers):
    """Collect the records from every page of a paginated API listing.

    Requests ``start_url`` and then keeps following the '@odata.nextLink'
    value of each response until a response carries none.

    Args:
        start_url: URL of the first page.
        headers: HTTP headers sent with every request.

    Returns:
        A list holding the items of each page's "value" array, in page order.

    Raises:
        Exception: any error raised while requesting or decoding a page
            (including ``requests.HTTPError`` for error statuses) is logged
            and re-raised unchanged.
    """
    collected = []
    next_link = start_url

    while next_link:
        try:
            logger.info(f"[{datetime.now()}] Fetching data from: {next_link}")
            response = requests.get(next_link, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()

            data = response.json()
            records = data.get("value", [])
            current_batch_size = len(records)
            collected.extend(records)

            logger.info(f"[{datetime.now()}] Batch size: {current_batch_size}")
            logger.info(f"[{datetime.now()}] Current batch records: {len(records)}")
            logger.info(f"[{datetime.now()}] Total records so far: {len(collected)}")

            # Get next page link if exists
            next_link = data.get('@odata.nextLink')
            if next_link:
                logger.info(f"[{datetime.now()}] Next link found: {next_link}")
            else:
                logger.info(f"[{datetime.now()}] No more pages to fetch")

        except Exception as e:
            logger.error(f"[{datetime.now()}] Error fetching data: {str(e)}", exc_info=True)
            raise

    return collected


# 1. Load environment variables
load_dotenv()

AUTH_ENDPOINT = os.getenv('AUTH_ENDPOINT')
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
SCOPE = os.getenv('SCOPE')
API_BASE_URL = os.getenv('API_BASE_URL')

# 2. Configure auth settings
config = {
    "authority": AUTH_ENDPOINT,
    "client_id": CLIENT_ID,
    "client_secret": CLIENT_SECRET,
    "scope": [SCOPE]
}

try:
    # Fail early with a clear message; otherwise an unset variable surfaces
    # later as an unrelated error (e.g. TypeError when building the URL).
    missing_vars = [
        name
        for name, value in (
            ('CLIENT_ID', CLIENT_ID),
            ('CLIENT_SECRET', CLIENT_SECRET),
            ('SCOPE', SCOPE),
            ('API_BASE_URL', API_BASE_URL),
        )
        if value is None
    ]
    if missing_vars:
        raise RuntimeError(
            f"Missing required environment variables: {', '.join(missing_vars)}"
        )

    # Client-credentials flow: the app authenticates as itself, no user involved
    app = msal.ConfidentialClientApplication(
        config["client_id"],
        authority=config["authority"],
        client_credential=config["client_secret"]
    )

    result = app.acquire_token_for_client(scopes=config["scope"])

    if "access_token" in result:
        access_token = result["access_token"]
    else:
        logger.error(f"Failed to acquire token. Error: {result.get('error')}")
        logger.error(f"Error description: {result.get('error_description')}")
        raise Exception("Failed to acquire token")

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # Get all RMA (service request) records
    all_records = _fetch_all_pages(API_BASE_URL + 'service_requests', headers)
    logger.info(f"Total records retrieved: {len(all_records)}")

    # Build the DataFrame once, after every page has been collected
    rma_df = pd.DataFrame(all_records)

    columns_to_keep = [
        'ServiceRequestId', 'DeliveryLocation_LocationId', 'Message', 'QuotationAmount',
        'ProjId', 'request_type', 'CallDueDateTime', 'QuotationAmountType',
        'Subject', 'ComplaintId',
        'ProjectIntegrationId', 'CauseId', 'Progress', 'SLARefDateTime',
        'ServiceObjectId', 'InvoiceName', 'CallActionDateTime', 'SolutionId',
        'RepairWithAccessories', 'CallStatusId', 'TaskDueDateTime',
        'Solution', 'InternalMsg', 'CertificateType', 'FinalDocDate',
        'WarrantyEndDate', 'OptionalRevision', 'VisualInspectionResult',
        'RepairLocation', 'ExpectedShipDate', 'IsBroadbandUnit',
        'portal_status', 'AircraftType', 'RemovedDate', 'FlightHours',
        'AircraftTailSerialNumber', 'ReturnReason', 'ActualShipDate',
        'FinalDocRevision', 'QuoteApprovedDate', 'repair_type',
        'AircraftTailNumber', 'QuoteSentDate', 'WarrantyStartDate',
        'UnitReceivedDate'
    ]

    # Raises KeyError if the API stops returning any of the listed fields
    rma_df = rma_df[columns_to_keep]

    # Get part data for each rma order
    all_so_records = _fetch_all_pages(API_BASE_URL + 'service_objects', headers)
    logger.info(f"Total records retrieved: {len(all_so_records)}")

    so_df = pd.DataFrame(all_so_records)

    # keeps only the columns we need - removing sensitive contact info
    so_columns_to_keep = [
        'ServiceObjectId',
        'Status',
        'Description',
        'ItemId',
        'MachineTypeId',
        'ProjId',
        'SerialId',
        'WarrantyStartDate_Cust',
        'RepairStatusRefRecId',
        'RepairStatus',
        'RepairStatusRefTableId'
    ]

    so_df = so_df[so_columns_to_keep]

    # Merge RMA orders and Service Objects
    # Get overlapping columns
    common_cols = set(rma_df.columns) & set(so_df.columns)
    print("Common columns:", common_cols)

    # Left merge keeps every RMA row. Non-key columns present on both sides
    # (e.g. 'ProjId') are disambiguated with the suffixes below.
    merged_df = pd.merge(
        rma_df,
        so_df,
        on='ServiceObjectId',
        how='left',
        suffixes=('_rma', '_service')
    )

    # Drop the service-object copy of each clashing column; the RMA copy stays
    # under its '_rma'-suffixed name.
    duplicate_cols = [col for col in merged_df.columns if col.endswith('_service')]
    merged_df = merged_df.drop(columns=duplicate_cols)

    # Save to CSV first, so the path reported below is the file just written
    data_dir = os.path.join('private', 'data', 'raw')
    os.makedirs(data_dir, exist_ok=True)
    output_path = os.path.join(data_dir, 'merged_rmaorders.csv')
    merged_df.to_csv(output_path, index=False)

    # Print the column names and the row count
    print("\nColumns in the dataset:")
    print(merged_df.columns.tolist())
    print(f"Rows: {len(merged_df)}")
    print(f"\nDataset saved to {output_path}")

except Exception as e:
    logger.error(f"An error occurred: {str(e)}", exc_info=True)
    raise

# Gather Product information

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import psycopg2
import os

# Load environment variables
load_dotenv()
DATABASE_HOST = os.getenv('DATABASE_HOST')
DATABASE_NAME = os.getenv('DATABASE_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT', '5432')

# Product master data: every product row LEFT JOINed to its detail tables on
# part_number.
# NOTE(review): productbase is joined but no column is selected from it —
# confirm whether the join is still needed.
query = """
SELECT 
    -- Basic identification
    product.part_number,
    productinformation.productname,
    product.producttype,
    
    -- Product categorization
    productinformation.productfamily,
    productpcddetails.productgroup,
    productinformation.definitionlevel,
    productinformation.lrucategoryclass,
    productpcddetails.conformitydescription,
    
    -- Manufacturing information
    productinformation.manufacturingtype,
    productinformation.repairtype,
    productinformation.pmastatus,
    
    -- Production status
    productinformation.newdesignrecommendation,
    productinformation.useequipmentfamily,
    
    -- Technical specifications
    productinputpower.operatingmode,
    productinputpower.maximuminputpower,
    productlruspecifics.internalstorage,
    productlruspecifics.resolution,
    productlruspecifics.frontpanelusb,
    productlruspecifics.oneethernetport,
    productlruspecifics.functionalspec,
    
    -- Lifecycle information
    productlifecyclephase.lifecyclephase,
    productmilestone.milestone,
    productmilestone.milestonedate,
    productmilestone.milestonestatus,
    productmilestone.actualdate,
    productmilestone.notes,
    
    -- Manager information
    productinformation.product_manager,
    
    -- Performance/reliability metrics
    productreliability.target_reliability,
    productpower.engineering AS power_engineering,
    productpower.marketing AS power_marketing,
    productpower.contractual AS power_contractual,
    productweight.engineering AS weight_engineering,
    productweight.marketing AS weight_marketing,
    productweight.contractual AS weight_contractual
FROM product
LEFT JOIN productbase ON product.part_number = productbase.part_number
LEFT JOIN productinformation ON product.part_number = productinformation.part_number
LEFT JOIN productinputpower ON product.part_number = productinputpower.part_number
LEFT JOIN productlifecyclephase ON product.part_number = productlifecyclephase.part_number
LEFT JOIN productlruspecifics ON product.part_number = productlruspecifics.part_number
LEFT JOIN productmilestone ON product.part_number = productmilestone.part_number
LEFT JOIN productpcddetails ON product.part_number = productpcddetails.part_number
LEFT JOIN productpower ON product.part_number = productpower.part_number 
LEFT JOIN productreliability ON product.part_number = productreliability.part_number 
LEFT JOIN productweight ON product.part_number = productweight.part_number
"""

# Database connection
connection = psycopg2.connect(
    host=DATABASE_HOST,
    database=DATABASE_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    port=DB_PORT
)

# Execute query and load directly to pandas. The finally clause guarantees the
# connection is released even when the query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

# NOTE(review): the later "Gather Parts Information" cell writes to this same
# path (without the pmastatus column) and overwrites this file when the
# notebook runs top to bottom — confirm which output is intended.
output_path = os.path.join(data_dir, 'productinfo.csv')
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Flight Events Data

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Load ENV Variables
load_dotenv()
SQLSERVER_HOST = os.getenv('SQLSERVER_HOST')
SQLSERVER_DB = os.getenv('SQLSERVER_DB')
SQLSERVER_USER = os.getenv('SQLSERVER_USER')
SQLSERVER_PASSWORD = os.getenv('SQLSERVER_PASSWORD')
SQLSERVER_PORT = os.getenv('SQLSERVER_PORT', '1433')

# Fail early with a clear message: an unset variable would otherwise be
# formatted into the connection string as the literal text "None".
missing_vars = [
    name
    for name, value in (
        ('SQLSERVER_HOST', SQLSERVER_HOST),
        ('SQLSERVER_DB', SQLSERVER_DB),
        ('SQLSERVER_USER', SQLSERVER_USER),
        ('SQLSERVER_PASSWORD', SQLSERVER_PASSWORD),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )

# Flight reset events for production customers, excluding demo/training airlines
query = """
    SELECT 
        FlightResetsID,
        FlightID,
        Airline,
        DepartureCode,
        ArrivalCode,
        FlightNumber,
        asset_id,
        FlightStartTime,
        FlightEndTime,
        FlightDuration,
        Class,
        AircraftType,
        SeatResets,
        RawResets,
        Processed
    FROM [dbo].[flight_resets]
    WHERE customer_type = 'Production' 
    AND Airline NOT IN ('DEMO', 'TRAINING')
"""

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={SQLSERVER_HOST},{SQLSERVER_PORT};DATABASE={SQLSERVER_DB};UID={SQLSERVER_USER};PWD={SQLSERVER_PASSWORD}'
)

# The finally clause guarantees the connection is released even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'flightresets.csv')
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather MTBF Data

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Load ENV Variables
load_dotenv()
SQLSERVER_HOST = os.getenv('SQLSERVER_HOST')
SQLSERVER_DB = os.getenv('SQLSERVER_DB')
SQLSERVER_USER = os.getenv('SQLSERVER_USER')
SQLSERVER_PASSWORD = os.getenv('SQLSERVER_PASSWORD')
SQLSERVER_PORT = os.getenv('SQLSERVER_PORT', '1433')

# Fail early with a clear message: an unset variable would otherwise be
# formatted into the connection string as the literal text "None".
missing_vars = [
    name
    for name, value in (
        ('SQLSERVER_HOST', SQLSERVER_HOST),
        ('SQLSERVER_DB', SQLSERVER_DB),
        ('SQLSERVER_USER', SQLSERVER_USER),
        ('SQLSERVER_PASSWORD', SQLSERVER_PASSWORD),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )

# MTBF rows (hours, failures, NFF counts per airline/part/month) for
# production customers
query = """
    SELECT 
        MTBFID,
        Airline,
        PartNumber,
        PartGroup,
        DetailPartGroup,
        Description,
        Month,
        PoweredOnHours,
        FlightHours,
        Failures,
        NFF,
        target_reliability,
        InsertDate,
        UpdateDate,
        UpdateCount
    FROM [dbo].[mtbf_data]
    WHERE customer_type = 'Production'
    """

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={SQLSERVER_HOST},{SQLSERVER_PORT};DATABASE={SQLSERVER_DB};UID={SQLSERVER_USER};PWD={SQLSERVER_PASSWORD}'
)

# The finally clause guarantees the connection is released even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'mtbf.csv')
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")




# Gather Flight Data

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Load ENV Variables
load_dotenv()
SQLSERVER_HOST = os.getenv('SQLSERVER_HOST')
SQLSERVER_DB = os.getenv('SQLSERVER_DB')
SQLSERVER_USER = os.getenv('SQLSERVER_USER')
SQLSERVER_PASSWORD = os.getenv('SQLSERVER_PASSWORD')
SQLSERVER_PORT = os.getenv('SQLSERVER_PORT', '1433')

# Fail early with a clear message: an unset variable would otherwise be
# formatted into the connection string as the literal text "None".
missing_vars = [
    name
    for name, value in (
        ('SQLSERVER_HOST', SQLSERVER_HOST),
        ('SQLSERVER_DB', SQLSERVER_DB),
        ('SQLSERVER_USER', SQLSERVER_USER),
        ('SQLSERVER_PASSWORD', SQLSERVER_PASSWORD),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )

# All flights, unfiltered. InsertDate is returned twice: once under its own
# name and once aliased as FileCreatedTime.
query = """ 
         SELECT
         FlightID,
         Airline,
         DepartureCode,
          ArrivalCode,
         FlightStartTime,
         FlightEndTime,
         asset_id,
         FlightNumber,
         AircraftType,
         InsertDate AS FileCreatedTime,
         InsertDate
 FROM [dbo].[Flights]
"""

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={SQLSERVER_HOST},{SQLSERVER_PORT};DATABASE={SQLSERVER_DB};UID={SQLSERVER_USER};PWD={SQLSERVER_PASSWORD}'
)

# The finally clause guarantees the connection is released even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'flights.csv')
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Passenger Count per flight

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Load ENV Variables
load_dotenv()
SQLSERVER_HOST = os.getenv('SQLSERVER_HOST')
SQLSERVER_DB = os.getenv('SQLSERVER_DB')
SQLSERVER_USER = os.getenv('SQLSERVER_USER')
SQLSERVER_PASSWORD = os.getenv('SQLSERVER_PASSWORD')
SQLSERVER_PORT = os.getenv('SQLSERVER_PORT', '1433')

# Fail early with a clear message: an unset variable would otherwise be
# formatted into the connection string as the literal text "None".
missing_vars = [
    name
    for name, value in (
        ('SQLSERVER_HOST', SQLSERVER_HOST),
        ('SQLSERVER_DB', SQLSERVER_DB),
        ('SQLSERVER_USER', SQLSERVER_USER),
        ('SQLSERVER_PASSWORD', SQLSERVER_PASSWORD),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )

# Per-flight passenger counts; rows without an asset_id are skipped
query = """
    SELECT 
        AIMSID,
        FlightID,
        asset_id,
        FlightNumber,
        DepartureCode,
        ArrivalCode,
        FlightStartTime,
        FlightEndTime,
        BusinessClass,
        EconomyClass,
        TotalPassengers,
        InsertDate,
        UpdatedPaxActivity,
        UpdatedPerPassengerRevenue
    FROM [dbo].[PassengerCounts]
    WHERE asset_id IS NOT NULL
    """

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={SQLSERVER_HOST},{SQLSERVER_PORT};DATABASE={SQLSERVER_DB};UID={SQLSERVER_USER};PWD={SQLSERVER_PASSWORD}'
)

# The finally clause guarantees the connection is released even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'passenger_count.csv')
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Historical RMA Records

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Load ENV Variables
load_dotenv()
SQLSERVER_HOST = os.getenv('SQLSERVER_HOST')
SQLSERVER_DB = os.getenv('SQLSERVER_DB')
SQLSERVER_USER = os.getenv('SQLSERVER_USER')
SQLSERVER_PASSWORD = os.getenv('SQLSERVER_PASSWORD')
SQLSERVER_PORT = os.getenv('SQLSERVER_PORT', '1433')

# Fail early with a clear message: an unset variable would otherwise be
# formatted into the connection string as the literal text "None".
missing_vars = [
    name
    for name, value in (
        ('SQLSERVER_HOST', SQLSERVER_HOST),
        ('SQLSERVER_DB', SQLSERVER_DB),
        ('SQLSERVER_USER', SQLSERVER_USER),
        ('SQLSERVER_PASSWORD', SQLSERVER_PASSWORD),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_vars)}"
    )

# Historical repair records that carry a part number, newest received first.
# The table is addressed by its fully qualified name (EXAMPLE_DWH.dbo), not
# through the DATABASE named in the connection string.
query = """
    SELECT 
        Customer,
        RMA,
        PN,
        SN,
        StatusDescription,
        PartDescription,
        LRUName,
        ReceivedDate,
        ReceivedAtPartner,
        FaultCode,
        ShipDate,
        ServiceBulletinInfo,
        ServiceBulletinNumber,
        ServiceBulletin,
        AlertCategoryCode,
        InsertDate
    FROM EXAMPLE_DWH.dbo.repair_records
    WHERE PN IS NOT NULL
    ORDER BY ReceivedDate DESC
"""

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={SQLSERVER_HOST},{SQLSERVER_PORT};DATABASE={SQLSERVER_DB};UID={SQLSERVER_USER};PWD={SQLSERVER_PASSWORD}'
)

# The finally clause guarantees the connection is released even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

# The "hist_" filename prefix marks this as the historical RMA extract
output_path = os.path.join(data_dir, 'hist_repair_rma.csv')

# Save to CSV
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Parts Information

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import psycopg2
import os

# Load environment variables
load_dotenv()
DATABASE_HOST = os.getenv('DATABASE_HOST')
DATABASE_NAME = os.getenv('DATABASE_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT', '5432')

# Product master data: every product row LEFT JOINed to its detail tables on
# part_number.
# NOTE(review): productbase is joined but no column is selected from it —
# confirm whether the join is still needed.
query = """
SELECT 
    -- Basic identification
    product.part_number,
    productinformation.productname,
    product.producttype,
    
    -- Product categorization
    productinformation.productfamily,
    productpcddetails.productgroup,
    productinformation.definitionlevel,
    productinformation.lrucategoryclass,
    productpcddetails.conformitydescription,
    
    -- Manufacturing information
    productinformation.manufacturingtype,
    productinformation.repairtype,
    
    -- Production status
    productinformation.newdesignrecommendation,
    productinformation.useequipmentfamily,
    
    -- Technical specifications
    productinputpower.operatingmode,
    productinputpower.maximuminputpower,
    productlruspecifics.internalstorage,
    productlruspecifics.resolution,
    productlruspecifics.frontpanelusb,
    productlruspecifics.oneethernetport,
    productlruspecifics.functionalspec,
    
    -- Lifecycle information
    productlifecyclephase.lifecyclephase,
    productmilestone.milestone,
    productmilestone.milestonedate,
    productmilestone.milestonestatus,
    productmilestone.actualdate,
    productmilestone.notes,
    
    -- Manager information
    productinformation.product_manager,
    
    -- Performance/reliability metrics
    productreliability.target_reliability,
    productpower.engineering AS power_engineering,
    productpower.marketing AS power_marketing,
    productpower.contractual AS power_contractual,
    productweight.engineering AS weight_engineering,
    productweight.marketing AS weight_marketing,
    productweight.contractual AS weight_contractual
FROM product
LEFT JOIN productbase ON product.part_number = productbase.part_number
LEFT JOIN productinformation ON product.part_number = productinformation.part_number
LEFT JOIN productinputpower ON product.part_number = productinputpower.part_number
LEFT JOIN productlifecyclephase ON product.part_number = productlifecyclephase.part_number
LEFT JOIN productlruspecifics ON product.part_number = productlruspecifics.part_number
LEFT JOIN productmilestone ON product.part_number = productmilestone.part_number
LEFT JOIN productpcddetails ON product.part_number = productpcddetails.part_number
LEFT JOIN productpower ON product.part_number = productpower.part_number 
LEFT JOIN productreliability ON product.part_number = productreliability.part_number 
LEFT JOIN productweight ON product.part_number = productweight.part_number
"""

# Database connection
connection = psycopg2.connect(
    host=DATABASE_HOST,
    database=DATABASE_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    port=DB_PORT
)

# Execute query and load directly to pandas. The finally clause guarantees the
# connection is released even when the query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create directory structure
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

# NOTE(review): the earlier "Gather Product information" cell writes to this
# same path with an additional pmastatus column; this cell overwrites that
# file when the notebook runs top to bottom — confirm which output is intended.
output_path = os.path.join(data_dir, 'productinfo.csv')
df.to_csv(output_path, index=False)

# Print the column names and the row count
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")