# Gather Parts List

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import psycopg2
import os

# Export the shipset equipment parts list from PostgreSQL to
# private/data/raw/partslist.csv.

# Load the PostgreSQL connection settings from the .env file / environment
load_dotenv()
DB_HOST = os.getenv('DB_HOST')
DB_NAME = os.getenv('DB_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT', '5432')

# One row per equipment entry / delivery allocation / shipset combination.
# Only 'Equipment' entries on 'Shipset' shipsets are kept, cancelled or
# status-less rows are dropped, and detail rows with an inactivedate are
# excluded.
# NOTE(review): programtailnumbereis is joined on shipset.id =
# programtailnumbereis.id (its own id column) — confirm this is the intended
# key and not a shipset foreign key.
query = """
    SELECT 
        shipset.programcode,
        shipset.installlocationtype,
        eisdate,
        equipmententrydetail.equipmententryid,
        entrytype,
        partnumber,
        description,
        deliveryequipmententryid,
        deliveryquantity,
        shipsettype,
        shipsetnumber,
        shipsetname,
        tailnumber,
        servicestatus,
        ownercompanyid,
        installlocation
    FROM equipmententry
    LEFT JOIN equipmententrydetail ON equipmententry.id = equipmententrydetail.equipmententryid
    LEFT JOIN deliveryequipmententry ON deliveryequipmententry.equipmententryid = equipmententry.id
    LEFT JOIN deliveryequipmentallocationdetail ON deliveryequipmentallocationdetail.deliveryequipmententryid = deliveryequipmententry.id
    LEFT JOIN delivery ON deliveryequipmententry.deliveryid = delivery.id
    LEFT JOIN shipset ON delivery.shipsetid = shipset.id
    LEFT JOIN shipsetdetail ON shipset.id = shipsetdetail.shipsetid
    LEFT JOIN programtailnumbereis ON shipset.id = programtailnumbereis.id
    LEFT JOIN installlocationdetail ON equipmententry.installlocationid = installlocationdetail.installlocationid
    WHERE entrytype = 'Equipment'
        AND shipsettype = 'Shipset'
        AND servicestatus IS NOT NULL
        AND servicestatus <> 'Cancelled'
        AND equipmententrydetail.inactivedate IS NULL
        AND deliveryequipmentallocationdetail.inactivedate IS NULL
        AND shipsetdetail.inactivedate IS NULL
        AND installlocationdetail.inactivedate IS NULL
    ORDER BY servicestatus
        """

# Database connection
connection = psycopg2.connect(
    host=DB_HOST,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    port=DB_PORT
)

# Execute query and load directly to pandas. The finally clause releases the
# connection even when the query fails, so a failed run does not leave a
# session open on the server until the kernel is restarted.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Rename the lowercase database columns to the PascalCase names used in the
# exported CSV (note: tailnumber becomes 'Tail', eisdate becomes
# 'EntryIntoServiceDate')
column_mapping = {
    'programcode': 'ProgramCode',
    'installlocationtype': 'InstallLocationType',
    'eisdate': 'EntryIntoServiceDate',
    'equipmententryid': 'EquipmentEntryId',
    'entrytype': 'EntryType',
    'partnumber': 'PartNumber',
    'description': 'Description',
    'deliveryequipmententryid': 'DeliveryEquipmentEntryId',
    'deliveryquantity': 'DeliveryQuantity',
    'shipsettype': 'ShipsetType',
    'shipsetnumber': 'ShipsetNumber',
    'shipsetname': 'ShipsetName',
    'tailnumber': 'Tail',
    'servicestatus': 'ServiceStatus',
    'ownercompanyid': 'OwnerCompanyId',
    'installlocation': 'InstallLocation'
}
df = df.rename(columns=column_mapping)

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'partslist.csv')
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather RMA Records

In [None]:
import logging
import msal
import requests
import os
from dotenv import load_dotenv
from datetime import datetime
import pandas as pd

# Download RMA service calls and their service objects from the Dynamics
# OData API, merge them on ServiceObjectId, and write the result to
# private/data/raw/merged_rmaorders.csv.

# Configure logging (only ERROR and above is emitted; the info-level progress
# messages below become visible when the level is lowered)
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# 1. Load environment variables
load_dotenv()

CLOUD_INSTANCE = os.getenv('CLOUD_INSTANCE')
CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')
SCOPE = os.getenv('SCOPE')
DYNAMICS_URL = os.getenv('DYNAMICS_URL')

# 2. Configure auth settings
config = {
    "authority": CLOUD_INSTANCE,
    "client_id": CLIENT_ID,
    "client_secret": CLIENT_SECRET,
    "scope": [SCOPE]
}

# (connect, read) timeouts in seconds for each page request. Without a timeout
# requests waits indefinitely on a stalled connection and the cell never
# finishes.
REQUEST_TIMEOUT = (10, 300)


def _fetch_all_pages(start_url, request_headers):
    """Return every record from a paged OData collection.

    Starting at ``start_url``, each response's ``value`` list is appended to
    the result and the ``@odata.nextLink`` URL is followed until a response
    no longer carries one.

    Args:
        start_url: URL of the first page of the collection.
        request_headers: HTTP headers sent with every page request.

    Returns:
        A list with the records of all pages, in the order received.

    Raises:
        Exception: any error raised while requesting or decoding a page is
            logged and re-raised unchanged.
    """
    collected = []
    page_url = start_url

    while page_url:
        try:
            logger.info(f"[{datetime.now()}] Fetching data from: {page_url}")
            response = requests.get(
                page_url,
                headers=request_headers,
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            payload = response.json()
            batch = payload.get("value", [])
            collected.extend(batch)

            logger.info(f"[{datetime.now()}] Batch size: {len(batch)}")
            logger.info(f"[{datetime.now()}] Current batch records: {len(batch)}")
            logger.info(f"[{datetime.now()}] Total records so far: {len(collected)}")

            # A missing nextLink ends the loop
            page_url = payload.get('@odata.nextLink')
            if page_url:
                logger.info(f"[{datetime.now()}] Next link found: {page_url}")
            else:
                logger.info(f"[{datetime.now()}] No more pages to fetch")

        except Exception as e:
            logger.error(f"[{datetime.now()}] Error fetching data: {str(e)}", exc_info=True)
            raise

    return collected


try:
    # 3. Acquire an app-only token via the client-credentials flow
    app = msal.ConfidentialClientApplication(
        config["client_id"],
        authority=config["authority"],
        client_credential=config["client_secret"]
    )
    result = app.acquire_token_for_client(scopes=config["scope"])

    if "access_token" not in result:
        logger.error(f"Failed to acquire token. Error: {result.get('error')}")
        logger.error(f"Error description: {result.get('error_description')}")
        raise RuntimeError("Failed to acquire token")
    access_token = result["access_token"]

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # 4. Download all RMA service calls. The DataFrame is built once, after
    # the last page, instead of being rebuilt on every page.
    all_records = _fetch_all_pages(DYNAMICS_URL + 'SvcCallTables', headers)
    logger.info(f"Total records retrieved: {len(all_records)}")

    rma_df = pd.DataFrame(all_records)

    # Keep only the columns we need (raises KeyError if the API stops
    # returning one of them)
    columns_to_keep = [
        'SvcCallId', 'DeliveryLocation_LocationId', 'Message', 'QuotationAmount',
        'ProjId', 'CallTypeId', 'CallDueDateTime', 'QuotationAmountType',
        'Subject', 'HcmWorker_PersonnelNumber', 'SvcCallInitiator', 'ComplaintId',
        'ProjectIntegrationId', 'CauseId', 'Progress', 'SLARefDateTime',
        'ServiceObjectId', 'InvoiceName', 'CallActionDateTime', 'SolutionId',
        'RepairWithAccessories', 'CallStatusId', 'CustAccount', 'TaskDueDateTime',
        'Solution', 'InternalMsg', 'HSOCertificateType', 'HSOFinalATPDate',
        'HSOWarrantyEndDate', 'HSOOptionalRevision', 'HSOVisualInspectionResult',
        'HSOWorkshopLocation', 'HSOExpectedShipDate', 'HSOIsBroadbandUnit',
        'HSOPortalStatusRMA', 'HSOAirplaneType', 'HSORemovedDate', 'HSOFlightHours',
        'HSOAirPlaneTailSerialNumber', 'HSOReturnReason', 'HSOActualShipDate',
        'HSOFinalATPDocRevision', 'HSOQuoteApprovedDate', 'HSORepairDirection',
        'HSOAirPlaneTailNumber', 'HSOQuoteSentDate', 'HSOWarrantyStartDate',
        'HSOUnitReceivedDate'
    ]
    rma_df = rma_df[columns_to_keep]

    # 5. Download the service objects that carry the part data for each RMA
    all_so_records = _fetch_all_pages(
        DYNAMICS_URL + 'DYSCoreServiceObjectTableCollection',
        headers,
    )
    logger.info(f"Total records retrieved: {len(all_so_records)}")

    so_df = pd.DataFrame(all_so_records)

    # Keep only the columns we need
    so_columns_to_keep = [
        'ServiceObjectId',
        'Status',
        'ContactPersonPhone',
        'CustAccountUser',
        'CustAccountOwner',
        'Description',
        'ItemId',
        'MachineTypeId',
        'ContactPersonEmail',
        'ProjId',
        'InventSerialId',
        'WarrantyStartDate_Cust',
        'WorkshopRepairStatusRefRecId',
        'WorkshopRepairStatus',
        'CustAccount',
        'WorkshopRepairStatusRefTableId'
    ]
    so_df = so_df[so_columns_to_keep]

    # 6. Merge RMA orders with their service objects
    common_cols = set(rma_df.columns) & set(so_df.columns)
    print("Common columns:", common_cols)

    # Left join keeps every RMA row. Columns present on both sides (other
    # than the key) receive the '_rma' / '_service' suffixes.
    merged_df = pd.merge(
        rma_df,
        so_df,
        on='ServiceObjectId',
        how='left',
        suffixes=('_rma', '_service')
    )

    # Drop the service-object copy of each overlapping column; the RMA copy
    # stays in the output under its '_rma'-suffixed name.
    duplicate_cols = [col for col in merged_df.columns if col.endswith('_service')]
    merged_df = merged_df.drop(columns=duplicate_cols)

    # 7. Save to CSV
    data_dir = os.path.join('private', 'data', 'raw')
    os.makedirs(data_dir, exist_ok=True)
    output_path = os.path.join(data_dir, 'merged_rmaorders.csv')
    merged_df.to_csv(output_path, index=False)

    # Report only after the file is written, so output_path is this cell's
    # own path rather than a leftover from a previously executed cell.
    print("\nColumns in the dataset:")
    print(merged_df.columns.tolist())
    print(f"Rows: {len(merged_df)}")
    print(f"\nDataset saved to {output_path}")

except Exception as e:
    logger.error(f"An error occurred: {str(e)}", exc_info=True)
    raise

# Gather Product Information

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import psycopg2
import os

# Export product master data from PostgreSQL to
# private/data/raw/productinfo.csv.
# NOTE(review): the last cell of this notebook ("Gather Parts Information")
# writes to the same productinfo.csv path, so whichever cell runs last
# determines the file's content — confirm which one is intended.

# Load the PostgreSQL connection settings from the .env file / environment
load_dotenv()
DB_HOST = os.getenv('DB_HOST')
DB_NAME = os.getenv('DB_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT', '5432')

# Every product row, left-joined by partnumber to its information,
# power, lifecycle, milestone, reliability and weight tables.
# NOTE(review): productbase is joined but none of its columns are selected;
# if it holds several rows per partnumber it multiplies the output rows —
# confirm the join is needed.
query = """
SELECT 
    -- Basic identification
    product.partnumber,
    productinformation.productname,
    product.producttype,
    
    -- Product categorization
    productinformation.productfamily,
    productpcddetails.productgroup,
    productinformation.definitionlevel,
    productinformation.lrucategoryclass,
    productpcddetails.conformitydescription,
    
    -- Manufacturing information
    productinformation.manufacturingtype,
    productinformation.repairtype,
    productinformation.pmastatus,
    
    -- Production status
    productinformation.newdesignrecommendation,
    productinformation.useequipmentfamily,
    
    -- Technical specifications
    productinputpower.operatingmode,
    productinputpower.maximuminputpower,
    productlruspecifics.internalstorage,
    productlruspecifics.resolution,
    productlruspecifics.frontpanelusb,
    productlruspecifics.oneethernetport,
    productlruspecifics.bajfunctionality,
    
    -- Lifecycle information
    productlifecyclephase.lifecyclephase,
    productmilestone.milestone,
    productmilestone.milestonedate,
    productmilestone.milestonestatus,
    productmilestone.actualdate,
    productmilestone.notes,
    
    -- Manager information
    productinformation.linemanager,
    
    -- Performance/reliability metrics
    productreliability.contractualmtbf,
    productpower.engineering AS power_engineering,
    productpower.marketing AS power_marketing,
    productpower.contractual AS power_contractual,
    productweight.engineering AS weight_engineering,
    productweight.marketing AS weight_marketing,
    productweight.contractual AS weight_contractual
FROM product
LEFT JOIN productbase ON product.partnumber = productbase.partnumber
LEFT JOIN productinformation ON product.partnumber = productinformation.partnumber
LEFT JOIN productinputpower ON product.partnumber = productinputpower.partnumber
LEFT JOIN productlifecyclephase ON product.partnumber = productlifecyclephase.partnumber
LEFT JOIN productlruspecifics ON product.partnumber = productlruspecifics.partnumber
LEFT JOIN productmilestone ON product.partnumber = productmilestone.partnumber
LEFT JOIN productpcddetails ON product.partnumber = productpcddetails.partnumber
LEFT JOIN productpower ON product.partnumber = productpower.partnumber 
LEFT JOIN productreliability ON product.partnumber = productreliability.partnumber 
LEFT JOIN productweight ON product.partnumber = productweight.partnumber
"""

# Database connection
connection = psycopg2.connect(
    host=DB_HOST,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    port=DB_PORT
)

# Execute query and load directly to pandas. The finally clause releases the
# connection even when the query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'productinfo.csv')
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Resets Data

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Export flight reset records from SQL Server to
# private/data/raw/flightresets.csv.

# Load the SQL Server connection settings from the .env file / environment
load_dotenv()
RA_DB_HOST = os.getenv('RA_DB_HOST')
RA_DB_NAME = os.getenv('RA_DB_NAME')
RA_DB_USER = os.getenv('RA_DB_USER')
RA_DB_PASSWORD = os.getenv('RA_DB_PASSWORD')
RA_DB_PORT = os.getenv('RA_DB_PORT', '1433')

# All flight reset rows except those whose Airline is one of the
# placeholder values 'RAVE', 'TEST' or 'PUBLIC'
query = """
    SELECT 
        FlightResetsID,
        FlightID,
        Airline,
        DepartureCode,
        ArrivalCode,
        FlightNumber,
        TailNumber,
        FlightStartTime,
        FlightEndTime,
        FlightDuration,
        Class,
        AircraftType,
        SeatResets,
        RawResets,
        Processed
    FROM [dbo].[FlightResets]
    WHERE Airline != 'RAVE' 
    AND Airline != 'TEST' 
    AND Airline != 'PUBLIC'
"""

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={RA_DB_HOST},{RA_DB_PORT};DATABASE={RA_DB_NAME};UID={RA_DB_USER};PWD={RA_DB_PASSWORD}'
)

# Run the query; the finally clause releases the connection even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'flightresets.csv')
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather MTBF Data

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Export MTBF records from SQL Server to private/data/raw/mtbf.csv.

# Load the SQL Server connection settings from the .env file / environment
load_dotenv()
RA_DB_HOST = os.getenv('RA_DB_HOST')
RA_DB_NAME = os.getenv('RA_DB_NAME')
RA_DB_USER = os.getenv('RA_DB_USER')
RA_DB_PASSWORD = os.getenv('RA_DB_PASSWORD')
RA_DB_PORT = os.getenv('RA_DB_PORT', '1433')

# All MTBF rows except those whose Airline is 'RAVE'
query = """
    SELECT 
        MTBFID,
        Airline,
        PartNumber,
        PartGroup,
        DetailPartGroup,
        Description,
        Month,
        PoweredOnHours,
        FlightHours,
        Failures,
        NFF,
        ContractualMTBF,
        InsertDate,
        UpdateDate,
        UpdateCount
    FROM [dbo].[MTBF]
    WHERE Airline != 'RAVE'
    """

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={RA_DB_HOST},{RA_DB_PORT};DATABASE={RA_DB_NAME};UID={RA_DB_USER};PWD={RA_DB_PASSWORD}'
)

# Run the query; the finally clause releases the connection even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'mtbf.csv')
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")




# Gather Flight Data

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Export flight records from SQL Server to private/data/raw/flights.csv.

# Load the SQL Server connection settings from the .env file / environment
load_dotenv()
RA_DB_HOST = os.getenv('RA_DB_HOST')
RA_DB_NAME = os.getenv('RA_DB_NAME')
RA_DB_USER = os.getenv('RA_DB_USER')
RA_DB_PASSWORD = os.getenv('RA_DB_PASSWORD')
RA_DB_PORT = os.getenv('RA_DB_PORT', '1433')

# Every row of the Flight table. InsertDate is selected twice: once under
# the alias FileCreatedTime and once under its own name.
query = """ 
         SELECT
         FlightID,
         Airline,
         DepartureCode,
          ArrivalCode,
         FlightStartTime,
         FlightEndTime,
         TailNumber,
         FlightNumber,
         AircraftType,
         InsertDate AS FileCreatedTime,
         InsertDate
 FROM [dbo].[Flight]
"""

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={RA_DB_HOST},{RA_DB_PORT};DATABASE={RA_DB_NAME};UID={RA_DB_USER};PWD={RA_DB_PASSWORD}'
)

# Run the query; the finally clause releases the connection even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'flights.csv')
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Passenger Count per Flight

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Export per-flight passenger counts from SQL Server to
# private/data/raw/passenger_count.csv.

# Load the SQL Server connection settings from the .env file / environment
load_dotenv()
RA_DB_HOST = os.getenv('RA_DB_HOST')
RA_DB_NAME = os.getenv('RA_DB_NAME')
RA_DB_USER = os.getenv('RA_DB_USER')
RA_DB_PASSWORD = os.getenv('RA_DB_PASSWORD')
RA_DB_PORT = os.getenv('RA_DB_PORT', '1433')

# Passenger-count rows that have a tail number
query = """
    SELECT 
        AIMSID,
        FlightID,
        TailNumber,
        FlightNumber,
        DepartureCode,
        ArrivalCode,
        FlightStartTime,
        FlightEndTime,
        BusinessClass,
        EconomyClass,
        TotalPassengers,
        InsertDate,
        UpdatedPaxActivity,
        UpdatedPerPassengerRevenue
    FROM [dbo].[ActualPassengerCounts]
    WHERE TailNumber IS NOT NULL
    """

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={RA_DB_HOST},{RA_DB_PORT};DATABASE={RA_DB_NAME};UID={RA_DB_USER};PWD={RA_DB_PASSWORD}'
)

# Run the query; the finally clause releases the connection even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'passenger_count.csv')
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Older RMA Records

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import pyodbc
import os

# Export the older repair RMA records from SQL Server to
# private/data/raw/hist_repair_rma.csv.

# Load the SQL Server connection settings from the .env file / environment
load_dotenv()
RA_DB_HOST = os.getenv('RA_DB_HOST')
RA_DB_NAME = os.getenv('RA_DB_NAME')
RA_DB_USER = os.getenv('RA_DB_USER')
RA_DB_PASSWORD = os.getenv('RA_DB_PASSWORD')
RA_DB_PORT = os.getenv('RA_DB_PORT', '1433')

# Repair RMA rows that have a part number, most recently received first.
# The table is addressed by its three-part name (RCS_DWH.dbo.RepairRMA), so
# it is read from RCS_DWH regardless of the database named in RA_DB_NAME.
query = """
    SELECT 
        Customer,
        RMA,
        PN,
        SN,
        StatusDescription,
        PartDescription,
        LRUName,
        ReceivedDate,
        Receivedat3P,
        FaultCode,
        ShipDate,
        ServiceBulletinInfo,
        ServiceBulletinNumber,
        ServiceBulletin,
        AlertCategoryCode,
        InsertDate
    FROM RCS_DWH.dbo.RepairRMA
    WHERE PN IS NOT NULL
    ORDER BY ReceivedDate DESC
"""

# Database connection
connection = pyodbc.connect(
    f'DRIVER=ODBC Driver 17 for SQL Server;SERVER={RA_DB_HOST},{RA_DB_PORT};DATABASE={RA_DB_NAME};UID={RA_DB_USER};PWD={RA_DB_PASSWORD}'
)

# Run the query; the finally clause releases the connection even when the
# query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

# The "hist_" prefix marks this file as the historical RMA extract
output_path = os.path.join(data_dir, 'hist_repair_rma.csv')

# Save to CSV
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")

# Gather Parts Information

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import psycopg2
import os

# Export product master data from PostgreSQL to
# private/data/raw/productinfo.csv.
# NOTE(review): the earlier product-information cell of this notebook writes
# to the same productinfo.csv path with a query that additionally selects
# productinformation.pmastatus. Running this cell afterwards replaces that
# file with a version lacking the pmastatus column — confirm whether this
# cell is a leftover duplicate or should write to its own file.

# Load the PostgreSQL connection settings from the .env file / environment
load_dotenv()
DB_HOST = os.getenv('DB_HOST')
DB_NAME = os.getenv('DB_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_PORT = os.getenv('DB_PORT', '5432')

# Every product row, left-joined by partnumber to its information,
# power, lifecycle, milestone, reliability and weight tables.
# NOTE(review): productbase is joined but none of its columns are selected;
# if it holds several rows per partnumber it multiplies the output rows —
# confirm the join is needed.
query = """
SELECT 
    -- Basic identification
    product.partnumber,
    productinformation.productname,
    product.producttype,
    
    -- Product categorization
    productinformation.productfamily,
    productpcddetails.productgroup,
    productinformation.definitionlevel,
    productinformation.lrucategoryclass,
    productpcddetails.conformitydescription,
    
    -- Manufacturing information
    productinformation.manufacturingtype,
    productinformation.repairtype,
    
    -- Production status
    productinformation.newdesignrecommendation,
    productinformation.useequipmentfamily,
    
    -- Technical specifications
    productinputpower.operatingmode,
    productinputpower.maximuminputpower,
    productlruspecifics.internalstorage,
    productlruspecifics.resolution,
    productlruspecifics.frontpanelusb,
    productlruspecifics.oneethernetport,
    productlruspecifics.bajfunctionality,
    
    -- Lifecycle information
    productlifecyclephase.lifecyclephase,
    productmilestone.milestone,
    productmilestone.milestonedate,
    productmilestone.milestonestatus,
    productmilestone.actualdate,
    productmilestone.notes,
    
    -- Manager information
    productinformation.linemanager,
    
    -- Performance/reliability metrics
    productreliability.contractualmtbf,
    productpower.engineering AS power_engineering,
    productpower.marketing AS power_marketing,
    productpower.contractual AS power_contractual,
    productweight.engineering AS weight_engineering,
    productweight.marketing AS weight_marketing,
    productweight.contractual AS weight_contractual
FROM product
LEFT JOIN productbase ON product.partnumber = productbase.partnumber
LEFT JOIN productinformation ON product.partnumber = productinformation.partnumber
LEFT JOIN productinputpower ON product.partnumber = productinputpower.partnumber
LEFT JOIN productlifecyclephase ON product.partnumber = productlifecyclephase.partnumber
LEFT JOIN productlruspecifics ON product.partnumber = productlruspecifics.partnumber
LEFT JOIN productmilestone ON product.partnumber = productmilestone.partnumber
LEFT JOIN productpcddetails ON product.partnumber = productpcddetails.partnumber
LEFT JOIN productpower ON product.partnumber = productpower.partnumber 
LEFT JOIN productreliability ON product.partnumber = productreliability.partnumber 
LEFT JOIN productweight ON product.partnumber = productweight.partnumber
"""

# Database connection
connection = psycopg2.connect(
    host=DB_HOST,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    port=DB_PORT
)

# Execute query and load directly to pandas. The finally clause releases the
# connection even when the query fails.
try:
    df = pd.read_sql_query(query, connection)
finally:
    connection.close()

# Create the output directory (no error if it already exists)
data_dir = os.path.join('private', 'data', 'raw')
os.makedirs(data_dir, exist_ok=True)

output_path = os.path.join(data_dir, 'productinfo.csv')
df.to_csv(output_path, index=False)

# Report the column names, the row count and where the file was written
print("\nColumns in the dataset:")
print(df.columns.tolist())
print(f"Rows: {len(df)}")
print(f"\nDataset saved to {output_path}")