In [20]:
# Import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_extract, percent_rank
from pyspark.sql import functions as F
from pyspark.sql.window import Window



In [2]:
# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Automotive Chip Reliability ETL")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/03 15:36:12 WARN Utils: Your hostname, Evans-MacBook-Pro-2.local, resolves to a loopback address: 127.0.0.1; using 192.168.0.149 instead (on interface en0)
25/06/03 15:36:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/03 15:36:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the chip reliability CSV into a Spark DataFrame. The first row supplies
# the column names and Spark infers each column's type from the data.
raw_chip_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('data/automotive_chips_reliability_dataset.csv')
)

In [4]:
# Inspect the column names and the types Spark inferred when reading the CSV.
raw_chip_df.printSchema()

root
 |-- Chip ID: string (nullable = true)
 |-- Technology Node: integer (nullable = true)
 |-- FSDOI Type: string (nullable = true)
 |-- Gate Density: integer (nullable = true)
 |-- Substrate Type: string (nullable = true)
 |-- Power Consumption: double (nullable = true)
 |-- Thermal Dispersion Efficiency: double (nullable = true)
 |-- Mechanical Stress Resistance: double (nullable = true)
 |-- Temperature Range: string (nullable = true)
 |-- Humidity Levels: integer (nullable = true)
 |-- Vibration Levels: string (nullable = true)
 |-- External Load Stress: integer (nullable = true)
 |-- Thermal Stress: double (nullable = true)
 |-- Mechanical Stress: double (nullable = true)
 |-- Electrical Density: double (nullable = true)
 |-- Electromigration Risk: double (nullable = true)
 |-- Deformation: double (nullable = true)
 |-- Thermal Test Result: double (nullable = true)
 |-- Mechanical Stress Test Result: double (nullable = true)
 |-- Failure Mode: string (nullable = true)
 |-- Failu

In [5]:
# Preview the first 5 rows of the raw data.
raw_chip_df.show(5)

+-------+---------------+-----------+------------+-----------------+-----------------+-----------------------------+----------------------------+-----------------+---------------+----------------+--------------------+------------------+------------------+------------------+---------------------+-------------------+-------------------+-----------------------------+------------------+-----------------+------------+-----------------+-----------------------+------------------------+
|Chip ID|Technology Node| FSDOI Type|Gate Density|   Substrate Type|Power Consumption|Thermal Dispersion Efficiency|Mechanical Stress Resistance|Temperature Range|Humidity Levels|Vibration Levels|External Load Stress|    Thermal Stress| Mechanical Stress|Electrical Density|Electromigration Risk|        Deformation|Thermal Test Result|Mechanical Stress Test Result|      Failure Mode| Failure Rate (%)|MTTF (hours)|Reliability Score|Failure Rate Comparison|Statistical Significance|
+-------+---------------+-------

In [6]:
# List the column names exactly as they appear in the CSV header.
raw_chip_df.columns

['Chip ID',
 'Technology Node',
 'FSDOI Type',
 'Gate Density',
 'Substrate Type',
 'Power Consumption',
 'Thermal Dispersion Efficiency',
 'Mechanical Stress Resistance',
 'Temperature Range',
 'Humidity Levels',
 'Vibration Levels',
 'External Load Stress',
 'Thermal Stress',
 'Mechanical Stress',
 'Electrical Density',
 'Electromigration Risk',
 'Deformation',
 'Thermal Test Result',
 'Mechanical Stress Test Result',
 'Failure Mode',
 'Failure Rate (%)',
 'MTTF (hours)',
 'Reliability Score',
 'Failure Rate Comparison',
 'Statistical Significance']

In [7]:
# Before we verify the exact meaning of these columns and what they
# represent, we run some typical/standard transformations.

# Standardize column names: lower-case, spaces become underscores, and
# parentheses are removed. Other punctuation is kept as-is, so
# "Failure Rate (%)" becomes "failure_rate_%".
_name_cleanup = str.maketrans({" ": "_", "(": None, ")": None})
renamed_columns = [
    col(original).alias(original.lower().translate(_name_cleanup))
    for original in raw_chip_df.columns
]
raw_chip_df = raw_chip_df.select(renamed_columns)

In [8]:
# Check for missing values: count the nulls in every column.
# (This cell only reports the counts; it does not fill or drop anything.)
null_count_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in raw_chip_df.columns
]
raw_chip_df.select(null_count_exprs).show()

+-------+---------------+----------+------------+--------------+-----------------+-----------------------------+----------------------------+-----------------+---------------+----------------+--------------------+--------------+-----------------+------------------+---------------------+-----------+-------------------+-----------------------------+------------+--------------+----------+-----------------+-----------------------+------------------------+
|chip_id|technology_node|fsdoi_type|gate_density|substrate_type|power_consumption|thermal_dispersion_efficiency|mechanical_stress_resistance|temperature_range|humidity_levels|vibration_levels|external_load_stress|thermal_stress|mechanical_stress|electrical_density|electromigration_risk|deformation|thermal_test_result|mechanical_stress_test_result|failure_mode|failure_rate_%|mttf_hours|reliability_score|failure_rate_comparison|statistical_significance|
+-------+---------------+----------+------------+--------------+-----------------+------

In [9]:
# Split the composite 'temperature_range' string into two integer columns.
#   temp_min: the first (optionally negative) integer found in the string
#   temp_max: the (optionally negative) integer that follows "to"
# regexp_extract yields "" when the pattern does not match, which the int
# cast turns into null.
raw_chip_df = (
    raw_chip_df
    .withColumn(
        "temp_min",
        regexp_extract("temperature_range", r"(-?\d+)", 1).cast("int"),
    )
    .withColumn(
        "temp_max",
        regexp_extract("temperature_range", r"to\s*(-?\d+)", 1).cast("int"),
    )
)

In [10]:
# Re-check the schema after renaming the columns and adding temp_min / temp_max.
raw_chip_df.printSchema()

root
 |-- chip_id: string (nullable = true)
 |-- technology_node: integer (nullable = true)
 |-- fsdoi_type: string (nullable = true)
 |-- gate_density: integer (nullable = true)
 |-- substrate_type: string (nullable = true)
 |-- power_consumption: double (nullable = true)
 |-- thermal_dispersion_efficiency: double (nullable = true)
 |-- mechanical_stress_resistance: double (nullable = true)
 |-- temperature_range: string (nullable = true)
 |-- humidity_levels: integer (nullable = true)
 |-- vibration_levels: string (nullable = true)
 |-- external_load_stress: integer (nullable = true)
 |-- thermal_stress: double (nullable = true)
 |-- mechanical_stress: double (nullable = true)
 |-- electrical_density: double (nullable = true)
 |-- electromigration_risk: double (nullable = true)
 |-- deformation: double (nullable = true)
 |-- thermal_test_result: double (nullable = true)
 |-- mechanical_stress_test_result: double (nullable = true)
 |-- failure_mode: string (nullable = true)
 |-- failu

## Automotive Chips Reliability Data Dictionary

|          Column Name          | Data Type | Description |
|-------------------------------|-------------|-----------|
| chip_id                       | string  | A unique identifier for each chip in the dataset. |
| technology_node               | integer | The process node used in the chip fabrication (e.g., 14nm, 7nm, 5nm) |
| fsdoi_type                    | string  | The type of Fully Depleted Silicon on Insulator (FSDOI) used in the chip design (e.g., Thin Layer, Thick Layer) |
| gate_density                  | integer | The density of logic gates on the chip, measured in gates per unit area. |
| substrate_type                | string  | The type of substrate used in the chip (e.g., Silicon, Silicon-Germanium).|
| power_consumption             | double  | The chip's power consumption, measured in watts. |
| thermal_dispersion_efficiency | double  | A measure of the chip's ability to dissipate heat, expressed as a percentage. |
| mechanical_stress_resistance  | double  | The ability of the chip to resist mechanical stress, measured in megapascals (MPa). |
| temperature_range             | string  | The operating temperature range the chip is designed to withstand (e.g., -40°C to 125°C). |
| humidity_levels               | integer | The level of humidity the chip can endure, expressed as a percentage (e.g., 20%, 50%, 80%). |
| vibration_levels              | string  | The expected vibration levels the chip can withstand (e.g., Low, Medium, High). |
| external_load_stress          | integer | The amount of external mechanical load the chip is designed to handle, measured in newtons (N). |
| thermal_stress                | double  | The thermal stress experienced by the chip, measured in megapascals (MPa). |
| mechanical_stress             | double  | The mechanical stress experienced by the chip, measured in megapascals (MPa). |
| electrical_density            | double  | The electrical density on the chip, measured in amperes per unit area. |
| electromigration_risk         | double  | The risk of electromigration occurring in the chip, expressed as a percentage or a scale from 0 to 1. |
| deformation                   | double  | The amount of deformation the chip undergoes under stress, measured in microns. |
| thermal_test_result           | double  | The result of thermal testing, measured in degrees Celsius (°C), indicating how well the chip handles thermal stress. |
| mechanical_stress_test_result | double  | The result of mechanical stress testing, measured in megapascals (MPa). |
| failure_mode                  | string  |  The type of failure observed during testing (e.g., Thermal Failure, Mechanical Failure, Electromigration). |
| failure_rate_%                | double  | The rate of failure for the chip, expressed as a percentage. |
| mttf_hours                    | integer | The Mean Time to Failure, measured in hours, indicating the expected operational lifetime of the chip before failure. |
| reliability_score             | double  | A score assigned to the chip based on its reliability, ranging from 0 to 10. |
| failure_rate_comparison       | string  | A comparison of the failure rates between chips optimized with FSDOI techniques and those without, with categories such as "2x Reduced," "1.5x Reduced," or "No Change." |
| statistical_significance      | string  | The statistical significance of the results, typically indicated by p-values (e.g., p-value < 0.05 for statistically significant results). | 
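| temp_min                      | integer | Lower bound of the operating temperature range in °C, parsed from the first number in the original `temperature_range` string. |
| temp_max                      | integer | Upper bound of the operating temperature range in °C, parsed from the number following "to" in the original `temperature_range` string. |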

### Transformation: Feature Engineering

In [14]:
# The original string 'temperature_range' column is redundant: its content
# already lives in the integer 'temp_min' and 'temp_max' columns, which are
# more useful. Drop it, then re-create 'temperature_range' as the numeric
# span (temp_max - temp_min) of temperatures the chip can withstand.
# Dropping first means the new column is appended at the end of the frame.
raw_chip_df = (
    raw_chip_df
    .drop('temperature_range')
    .withColumn('temperature_range', col('temp_max') - col('temp_min'))
)

In [16]:
# Stress Ratios
# Both ratios share the same denominator: the chip's mechanical stress resistance.
stress_resistance = col('mechanical_stress_resistance')

raw_chip_df = (
    raw_chip_df
    # Thermal Stress Ratio
    #   = thermal stress / mechanical stress resistance
    #   How much of the chip's mechanical resistance is being used up
    #   by the thermal stress.
    .withColumn('thermal_stress_ratio', col('thermal_stress') / stress_resistance)
    # Mechanical Stress Ratio
    #   = mechanical stress / mechanical stress resistance
    #   How much of the chip's mechanical resistance is being used up
    #   by the mechanical stress.
    .withColumn('mechanical_stress_ratio', col('mechanical_stress') / stress_resistance)
)

In [17]:
# Power Efficiency Metrics
raw_chip_df = (
    raw_chip_df
    # Power Density
    #   = power consumption / gate density
    #   How much power each logic gate is using, i.e. how dense the power
    #   consumption is per unit of logic.
    #   Interpretation:
    #     lower values  = better efficiency (less power used per gate)
    #     higher values = more power per gate; may indicate an inefficient
    #                     design or a high-performance/high-power chip
    .withColumn('power_density', col('power_consumption') / col('gate_density'))
    # Thermal Efficiency Ratio
    #   = thermal dispersion efficiency / power consumption
    #   How effective the chip is at dissipating heat relative to how much
    #   power it uses.
    #   Interpretation:
    #     lower values  = may signal the chip runs hotter or is thermally
    #                     inefficient
    #     higher values = efficient thermal design, i.e. the chip dissipates
    #                     more heat for each unit of power it consumes
    .withColumn(
        'thermal_efficiency_ratio',
        col('thermal_dispersion_efficiency') / col('power_consumption'),
    )
)



In [22]:
# Electromigration Risk Category
#   Bucket the continuous 'electromigration_risk' value into Low / Medium /
#   High so chips are easier to segment, analyse and interpret.

# Order every row by 'electromigration_risk' (ascending).
# NOTE(review): the window has no partitionBy, so Spark ranks all rows in a
# single partition - fine for a small dataset, confirm before scaling up.
windowSpec = Window.orderBy(col('electromigration_risk'))

# 'risk_percentile': each chip's relative position (0.0 - 1.0) in that ordering.
ranked_df = raw_chip_df.withColumn(
    'risk_percentile', percent_rank().over(windowSpec)
)

# 'electromigration_risk_category': string label derived from the percentile.
# Branches are evaluated in order, so the second branch only ever sees
# percentiles above 0.33.
risk_pct = col('risk_percentile')
risk_label = (
    when(risk_pct <= 0.33, "Low")
    .when(risk_pct <= 0.66, "Medium")
    .otherwise("High")
)
raw_chip_df = ranked_df.withColumn('electromigration_risk_category', risk_label)



In [25]:
# Failure Rate Comparison Encoding
#   'failure_rate_comparison' compares the failure rates of chips optimized
#   with FSDOI techniques against those without.
#   Convert the categorical labels into the numeric reduction factor they
#   describe: "2x Reduced" -> 2.0, "1.5x Reduced" -> 1.5, "No Change" -> 1.0.
#   Any other (or missing) label becomes null.

# The result must be assigned back to raw_chip_df (not a differently named
# variable); otherwise the encoded column is missing from every later cell.
raw_chip_df = raw_chip_df.withColumn(
    "failure_rate_comparison_encoded",
    when(col("failure_rate_comparison") == "2x Reduced", 2.0)
    .when(col("failure_rate_comparison") == "1.5x Reduced", 1.5)
    .when(col("failure_rate_comparison") == "No Change", 1.0)
    .otherwise(None)
)

In [29]:
# Stress Test Delta
#   Difference (delta) between each test result and the corresponding
#   stress value recorded for the chip.
#
#   thermal test delta    = thermal test result - thermal stress
#   mechanical test delta = mechanical stress test result - mechanical stress
#
# NOTE(review): the data dictionary lists thermal_test_result in degrees
# Celsius and thermal_stress in MPa, so the thermal delta mixes units -
# confirm this is the intended comparison.
raw_chip_df = (
    raw_chip_df
    .withColumn(
        'thermal_test_delta',
        col('thermal_test_result') - col('thermal_stress'),
    )
    .withColumn(
        'mechanical_test_delta',
        col('mechanical_stress_test_result') - col('mechanical_stress'),
    )
)

In [30]:
# Deformation per Stress Unit
#   = deformation / mechanical stress
#   How much the chip deforms per unit of mechanical stress.
#   Interpretation:
#     higher value = less resistant / prone to physical warping or failure
#     lower value  = more resistant / more structurally robust under load
deformation_ratio = col('deformation') / col('mechanical_stress')
raw_chip_df = raw_chip_df.withColumn('deformation_per_stress', deformation_ratio)


In [31]:
# Vibration Level Numeric Encoding
#   'vibration_levels' holds the expected vibration level the chip can
#   withstand as a label. Map it to an ordinal number for modeling:
#   Low -> 1, Medium -> 2, High -> 3; any other (or missing) label -> null.
vibration_label = col("vibration_levels")
vibration_code = (
    when(vibration_label == "Low", 1)
    .when(vibration_label == "Medium", 2)
    .when(vibration_label == "High", 3)
    .otherwise(None)
)
raw_chip_df = raw_chip_df.withColumn("vibration_level_encoded", vibration_code)

In [32]:
# Failure Mode Binary Flags (One-Hot Encoding)
#   'failure_mode' is a string naming the reason for the failure.
#   Add one binary flag column (0/1) per failure mode for filtering or
#   modeling. A row gets 1 only when its failure_mode equals the label
#   exactly; everything else (including a missing value) gets 0.
failure_mode_flags = {
    "failure_mode_thermal": "Thermal Failure",
    "failure_mode_mechanical": "Mechanical Failure",
    "failure_mode_electromigration": "Electromigration",
}
for flag_column, mode_label in failure_mode_flags.items():
    raw_chip_df = raw_chip_df.withColumn(
        flag_column,
        when(col("failure_mode") == mode_label, 1).otherwise(0),
    )
