In [15]:
# Import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.functions import regexp_extract


In [2]:
# Create a Spark Session
spark = SparkSession.builder\
    .appName("Automotive Chip Reliability ETL")\
        .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/02 16:11:51 WARN Utils: Your hostname, Evans-MacBook-Pro-2.local, resolves to a loopback address: 127.0.0.1; using 192.168.0.149 instead (on interface en0)
25/06/02 16:11:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/02 16:11:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the CSV file into a Spark DataFrame
raw_chip_df = spark.read.csv('data/automotive_chips_reliability_dataset.csv', header=True, inferSchema=True)

Row(Chip ID='Chip001', Technology Node=5, FSDOI Type='Thick Layer', Gate Density=989, Substrate Type='Silicon', Power Consumption=3.12, Thermal Dispersion Efficiency=70.364436513031, Mechanical Stress Resistance=195.48182400614584, Temperature Range='-20°C to 85°C', Humidity Levels=80, Vibration Levels='Medium', External Load Stress=48, Thermal Stress=124.04184890445367, Mechanical Stress=219.88458531609044, Electrical Density=49.10287370072312, Electromigration Risk=0.03273841845647966, Deformation=0.4980688861428193, Thermal Test Result=33.00966001869167, Mechanical Stress Test Result=145.85045256562228, Failure Mode='Thermal Failure', Failure Rate (%)=4.069810220267985, MTTF (hours)=2482, Reliability Score=7.033093793285942, Failure Rate Comparison='2x Reduced', Statistical Significance='p-value < 0.05')

In [5]:
raw_chip_df.printSchema()

root
 |-- Chip ID: string (nullable = true)
 |-- Technology Node: integer (nullable = true)
 |-- FSDOI Type: string (nullable = true)
 |-- Gate Density: integer (nullable = true)
 |-- Substrate Type: string (nullable = true)
 |-- Power Consumption: double (nullable = true)
 |-- Thermal Dispersion Efficiency: double (nullable = true)
 |-- Mechanical Stress Resistance: double (nullable = true)
 |-- Temperature Range: string (nullable = true)
 |-- Humidity Levels: integer (nullable = true)
 |-- Vibration Levels: string (nullable = true)
 |-- External Load Stress: integer (nullable = true)
 |-- Thermal Stress: double (nullable = true)
 |-- Mechanical Stress: double (nullable = true)
 |-- Electrical Density: double (nullable = true)
 |-- Electromigration Risk: double (nullable = true)
 |-- Deformation: double (nullable = true)
 |-- Thermal Test Result: double (nullable = true)
 |-- Mechanical Stress Test Result: double (nullable = true)
 |-- Failure Mode: string (nullable = true)
 |-- Failu

In [6]:
raw_chip_df.show(5)

+-------+---------------+-----------+------------+-----------------+-----------------+-----------------------------+----------------------------+-----------------+---------------+----------------+--------------------+------------------+------------------+------------------+---------------------+-------------------+-------------------+-----------------------------+------------------+-----------------+------------+-----------------+-----------------------+------------------------+
|Chip ID|Technology Node| FSDOI Type|Gate Density|   Substrate Type|Power Consumption|Thermal Dispersion Efficiency|Mechanical Stress Resistance|Temperature Range|Humidity Levels|Vibration Levels|External Load Stress|    Thermal Stress| Mechanical Stress|Electrical Density|Electromigration Risk|        Deformation|Thermal Test Result|Mechanical Stress Test Result|      Failure Mode| Failure Rate (%)|MTTF (hours)|Reliability Score|Failure Rate Comparison|Statistical Significance|
+-------+---------------+-------

In [7]:
raw_chip_df.columns

['Chip ID',
 'Technology Node',
 'FSDOI Type',
 'Gate Density',
 'Substrate Type',
 'Power Consumption',
 'Thermal Dispersion Efficiency',
 'Mechanical Stress Resistance',
 'Temperature Range',
 'Humidity Levels',
 'Vibration Levels',
 'External Load Stress',
 'Thermal Stress',
 'Mechanical Stress',
 'Electrical Density',
 'Electromigration Risk',
 'Deformation',
 'Thermal Test Result',
 'Mechanical Stress Test Result',
 'Failure Mode',
 'Failure Rate (%)',
 'MTTF (hours)',
 'Reliability Score',
 'Failure Rate Comparison',
 'Statistical Significance']

In [10]:
# Before we verify the exact meaning of these columns and what they 
# represent, we run some typical/standard transformations 

# Standardize column names
raw_chip_df = raw_chip_df.select([col(c).alias(c.lower()\
    .replace(" ", "_").replace("(", "").replace(")", "")) \
        for c in raw_chip_df.columns])

In [13]:
# Handle missing values
raw_chip_df.select([F.count(F.when(F.col(c).isNull(), \
    c)).alias(c) for c in raw_chip_df.columns]).show()

+-------+---------------+----------+------------+--------------+-----------------+-----------------------------+----------------------------+-----------------+---------------+----------------+--------------------+--------------+-----------------+------------------+---------------------+-----------+-------------------+-----------------------------+------------+--------------+----------+-----------------+-----------------------+------------------------+
|chip_id|technology_node|fsdoi_type|gate_density|substrate_type|power_consumption|thermal_dispersion_efficiency|mechanical_stress_resistance|temperature_range|humidity_levels|vibration_levels|external_load_stress|thermal_stress|mechanical_stress|electrical_density|electromigration_risk|deformation|thermal_test_result|mechanical_stress_test_result|failure_mode|failure_rate_%|mttf_hours|reliability_score|failure_rate_comparison|statistical_significance|
+-------+---------------+----------+------------+--------------+-----------------+------

In [16]:
# Split or Parse Composite Columns
raw_chip_df = raw_chip_df.withColumn("temp_min", regexp_extract\
    ("temperature_range", r"(-?\d+)", 1).cast("int"))
raw_chip_df = raw_chip_df.withColumn("temp_max", regexp_extract\
    ("temperature_range", r"to\s*(-?\d+)", 1).cast("int"))

In [18]:
raw_chip_df.printSchema()

root
 |-- chip_id: string (nullable = true)
 |-- technology_node: integer (nullable = true)
 |-- fsdoi_type: string (nullable = true)
 |-- gate_density: integer (nullable = true)
 |-- substrate_type: string (nullable = true)
 |-- power_consumption: double (nullable = true)
 |-- thermal_dispersion_efficiency: double (nullable = true)
 |-- mechanical_stress_resistance: double (nullable = true)
 |-- temperature_range: string (nullable = true)
 |-- humidity_levels: integer (nullable = true)
 |-- vibration_levels: string (nullable = true)
 |-- external_load_stress: integer (nullable = true)
 |-- thermal_stress: double (nullable = true)
 |-- mechanical_stress: double (nullable = true)
 |-- electrical_density: double (nullable = true)
 |-- electromigration_risk: double (nullable = true)
 |-- deformation: double (nullable = true)
 |-- thermal_test_result: double (nullable = true)
 |-- mechanical_stress_test_result: double (nullable = true)
 |-- failure_mode: string (nullable = true)
 |-- failu

25/06/02 18:56:21 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 126649 ms exceeds timeout 120000 ms
25/06/02 18:56:21 WARN SparkContext: Killing executors is not supported by current scheduler.
25/06/02 18:56:21 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

## Automotive Chips Reliability Data Dictionary

|          Column Name          | Data Type | Description |
|-------------------------------|-------------|-----------|
| chip_id                       | string  | A unique identifier for each chip in the dataset. |
| technology_node               | integer | The process node used in the chip fabrication (e.g., 14nm, 7nm, 5nm) |
| fsdoi_type                    | string  | The type of Fully Depleted Silicon on Insulator (FSDOI) used in the chip design (e.g., Thin Layer, Thick Layer) |
| gate_density                  | integer | The density of logic gates on the chip, measured in gates per unit area. |
| substrate_type                | string  | The type of substrate used in the chip (e.g., Silicon, Silicon-Germanium).|
| power_consumption             | double  | The chip's power consumption, measured in watts. |
| thermal_dispersion_efficiency | double  | A measure of the chip's ability to dissipate heat, expressed as a percentage. |
| mechanical_stress_resistance  | double  | The ability of the chip to resist mechanical stress, measured in megapascals (MPa). |
| temperature_range             | string  | The operating temperature range the chip is designed to withstand (e.g., -40°C to 125°C). |
| humidity_levels               | integer | The level of humidity the chip can endure, expressed as a percentage (e.g., 20%, 50%, 80%). |
| vibration_levels              | string  | The expected vibration levels the chip can withstand (e.g., Low, Medium, High). |
| external_load_stress          | integer | The amount of external mechanical load the chip is designed to handle, measured in newtons (N). |
| thermal_stress                | double  | The thermal stress experienced by the chip, measured in megapascals (MPa). |
| mechanical_stress             | double  | The mechanical stress experienced by the chip, measured in megapascals (MPa). |
| electrical_density            | double  | The electrical density on the chip, measured in amperes per unit area. |
| electromigration_risk         | double  | The risk of electromigration occurring in the chip, expressed as a percentage or a scale from 0 to 1. |
| deformation                   | double  | The amount of deformation the chip undergoes under stress, measured in microns. |
| thermal_test_result           | double  | The result of thermal testing, measured in degrees Celsius (°C), indicating how well the chip handles thermal stress. |
| mechanical_stress_test_result | double  | The result of mechanical stress testing, measured in megapascals (MPa). |
| failure_mode                  | string  |  The type of failure observed during testing (e.g., Thermal Failure, Mechanical Failure, Electromigration). |
| failure_rate_                 | double  | The rate of failure for the chip, expressed as a percentage. |
| mttf_hours                    | integer | The Mean Time to Failure, measured in hours, indicating the expected operational lifetime of the chip before failure. |
| reliability_score             | double  | A score assigned to the chip based on its reliability, ranging from 0 to 10. |
| failure_rate_comparison       | string  | A comparison of the failure rates between chips optimized with FSDOI techniques and those without, with categories such as "2x Reduced," "1.5x Reduced," or "No Change." |
| statistical_significance      | string  | The statistical significance of the results, typically indicated by p-values (e.g., p-value < 0.05 for statistically significant results). | 
| temp_min                      | integer | I assume this is minimum temp it can withstand |
| temp_max                      | integer | I assume this is max temp it can withstand |