# Data Exploration

**Objective:** Explore the NASA C-MAPSS turbofan engine degradation dataset

**Dataset:** FD001 - Run-to-failure simulations  
**Features:** 21 sensors + 3 operational settings  
**Target:** Remaining Useful Life (RUL) in cycles

In [None]:
# --------------------------------------------------------------------------
# Import required libraries for data analysis and visualization
# --------------------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the white-grid matplotlib style sheet to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need
# matplotlib >= 3.6 -- confirm against the environment's pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# Make 'husl' the default seaborn colour palette for subsequent plots.
sns.set_palette('husl')
%matplotlib inline

In [None]:
# --------------------------------------------------------------------------
# Load training data from NASA C-MAPSS dataset
# --------------------------------------------------------------------------

# Column layout: engine id, cycle counter, 3 operational settings, 21 sensors.
column_names = (
    ['unit_id', 'cycle']
    + [f'op_setting_{i}' for i in range(1, 4)]
    + [f'sensor_{i}' for i in range(1, 22)]
)

# The file is read without a header row and split on runs of whitespace.
# The separator is a raw string so that `\s` reaches pandas as a regex token
# instead of being parsed as an (invalid) Python string escape, which emits a
# warning on recent Python versions.
train = pd.read_csv('CMaps/train_FD001.txt', sep=r'\s+', header=None, names=column_names)

print(f"Dataset shape: {train.shape}")
print(f"Total samples: {train.shape[0]:,}")

In [None]:
# --------------------------------------------------------------------------
# Engine lifecycles: count the engines and summarise how long each one ran
# --------------------------------------------------------------------------

# The highest cycle number recorded for a unit is used as that engine's lifespan.
cycles_per_engine = train.groupby('unit_id')['cycle'].max()
n_engines = train['unit_id'].nunique()

first_cycle = train['cycle'].min()
last_cycle = train['cycle'].max()

print(f"Number of engines: {n_engines}")
print(f"Cycle range: {first_cycle} to {last_cycle}")
print("\nEngine lifespan statistics:")

# Extremes are printed as-is; the central-tendency figures use one decimal.
print(f"  Min: {cycles_per_engine.min()} cycles")
print(f"  Max: {cycles_per_engine.max()} cycles")
for stat_label, stat_value in (
    ('Mean', cycles_per_engine.mean()),
    ('Median', cycles_per_engine.median()),
):
    print(f"  {stat_label}: {stat_value:.1f} cycles")

In [None]:
# --------------------------------------------------------------------------
# Histogram of engine lifespans with mean and median reference lines
# --------------------------------------------------------------------------
lifespan_mean = cycles_per_engine.mean()
lifespan_median = cycles_per_engine.median()

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(cycles_per_engine, bins=20, edgecolor='black', alpha=0.7)

# One dashed vertical marker per summary statistic, labelled for the legend.
for marker_name, marker_value, marker_colour in (
    ('Mean', lifespan_mean, 'red'),
    ('Median', lifespan_median, 'green'),
):
    ax.axvline(
        marker_value,
        color=marker_colour,
        linestyle='--',
        linewidth=2,
        label=f'{marker_name}: {marker_value:.0f}',
    )

ax.set_title('Distribution of Engine Lifespans', fontsize=14, fontweight='bold')
ax.set_xlabel('Engine Lifespan (cycles)', fontsize=12)
ax.set_ylabel('Number of Engines', fontsize=12)
ax.grid(True, alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# --------------------------------------------------------------------------
# Add a Remaining Useful Life (RUL) column to every row
# RUL = (last cycle recorded for the engine) - (cycle of the current row)
# --------------------------------------------------------------------------

# One row per engine holding the highest cycle number seen for that unit.
max_cycles = (
    train.groupby('unit_id', as_index=False)['cycle']
    .max()
    .rename(columns={'cycle': 'max_cycle'})
)

# Attach the per-engine maximum, derive RUL from it, then discard the
# temporary helper column again.
train = (
    train.merge(max_cycles, on='unit_id', how='left')
    .assign(RUL=lambda frame: frame['max_cycle'] - frame['cycle'])
    .drop(columns='max_cycle')
)

print("RUL statistics:")
print(train['RUL'].describe())

In [None]:
# --------------------------------------------------------------------------
# RUL curve for a single engine (unit 1), ordered by cycle
# --------------------------------------------------------------------------
is_engine_1 = train['unit_id'] == 1
engine_1 = train.loc[is_engine_1].sort_values('cycle')

curve_colour = '#e74c3c'
cycle_axis = engine_1['cycle']
rul_axis = engine_1['RUL']

fig, ax = plt.subplots(figsize=(12, 5))

# Solid line for the RUL values plus a translucent fill underneath it.
ax.plot(cycle_axis, rul_axis, linewidth=2.5, color=curve_colour)
ax.fill_between(cycle_axis, rul_axis, alpha=0.3, color=curve_colour)

ax.set_title('Engine 1: RUL Degradation Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Cycle', fontsize=12)
ax.set_ylabel('Remaining Useful Life (RUL)', fontsize=12)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# --------------------------------------------------------------------------
# Compare RUL degradation patterns across multiple engines
# --------------------------------------------------------------------------

# Number of engines to overlay and the seed that fixes which ones are drawn.
# A seeded generator makes the figure reproducible from run to run.
N_SAMPLE_ENGINES = 10
RANDOM_SEED = 42

engine_ids = train['unit_id'].unique()

# Never request more engines than exist: sampling without replacement raises
# a ValueError when the requested size exceeds the population.
sample_size = min(N_SAMPLE_ENGINES, len(engine_ids))
rng = np.random.default_rng(RANDOM_SEED)
sample_engines = rng.choice(engine_ids, size=sample_size, replace=False)

fig, ax = plt.subplots(figsize=(12, 6))

for engine_id in sample_engines:
    engine_data = train[train['unit_id'] == engine_id].sort_values('cycle')
    ax.plot(engine_data['cycle'], engine_data['RUL'],
            linewidth=2, alpha=0.6, label=f'Engine {engine_id}')

ax.set_xlabel('Cycle', fontsize=12)
ax.set_ylabel('Remaining Useful Life (RUL)', fontsize=12)
# The title reports the number of engines actually plotted.
ax.set_title(f'RUL Degradation: Sample of {sample_size} Engines', fontsize=14, fontweight='bold')
# Anchor the legend outside the axes so it does not cover the curves.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# --------------------------------------------------------------------------
# Sensor data: collect every 'sensor_*' column and show summary statistics
# --------------------------------------------------------------------------

is_sensor_column = train.columns.str.startswith('sensor_')
sensor_cols = train.columns[is_sensor_column].tolist()
print(f"Total sensors: {len(sensor_cols)}")

# Left as the final bare expression so the notebook renders the table.
train.loc[:, sensor_cols].describe()

In [None]:
# --------------------------------------------------------------------------
# Rank sensors by variance and split them at a fixed cut-off
# Sensors whose variance is below 0.001 are treated as constant; the rest
# are kept as useful
# --------------------------------------------------------------------------

per_sensor_variance = train[sensor_cols].var()
sensor_variance = per_sensor_variance.sort_values(ascending=False)

print("Sensor variance (sorted):")
print(sensor_variance)

# Both lists keep the descending-variance ordering of `sensor_variance`.
below_cutoff = sensor_variance.lt(0.001)
at_or_above_cutoff = sensor_variance.ge(0.001)
constant_sensors = sensor_variance.loc[below_cutoff].index.tolist()
useful_sensors = sensor_variance.loc[at_or_above_cutoff].index.tolist()

print(f"\nConstant sensors (variance < 0.001): {len(constant_sensors)}")
print(f"Useful sensors: {len(useful_sensors)}")

In [None]:
# --------------------------------------------------------------------------
# Bar chart of per-sensor variance with the 0.001 cut-off drawn on top
# --------------------------------------------------------------------------

fig, ax = plt.subplots(figsize=(12, 6))

sensor_variance.plot.bar(ax=ax, color='steelblue', alpha=0.7)

# Horizontal reference line at the variance cut-off.
ax.axhline(
    y=0.001,
    color='red',
    linestyle='--',
    linewidth=2,
    label='Threshold (0.001)',
)

ax.set_title('Sensor Variance Analysis', fontsize=14, fontweight='bold')
ax.set_xlabel('Sensor', fontsize=12)
ax.set_ylabel('Variance', fontsize=12)
ax.grid(True, alpha=0.3, axis='y')
ax.legend()
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# --------------------------------------------------------------------------
# 2x3 grid of sensor traces over time for engine 1
# --------------------------------------------------------------------------

# The first six entries of `useful_sensors` -- one per panel.
sensors_to_plot = useful_sensors[:6]

engine_1 = train.loc[train['unit_id'] == 1].sort_values('cycle')

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

# Pair each panel with a sensor; panels without a sensor are left empty.
for panel, sensor in zip(axes, sensors_to_plot):
    panel.plot(engine_1['cycle'], engine_1[sensor], linewidth=2)
    panel.set_title(sensor, fontsize=11, fontweight='bold')
    panel.set_xlabel('Cycle', fontsize=10)
    panel.set_ylabel('Value', fontsize=10)
    panel.grid(True, alpha=0.3)

fig.suptitle('Sensor Trends for Engine 1', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# --------------------------------------------------------------------------
# Correlation of each useful sensor with RUL, sorted from highest to lowest
# --------------------------------------------------------------------------

correlation_matrix = train[[*useful_sensors, 'RUL']].corr()

# Keep only the RUL column, order it, and remove RUL's entry for itself.
rul_column = correlation_matrix['RUL']
correlations = rul_column.sort_values(ascending=False).drop('RUL')

print("Sensor correlation with RUL (sorted):")
print(correlations)

strongest_positive = correlations.idxmax()
strongest_negative = correlations.idxmin()
print(f"\nMost positively correlated: {strongest_positive} ({correlations.max():.3f})")
print(f"Most negatively correlated: {strongest_negative} ({correlations.min():.3f})")

In [None]:
# --------------------------------------------------------------------------
# Horizontal bar chart of sensor-to-RUL correlations
# --------------------------------------------------------------------------

fig, ax = plt.subplots(figsize=(10, 8))

correlations.plot.barh(ax=ax, color='teal', alpha=0.7)

# Thin vertical line at zero separating negative from positive correlations.
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)

ax.set_title('Sensor Correlation with RUL', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation with RUL', fontsize=12)
ax.set_ylabel('Sensor', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()