# NASA Turbofan Engine Data Exploration

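This notebook explores the FD001 subset of the NASA Turbofan Engine Degradation Simulation Dataset: it loads the raw training, test, and RUL truth files, derives a Remaining Useful Life (RUL) label for the training data, inspects engine lifecycles and sensor behaviour, and saves processed copies for the ML pipeline.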

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set style
# Global plotting configuration for every figure created below.
# The style sheet is applied first and the palette second.
# NOTE(review): presumably the order matters because the style sheet also
# defines a colour cycle that set_palette then overrides — confirm.
# NOTE(review): 'seaborn-v0_8' looks like the style name used by newer
# matplotlib releases; confirm it exists in the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load Dataset

In [None]:
# Define column names: engine id, cycle counter, 3 operational settings, 21 sensors
columns = ['engine_id', 'cycle'] + [f'setting_{i}' for i in range(1, 4)] + [f'sensor_{i}' for i in range(1, 22)]

# The raw files are whitespace-delimited. With sep=' ' every repeated or
# trailing space becomes an extra empty field, and when a row holds more
# fields than `names` pandas uses the leading columns as the index, silently
# shifting every column label. A whitespace-regex separator collapses runs of
# whitespace and ignores trailing ones, so the fields line up with `columns`.
whitespace_sep = r'\s+'

# Load training data
train_df = pd.read_csv('../data/raw/train_FD001.txt', sep=whitespace_sep, header=None, names=columns)

# Load test data
test_df = pd.read_csv('../data/raw/test_FD001.txt', sep=whitespace_sep, header=None, names=columns)

# Load RUL truth
rul_df = pd.read_csv('../data/raw/RUL_FD001.txt', sep=whitespace_sep, header=None, names=['RUL'])

# Fail fast if a file still had more fields than names: in that case pandas
# would have moved the leading columns into the index instead of a RangeIndex.
for frame_name, frame in (('train', train_df), ('test', test_df), ('RUL truth', rul_df)):
    assert isinstance(frame.index, pd.RangeIndex), (
        f"{frame_name} file has more fields per row than column names; columns are misaligned"
    )

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"RUL truth shape: {rul_df.shape}")

## Data Overview

In [None]:
# Display basic info
print("Training Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
train_df.info()
print("\nFirst 5 rows:")
print(train_df.head())

In [None]:
# Calculate RUL for training data
def calculate_rul(df):
    """Return a copy of ``df`` with an added ``RUL`` column.

    For each row, RUL is the largest ``cycle`` value observed for that row's
    ``engine_id`` minus the row's own ``cycle``. The input frame is left
    unmodified.
    """
    # Broadcast each engine's maximum cycle back onto every one of its rows.
    final_cycle = df.groupby('engine_id')['cycle'].transform('max')
    return df.assign(RUL=final_cycle - df['cycle'])

# Label every training row with its RUL, then preview the identifying columns.
train_df_rul = calculate_rul(train_df)
preview_cols = ['engine_id', 'cycle', 'RUL']
print("Training data with RUL:")
print(train_df_rul.loc[:, preview_cols].head(10))

## Engine Lifecycle Analysis

In [None]:
# Engine lifecycle statistics
# An engine's lifecycle length is the highest cycle recorded for it; the
# per-engine series feeds both the summary table and the histogram.
engine_lifetimes = train_df.groupby('engine_id')['cycle'].max()
lifecycle_stats = engine_lifetimes.describe()
print("Engine Lifecycle Statistics (cycles):")
print(lifecycle_stats)

# Plot lifecycle distribution
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(engine_lifetimes, bins=20, alpha=0.7, edgecolor='black')
ax.set_xlabel('Engine Lifecycle (cycles)')
ax.set_ylabel('Number of Engines')
ax.set_title('Distribution of Engine Lifecycles')
ax.grid(True, alpha=0.3)
plt.show()

## Sensor Data Analysis

In [None]:
# Select key sensors for analysis
key_sensors = ['sensor_2', 'sensor_3', 'sensor_4', 'sensor_7', 'sensor_11', 'sensor_12']

# Plot sensor degradation for a sample engine (engine_id 1)
sample_engine = train_df_rul.loc[train_df_rul['engine_id'] == 1]

# One panel per sensor on a 2x3 grid, flattened so it can be paired with the list.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, sensor in zip(axes, key_sensors):
    ax.plot(sample_engine['cycle'], sample_engine[sensor], 'b-', alpha=0.7)
    ax.set(xlabel='Cycle', ylabel=sensor, title=f'{sensor} Degradation Pattern')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Pairwise correlation between every column whose name starts with 'sensor_'.
sensor_cols = train_df.columns[train_df.columns.str.startswith('sensor_')].tolist()
correlation_matrix = train_df.loc[:, sensor_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
ax.set_title('Sensor Correlation Matrix')
fig.tight_layout()
plt.show()

## RUL Distribution Analysis

In [None]:
# RUL distribution in training data (left) and in the test truth file (right)
fig, (ax_train, ax_truth) = plt.subplots(1, 2, figsize=(12, 5))

ax_train.hist(train_df_rul['RUL'], bins=50, alpha=0.7, edgecolor='black')
ax_train.set_title('RUL Distribution in Training Data')

ax_truth.hist(rul_df['RUL'], bins=20, alpha=0.7, edgecolor='black', color='orange')
ax_truth.set_title('RUL Distribution in Test Truth Data')

# Both panels share the same axis labels and grid styling.
for ax in (ax_train, ax_truth):
    ax.set_xlabel('Remaining Useful Life (RUL)')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print(f"Training RUL stats: {train_df_rul['RUL'].describe()}")
print(f"Test RUL stats: {rul_df['RUL'].describe()}")

## Data Quality Assessment

In [None]:
# Check for missing values: total count of null cells in each frame
null_checks = (
    ("Missing values in training data:", train_df),
    ("\nMissing values in test data:", test_df),
)
for heading, frame in null_checks:
    print(heading)
    print(frame.isna().sum().sum())

# Check data types: how many columns share each dtype
print("\nData types:")
dtype_counts = train_df.dtypes.value_counts()
print(dtype_counts)

In [None]:
# Save processed data for ML pipeline
processed_dir = Path('../data/processed')
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
processed_dir.mkdir(parents=True, exist_ok=True)

train_df_rul.to_csv(processed_dir / 'train_with_rul.csv', index=False)
test_df.to_csv(processed_dir / 'test_data.csv', index=False)
rul_df.to_csv(processed_dir / 'test_rul_truth.csv', index=False)

print("Processed data saved to data/processed/")