# Capstone 20.1 – Initial Report & Exploratory Data Analysis (EDA)
**Project:** Fatigue Life Prediction of LPBF AlSi10Mg  
**Program:** Berkeley Professional Certificate in Machine Learning & AI  
**Author:** Erfan Maleki  

This notebook performs data cleaning, feature engineering, exploratory analysis, and a baseline regression model for predicting fatigue life (cycles to failure).  
Dataset: `Capstone data- Fatigue of LPBF AlSi10Mg.xlsx`


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor

# Notebook-wide defaults: show up to 200 DataFrame columns and draw 7x4 figures.
pd.options.display.max_columns = 200
plt.rcParams.update({"figure.figsize": (7, 4)})


In [None]:
# Load dataset
# The workbook is read with pandas' default options; the path is relative to
# the directory the notebook is run from.
data_path = "Capstone data- Fatigue of LPBF AlSi10Mg.xlsx"
df = pd.read_excel(data_path)
print("Shape:", df.shape)  # (rows, columns) as loaded, before any cleaning
df.head()  # last expression of the cell, so the first rows are rendered


## 1. Data Overview

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts) for a first look.
df.info()


In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
df.describe(include="all").transpose()


## 2. Data Cleaning

In [None]:
# Count NaNs per column and show only the columns that contain at least one.
missing = df.isnull().sum()
missing.loc[missing > 0]


In [None]:
# Split the columns by dtype; later cells reuse both lists.
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()

# Median-fill numeric columns that contain NaNs; complete columns are skipped.
# NOTE(review): num_cols is every numeric column, which presumably includes the
# fatigue-life target itself - confirm that imputing the target is intended.
for c in num_cols:
    if not df[c].isna().any():
        continue
    df[c] = df[c].fillna(df[c].median())

# Remove exact duplicate rows, then report the resulting size.
df = df.drop_duplicates()
print("After cleaning shape:", df.shape)


## 3. Target Variable Distribution

In [None]:
target_col = "Fatigue life (Cycle)"
y = df[target_col]


def _show_histogram(values, title, xlabel):
    """Draw a 20-bin histogram of *values* with a grid and display it."""
    plt.hist(values, bins=20)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.grid(True)
    plt.show()


# Raw target, in cycles.
_show_histogram(y, "Fatigue Life Distribution (Cycles)", "Fatigue Life (cycles)")

# Log10 target for physics-consistent modeling; stored on df for later cells.
df["log_fatigue_life"] = np.log10(y)

_show_histogram(
    df["log_fatigue_life"],
    "Log10 Fatigue Life Distribution",
    "log10(Fatigue Life)",
)


## 4. Categorical Feature Summary

In [None]:
# Show the frequency table of every non-numeric column.
# An empty cat_cols list simply produces no output.
for c in cat_cols:
    print("\n", c)
    display(df[c].value_counts())


## 5. Correlation Analysis (Numeric Features)

In [None]:
# Pairwise correlations among the numeric columns plus the log10 target.
# The column list is built without duplicates: if num_cols is recomputed after
# "log_fatigue_life" has been added to df (cells re-run out of order), the name
# would otherwise appear twice and corr["log_fatigue_life"] would come back as
# a DataFrame instead of a Series.
corr_cols = [c for c in num_cols if c != "log_fatigue_life"] + ["log_fatigue_life"]
corr = df[corr_cols].corr()

# Pin the colour scale to the full correlation range [-1, 1] with a diverging
# colormap, so zero maps to the neutral colour and the sign is readable.
plt.imshow(corr, aspect="auto", vmin=-1, vmax=1, cmap="coolwarm")
plt.colorbar()
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Correlation of every column with log fatigue life, most positive first
# (the strongest negative relationships are at the bottom of the listing).
display(corr["log_fatigue_life"].sort_values(ascending=False))


## 6. Key Relationships

In [None]:
def _scatter_against_log_life(column, title, xlabel):
    """Scatter df[column] against log10 fatigue life and display the figure."""
    plt.scatter(df[column], df["log_fatigue_life"])
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("log10(Fatigue life)")
    plt.grid(True)
    plt.show()


# Stress amplitude vs fatigue life (only when the column is present)
if "Stress amplitude (Mpa)" in df.columns:
    _scatter_against_log_life(
        "Stress amplitude (Mpa)",
        "Stress Amplitude vs Log Fatigue Life",
        "Stress amplitude (MPa)",
    )

# One scatter per column whose name mentions "roughness" (case-insensitive)
rough_cols = [c for c in df.columns if "roughness" in c.lower()]
for c in rough_cols:
    _scatter_against_log_life(c, f"{c} vs Log Fatigue Life", c)


## 7. Outlier Detection (IQR)

In [None]:
# Count, per numeric column, the rows lying outside the 1.5*IQR fences.
outlier_summary = {}
for c in num_cols:
    q1 = df[c].quantile(0.25)
    q3 = df[c].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    # A row is an outlier when it is strictly below the lower fence or
    # strictly above the upper one.
    outliers = df[df[c].lt(low) | df[c].gt(high)]
    outlier_summary[c] = len(outliers)

# Ten columns with the most flagged rows.
pd.Series(outlier_summary).sort_values(ascending=False).head(10)


## 8. Feature Engineering

In [None]:
# Engineered feature: stress amplitude divided by ultimate strength.
# Added only when both source columns are present in df.
_ratio_inputs = {"Ultimate strength (MPa)", "Stress amplitude (Mpa)"}
if _ratio_inputs.issubset(df.columns):
    df["stress_ratio"] = df["Stress amplitude (Mpa)"].div(df["Ultimate strength (MPa)"])

df.head()


## 9. Baseline Regression Model (Linear Regression)

In [None]:
# Baseline: one-hot encode the categorical columns, standardise the numeric
# ones, fit ordinary least squares on log10(fatigue life), and score the model
# on a 20 % hold-out split.
X = df.drop(columns=[target_col, "log_fatigue_life"])
y_log = df["log_fatigue_life"]

# Recompute the column groups from X so that neither target column is listed.
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()
num_cols = X.select_dtypes(include=np.number).columns.tolist()

preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category first seen at predict time is
        # encoded as all zeros instead of raising an error.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

baseline = Pipeline([
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

baseline.fit(X_train, y_train)
pred = baseline.predict(X_test)

# RMSE is taken as the square root of the MSE instead of passing
# squared=False: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the explicit root works on both older and current releases.
rmse = mean_squared_error(y_test, pred) ** 0.5
mae  = mean_absolute_error(y_test, pred)
r2   = r2_score(y_test, pred)

# All three metrics are in log10(cycles) units, not raw cycles.
print("Baseline Linear Regression (log10 life)")
print("RMSE:", rmse)
print("MAE :", mae)
print("R2  :", r2)


## 10. Advanced Models (for comparison)

In [None]:
from sklearn.base import clone

# Candidate regressors compared on the same train/test split as the baseline.
models = {
    "Random Forest": RandomForestRegressor(n_estimators=500, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=600, learning_rate=0.02, max_depth=3, random_state=42),
    "HistGradientBoosting": HistGradientBoostingRegressor(max_depth=6, max_iter=800, learning_rate=0.03, random_state=42),
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "Lasso": Lasso(alpha=0.001, random_state=42)
}

results = []
for name, mdl in models.items():
    # Each pipeline gets its own unfitted copy of the preprocessing step, so
    # fitting here never re-fits the transformer held by another pipeline.
    # sparse_threshold=0 forces a dense matrix: the one-hot block can make the
    # combined output sparse, which HistGradientBoostingRegressor rejects.
    pipe = Pipeline([
        ("preprocess", clone(preprocess).set_params(sparse_threshold=0)),
        ("model", mdl)
    ])
    pipe.fit(X_train, y_train)
    p = pipe.predict(X_test)
    results.append({
        "Model": name,
        # sqrt(MSE) instead of squared=False, which was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        "RMSE": mean_squared_error(y_test, p) ** 0.5,
        "MAE": mean_absolute_error(y_test, p),
        "R2": r2_score(y_test, p)
    })

# Lowest hold-out RMSE (log10 units) first.
results_df = pd.DataFrame(results).sort_values("RMSE")
results_df


## 11. Conclusions (Module 20.1)

- The dataset shows strong nonlinear relationships between process/surface variables and fatigue life.  
- Baseline Linear Regression provides a reasonable starting point but does not capture all trends.  
- Boosting-based models (especially HistGradientBoosting) are promising for Module 24 refinement.  
- Next steps: hyperparameter tuning, cross-validation, SHAP interpretability, and final model selection.
